Mon, 27 Jul 2009 03:26:17 +0100
minify: Add debug level, add warnings when specified level not valid, don't override options with defaults
1 | 1 | --[[-------------------------------------------------------------------- |
2 | ||
3 | optparser.lua: does parser-based optimizations | |
4 | This file is part of LuaSrcDiet. | |
5 | ||
6 | Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net> | |
7 | The COPYRIGHT file describes the conditions | |
8 | under which this software may be distributed. | |
9 | ||
10 | See the ChangeLog for more information. | |
11 | ||
12 | ----------------------------------------------------------------------]] | |
13 | ||
14 | --[[-------------------------------------------------------------------- | |
15 | -- NOTES: | |
16 | -- * For more parser-based optimization ideas, see the TODO items or | |
17 | -- look at technotes.txt. | |
18 | -- * The processing load is quite significant, but since this is an | |
19 | -- off-line text processor, I believe we can wait a few seconds. | |
20 | -- * TODO: might process "local a,a,a" wrongly... need tests! | |
21 | -- * TODO: remove position handling if overlapped locals (rem < 0) | |
22 | -- needs more study, to check behaviour | |
23 | -- * TODO: there are probably better ways to do allocation, e.g. by | |
24 | -- choosing better methods to sort and pick locals... | |
25 | -- * TODO: we don't need 53*63 two-letter identifiers; we can make | |
26 | -- do with significantly less depending on how many that are really | |
27 | -- needed and improve entropy; e.g. 13 needed -> choose 4*4 instead | |
28 | ----------------------------------------------------------------------]] | |
29 | ||
30 | local base = _G | |
31 | local string = require "string" | |
32 | local table = require "table" | |
33 | module "optparser" | |
34 | ||
35 | ---------------------------------------------------------------------- | |
36 | -- Letter frequencies for reducing symbol entropy (fixed version) | |
37 | -- * Might help a wee bit when the output file is compressed | |
38 | -- * See Wikipedia: http://en.wikipedia.org/wiki/Letter_frequencies | |
39 | -- * We use letter frequencies according to a Linotype keyboard, plus | |
40 | -- the underscore, and both lower case and upper case letters. | |
41 | -- * The arrangement below (LC, underscore, %d, UC) is arbitrary. | |
42 | -- * This is certainly not optimal, but is quick-and-dirty and the | |
43 | -- process has no significant overhead | |
44 | ---------------------------------------------------------------------- | |
45 | ||
46 | local LETTERS = "etaoinshrdlucmfwypvbgkqjxz_ETAOINSHRDLUCMFWYPVBGKQJXZ" | |
47 | local ALPHANUM = "etaoinshrdlucmfwypvbgkqjxz_0123456789ETAOINSHRDLUCMFWYPVBGKQJXZ" | |
48 | ||
49 | -- names or identifiers that must be skipped | |
50 | -- * the first two lines are for keywords | |
51 | local SKIP_NAME = {} | |
52 | for v in string.gmatch([[ | |
53 | and break do else elseif end false for function if in | |
54 | local nil not or repeat return then true until while | |
55 | self]], "%S+") do | |
56 | SKIP_NAME[v] = true | |
57 | end | |
58 | ||
59 | ------------------------------------------------------------------------ | |
60 | -- variables and data structures | |
61 | ------------------------------------------------------------------------ | |
62 | ||
63 | local toklist, seminfolist, -- token lists | |
64 | globalinfo, localinfo, -- variable information tables | |
65 | globaluniq, localuniq, -- unique name tables | |
66 | var_new, -- index of new variable names | |
67 | varlist -- list of output variables | |
68 | ||
69 | ---------------------------------------------------------------------- | |
70 | -- preprocess information table to get lists of unique names | |
71 | ---------------------------------------------------------------------- | |
72 | ||
73 | local function preprocess(infotable) | |
74 | local uniqtable = {} | |
75 | for i = 1, #infotable do -- enumerate info table | |
76 | local obj = infotable[i] | |
77 | local name = obj.name | |
78 | -------------------------------------------------------------------- | |
79 | if not uniqtable[name] then -- not found, start an entry | |
80 | uniqtable[name] = { | |
81 | decl = 0, token = 0, size = 0, | |
82 | } | |
83 | end | |
84 | -------------------------------------------------------------------- | |
85 | local uniq = uniqtable[name] -- count declarations, tokens, size | |
86 | uniq.decl = uniq.decl + 1 | |
87 | local xref = obj.xref | |
88 | local xcount = #xref | |
89 | uniq.token = uniq.token + xcount | |
90 | uniq.size = uniq.size + xcount * #name | |
91 | -------------------------------------------------------------------- | |
92 | if obj.decl then -- if local table, create first,last pairs | |
93 | obj.id = i | |
94 | obj.xcount = xcount | |
95 | if xcount > 1 then -- if ==1, means local never accessed | |
96 | obj.first = xref[2] | |
97 | obj.last = xref[xcount] | |
98 | end | |
99 | -------------------------------------------------------------------- | |
100 | else -- if global table, add a back ref | |
101 | uniq.id = i | |
102 | end | |
103 | -------------------------------------------------------------------- | |
104 | end--for | |
105 | return uniqtable | |
106 | end | |
107 | ||
108 | ---------------------------------------------------------------------- | |
109 | -- calculate actual symbol frequencies, in order to reduce entropy | |
110 | -- * this may help further reduce the size of compressed sources | |
111 | -- * note that since parsing optimizations is put before lexing | |
112 | -- optimizations, the frequency table is not exact! | |
113 | -- * yes, this will miss --keep block comments too... | |
114 | ---------------------------------------------------------------------- | |
115 | ||
116 | local function recalc_for_entropy(option) | |
117 | local byte = string.byte | |
118 | local char = string.char | |
119 | -- table of token classes to accept in calculating symbol frequency | |
120 | local ACCEPT = { | |
121 | TK_KEYWORD = true, TK_NAME = true, TK_NUMBER = true, | |
122 | TK_STRING = true, TK_LSTRING = true, | |
123 | } | |
124 | if not option["opt-comments"] then | |
125 | ACCEPT.TK_COMMENT = true | |
126 | ACCEPT.TK_LCOMMENT = true | |
127 | end | |
128 | -------------------------------------------------------------------- | |
129 | -- create a new table and remove any original locals by filtering | |
130 | -------------------------------------------------------------------- | |
131 | local filtered = {} | |
132 | for i = 1, #toklist do | |
133 | filtered[i] = seminfolist[i] | |
134 | end | |
135 | for i = 1, #localinfo do -- enumerate local info table | |
136 | local obj = localinfo[i] | |
137 | local xref = obj.xref | |
138 | for j = 1, obj.xcount do | |
139 | local p = xref[j] | |
140 | filtered[p] = "" -- remove locals | |
141 | end | |
142 | end | |
143 | -------------------------------------------------------------------- | |
144 | local freq = {} -- reset symbol frequency table | |
145 | for i = 0, 255 do freq[i] = 0 end | |
146 | for i = 1, #toklist do -- gather symbol frequency | |
147 | local tok, info = toklist[i], filtered[i] | |
148 | if ACCEPT[tok] then | |
149 | for j = 1, #info do | |
150 | local c = byte(info, j) | |
151 | freq[c] = freq[c] + 1 | |
152 | end | |
153 | end--if | |
154 | end--for | |
155 | -------------------------------------------------------------------- | |
156 | -- function to re-sort symbols according to actual frequencies | |
157 | -------------------------------------------------------------------- | |
158 | local function resort(symbols) | |
159 | local symlist = {} | |
160 | for i = 1, #symbols do -- prepare table to sort | |
161 | local c = byte(symbols, i) | |
162 | symlist[i] = { c = c, freq = freq[c], } | |
163 | end | |
164 | table.sort(symlist, -- sort selected symbols | |
165 | function(v1, v2) | |
166 | return v1.freq > v2.freq | |
167 | end | |
168 | ) | |
169 | local charlist = {} -- reconstitute the string | |
170 | for i = 1, #symlist do | |
171 | charlist[i] = char(symlist[i].c) | |
172 | end | |
173 | return table.concat(charlist) | |
174 | end | |
175 | -------------------------------------------------------------------- | |
176 | LETTERS = resort(LETTERS) -- change letter arrangement | |
177 | ALPHANUM = resort(ALPHANUM) | |
178 | end | |
179 | ||
180 | ---------------------------------------------------------------------- | |
181 | -- returns a string containing a new local variable name to use, and | |
182 | -- a flag indicating whether it collides with a global variable | |
183 | -- * trapping keywords and other names like 'self' is done elsewhere | |
184 | ---------------------------------------------------------------------- | |
185 | ||
186 | local function new_var_name() | |
187 | local var | |
188 | local cletters, calphanum = #LETTERS, #ALPHANUM | |
189 | local v = var_new | |
190 | if v < cletters then -- single char | |
191 | v = v + 1 | |
192 | var = string.sub(LETTERS, v, v) | |
193 | else -- longer names | |
194 | local range, sz = cletters, 1 -- calculate # chars fit | |
195 | repeat | |
196 | v = v - range | |
197 | range = range * calphanum | |
198 | sz = sz + 1 | |
199 | until range > v | |
200 | local n = v % cletters -- left side cycles faster | |
201 | v = (v - n) / cletters -- do first char first | |
202 | n = n + 1 | |
203 | var = string.sub(LETTERS, n, n) | |
204 | while sz > 1 do | |
205 | local m = v % calphanum | |
206 | v = (v - m) / calphanum | |
207 | m = m + 1 | |
208 | var = var..string.sub(ALPHANUM, m, m) | |
209 | sz = sz - 1 | |
210 | end | |
211 | end | |
212 | var_new = var_new + 1 | |
213 | return var, globaluniq[var] ~= nil | |
214 | end | |
215 | ||
216 | ---------------------------------------------------------------------- | |
217 | -- main entry point | |
218 | -- * does only local variable optimization for now | |
219 | ---------------------------------------------------------------------- | |
220 | ||
221 | function optimize(option, _toklist, _seminfolist, _globalinfo, _localinfo) | |
222 | -- set tables | |
223 | toklist, seminfolist, globalinfo, localinfo | |
224 | = _toklist, _seminfolist, _globalinfo, _localinfo | |
225 | var_new = 0 -- reset variable name allocator | |
226 | varlist = {} | |
227 | ------------------------------------------------------------------ | |
228 | -- preprocess global/local tables, handle entropy reduction | |
229 | ------------------------------------------------------------------ | |
230 | globaluniq = preprocess(globalinfo) | |
231 | localuniq = preprocess(localinfo) | |
232 | if option["opt-entropy"] then -- for entropy improvement | |
233 | recalc_for_entropy(option) | |
234 | end | |
235 | ------------------------------------------------------------------ | |
236 | -- build initial declared object table, then sort according to | |
237 | -- token count, this might help assign more tokens to more common | |
238 | -- variable names such as 'e' thus possibly reducing entropy | |
239 | -- * an object knows its localinfo index via its 'id' field | |
240 | -- * special handling for "self" special local (parameter) here | |
241 | ------------------------------------------------------------------ | |
242 | local object = {} | |
243 | for i = 1, #localinfo do | |
244 | object[i] = localinfo[i] | |
245 | end | |
246 | table.sort(object, -- sort largest first | |
247 | function(v1, v2) | |
248 | return v1.xcount > v2.xcount | |
249 | end | |
250 | ) | |
251 | ------------------------------------------------------------------ | |
252 | -- the special "self" function parameters must be preserved | |
253 | -- * the allocator below will never use "self", so it is safe to | |
254 | -- keep those implicit declarations as-is | |
255 | ------------------------------------------------------------------ | |
256 | local temp, j, gotself = {}, 1, false | |
257 | for i = 1, #object do | |
258 | local obj = object[i] | |
259 | if not obj.isself then | |
260 | temp[j] = obj | |
261 | j = j + 1 | |
262 | else | |
263 | gotself = true | |
264 | end | |
265 | end | |
266 | object = temp | |
267 | ------------------------------------------------------------------ | |
268 | -- a simple first-come first-served heuristic name allocator, | |
269 | -- note that this is in no way optimal... | |
270 | -- * each object is a local variable declaration plus existence | |
271 | -- * the aim is to assign short names to as many tokens as possible, | |
272 | -- so the following tries to maximize name reuse | |
273 | -- * note that we preserve sort order | |
274 | ------------------------------------------------------------------ | |
275 | local nobject = #object | |
276 | while nobject > 0 do | |
277 | local varname, gcollide | |
278 | repeat | |
279 | varname, gcollide = new_var_name() -- collect a variable name | |
280 | until not SKIP_NAME[varname] -- skip all special names | |
281 | varlist[#varlist + 1] = varname -- keep a list | |
282 | local oleft = nobject | |
283 | ------------------------------------------------------------------ | |
284 | -- if variable name collides with an existing global, the name | |
285 | -- cannot be used by a local when the name is accessed as a global | |
286 | -- during which the local is alive (between 'act' to 'rem'), so | |
287 | -- we drop objects that collides with the corresponding global | |
288 | ------------------------------------------------------------------ | |
289 | if gcollide then | |
290 | -- find the xref table of the global | |
291 | local gref = globalinfo[globaluniq[varname].id].xref | |
292 | local ngref = #gref | |
293 | -- enumerate for all current objects; all are valid at this point | |
294 | for i = 1, nobject do | |
295 | local obj = object[i] | |
296 | local act, rem = obj.act, obj.rem -- 'live' range of local | |
297 | -- if rem < 0, it is a -id to a local that had the same name | |
298 | -- so follow rem to extend it; does this make sense? | |
299 | while rem < 0 do | |
300 | rem = localinfo[-rem].rem | |
301 | end | |
302 | local drop | |
303 | for j = 1, ngref do | |
304 | local p = gref[j] | |
305 | if p >= act and p <= rem then drop = true end -- in range? | |
306 | end | |
307 | if drop then | |
308 | obj.skip = true | |
309 | oleft = oleft - 1 | |
310 | end | |
311 | end--for | |
312 | end--if gcollide | |
313 | ------------------------------------------------------------------ | |
314 | -- now the first unassigned local (since it's sorted) will be the | |
315 | -- one with the most tokens to rename, so we set this one and then | |
316 | -- eliminate all others that collides, then any locals that left | |
317 | -- can then reuse the same variable name; this is repeated until | |
318 | -- all local declaration that can use this name is assigned | |
319 | -- * the criteria for local-local reuse/collision is: | |
320 | -- A is the local with a name already assigned | |
321 | -- B is the unassigned local under consideration | |
322 | -- => anytime A is accessed, it cannot be when B is 'live' | |
323 | -- => to speed up things, we have first/last accesses noted | |
324 | ------------------------------------------------------------------ | |
325 | while oleft > 0 do | |
326 | local i = 1 | |
327 | while object[i].skip do -- scan for first object | |
328 | i = i + 1 | |
329 | end | |
330 | ------------------------------------------------------------------ | |
331 | -- first object is free for assignment of the variable name | |
332 | -- [first,last] gives the access range for collision checking | |
333 | ------------------------------------------------------------------ | |
334 | oleft = oleft - 1 | |
335 | local obja = object[i] | |
336 | i = i + 1 | |
337 | obja.newname = varname | |
338 | obja.skip = true | |
339 | obja.done = true | |
340 | local first, last = obja.first, obja.last | |
341 | local xref = obja.xref | |
342 | ------------------------------------------------------------------ | |
343 | -- then, scan all the rest and drop those colliding | |
344 | -- if A was never accessed then it'll never collide with anything | |
345 | -- otherwise trivial skip if: | |
346 | -- * B was activated after A's last access (last < act) | |
347 | -- * B was removed before A's first access (first > rem) | |
348 | -- if not, see detailed skip below... | |
349 | ------------------------------------------------------------------ | |
350 | if first and oleft > 0 then -- must have at least 1 access | |
351 | local scanleft = oleft | |
352 | while scanleft > 0 do | |
353 | while object[i].skip do -- next valid object | |
354 | i = i + 1 | |
355 | end | |
356 | scanleft = scanleft - 1 | |
357 | local objb = object[i] | |
358 | i = i + 1 | |
359 | local act, rem = objb.act, objb.rem -- live range of B | |
360 | -- if rem < 0, extend range of rem thru' following local | |
361 | while rem < 0 do | |
362 | rem = localinfo[-rem].rem | |
363 | end | |
364 | -------------------------------------------------------- | |
365 | if not(last < act or first > rem) then -- possible collision | |
366 | -------------------------------------------------------- | |
367 | -- B is activated later than A or at the same statement, | |
368 | -- this means for no collision, A cannot be accessed when B | |
369 | -- is alive, since B overrides A (or is a peer) | |
370 | -------------------------------------------------------- | |
371 | if act >= obja.act then | |
372 | for j = 1, obja.xcount do -- ... then check every access | |
373 | local p = xref[j] | |
374 | if p >= act and p <= rem then -- A accessed when B live! | |
375 | oleft = oleft - 1 | |
376 | objb.skip = true | |
377 | break | |
378 | end | |
379 | end--for | |
380 | -------------------------------------------------------- | |
381 | -- A is activated later than B, this means for no collision, | |
382 | -- A's access is okay since it overrides B, but B's last | |
383 | -- access need to be earlier than A's activation time | |
384 | -------------------------------------------------------- | |
385 | else | |
386 | if objb.last and objb.last >= obja.act then | |
387 | oleft = oleft - 1 | |
388 | objb.skip = true | |
389 | end | |
390 | end | |
391 | end | |
392 | -------------------------------------------------------- | |
393 | if oleft == 0 then break end | |
394 | end | |
395 | end--if first | |
396 | ------------------------------------------------------------------ | |
397 | end--while | |
398 | ------------------------------------------------------------------ | |
399 | -- after assigning all possible locals to one variable name, the | |
400 | -- unassigned locals/objects have the skip field reset and the table | |
401 | -- is compacted, to hopefully reduce iteration time | |
402 | ------------------------------------------------------------------ | |
403 | local temp, j = {}, 1 | |
404 | for i = 1, nobject do | |
405 | local obj = object[i] | |
406 | if not obj.done then | |
407 | obj.skip = false | |
408 | temp[j] = obj | |
409 | j = j + 1 | |
410 | end | |
411 | end | |
412 | object = temp -- new compacted object table | |
413 | nobject = #object -- objects left to process | |
414 | ------------------------------------------------------------------ | |
415 | end--while | |
416 | ------------------------------------------------------------------ | |
417 | -- after assigning all locals with new variable names, we can | |
418 | -- patch in the new names, and reprocess to get 'after' stats | |
419 | ------------------------------------------------------------------ | |
420 | for i = 1, #localinfo do -- enumerate all locals | |
421 | local obj = localinfo[i] | |
422 | local xref = obj.xref | |
423 | if obj.newname then -- if got new name, patch it in | |
424 | for j = 1, obj.xcount do | |
425 | local p = xref[j] -- xrefs indexes the token list | |
426 | seminfolist[p] = obj.newname | |
427 | end | |
428 | obj.name, obj.oldname -- adjust names | |
429 | = obj.newname, obj.name | |
430 | else | |
431 | obj.oldname = obj.name -- for cases like 'self' | |
432 | end | |
433 | end | |
434 | ------------------------------------------------------------------ | |
435 | -- deal with statistics output | |
436 | ------------------------------------------------------------------ | |
437 | if gotself then -- add 'self' to end of list | |
438 | varlist[#varlist + 1] = "self" | |
439 | end | |
440 | local afteruniq = preprocess(localinfo) | |
441 | ------------------------------------------------------------------ | |
442 | end |