Mon, 27 Jul 2009 03:26:17 +0100
minify: Add debug level, warn when the specified level is not valid, don't override options with defaults
--[[--------------------------------------------------------------------

  optlex.lua: does lexer-based optimizations
  This file is part of LuaSrcDiet.

  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.

  See the ChangeLog for more information.

----------------------------------------------------------------------]]

--[[--------------------------------------------------------------------
-- NOTES:
-- * For more lexer-based optimization ideas, see the TODO items or
--   look at technotes.txt.
-- * TODO: general string delimiter conversion optimizer
-- * TODO: (numbers) warn if overly significant digit
----------------------------------------------------------------------]]

local base = _G
local string = require "string"
module "optlex"
local match = string.match
local sub = string.sub
local find = string.find
local rep = string.rep
local print

------------------------------------------------------------------------
-- variables and data structures
------------------------------------------------------------------------

-- error function, can be overridden by setting a function into the module
error = base.error

warn = {}  -- table for warning flags

local stoks, sinfos, stoklns  -- source lists

local is_realtoken = {  -- significant (grammar) tokens
  TK_KEYWORD = true,
  TK_NAME = true,
  TK_NUMBER = true,
  TK_STRING = true,
  TK_LSTRING = true,
  TK_OP = true,
  TK_EOS = true,
}
local is_faketoken = {  -- whitespace (non-grammar) tokens
  TK_COMMENT = true,
  TK_LCOMMENT = true,
  TK_EOL = true,
  TK_SPACE = true,
}

local opt_details  -- for extra information

------------------------------------------------------------------------
-- true if current token is at the start of a line
-- * skips over deleted tokens via recursion
------------------------------------------------------------------------

local function atlinestart(i)
  local tok = stoks[i - 1]
  if i <= 1 or tok == "TK_EOL" then
    return true
  elseif tok == "" then
    return atlinestart(i - 1)
  end
  return false
end

------------------------------------------------------------------------
-- true if current token is at the end of a line
-- * skips over deleted tokens via recursion
------------------------------------------------------------------------

local function atlineend(i)
  local tok = stoks[i + 1]
  if i >= #stoks or tok == "TK_EOL" or tok == "TK_EOS" then
    return true
  elseif tok == "" then
    return atlineend(i + 1)
  end
  return false
end

------------------------------------------------------------------------
-- counts comment EOLs inside a long comment
-- * in order to keep line numbering, EOLs need to be reinserted
------------------------------------------------------------------------

local function commenteols(lcomment)
  local sep = #match(lcomment, "^%-%-%[=*%[")
  local z = sub(lcomment, sep + 1, -(sep - 1))  -- remove delims
  local i, c = 1, 0
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    if not p then break end  -- if no matches, done
    i = p + 1
    c = c + 1
    if #s > 0 and r ~= s then  -- skip CRLF or LFCR
      i = i + 1
    end
  end
  return c
end
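
------------------------------------------------------------------------
-- e.g. (sketch, traced by hand) a CRLF or LFCR pair counts as a single
-- EOL, so mixed line endings do not inflate the count:
--
--   commenteols("--[[line1\r\nline2\nline3]]")  --> 2
------------------------------------------------------------------------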
110 | ||
111 | ------------------------------------------------------------------------ | |
112 | -- compares two tokens (i, j) and returns the whitespace required | |
113 | -- * important! see technotes.txt for more information | |
114 | -- * only two grammar/real tokens are being considered | |
115 | -- * if "", no separation is needed | |
116 | -- * if " ", then at least one whitespace (or EOL) is required | |
117 | ------------------------------------------------------------------------ | |
118 | ||
119 | local function checkpair(i, j) | |
120 | local match = match | |
121 | local t1, t2 = stoks[i], stoks[j] | |
122 | -------------------------------------------------------------------- | |
123 | if t1 == "TK_STRING" or t1 == "TK_LSTRING" or | |
124 | t2 == "TK_STRING" or t2 == "TK_LSTRING" then | |
125 | return "" | |
126 | -------------------------------------------------------------------- | |
127 | elseif t1 == "TK_OP" or t2 == "TK_OP" then | |
128 | if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or | |
129 | (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then | |
130 | return "" | |
131 | end | |
132 | if t1 == "TK_OP" and t2 == "TK_OP" then | |
133 | -- for TK_OP/TK_OP pairs, see notes in technotes.txt | |
134 | local op, op2 = sinfos[i], sinfos[j] | |
135 | if (match(op, "^%.%.?$") and match(op2, "^%.")) or | |
136 | (match(op, "^[~=<>]$") and op2 == "=") or | |
137 | (op == "[" and (op2 == "[" or op2 == "=")) then | |
138 | return " " | |
139 | end | |
140 | return "" | |
141 | end | |
142 | -- "TK_OP" + "TK_NUMBER" case | |
143 | local op = sinfos[i] | |
144 | if t2 == "TK_OP" then op = sinfos[j] end | |
145 | if match(op, "^%.%.?%.?$") then | |
146 | return " " | |
147 | end | |
148 | return "" | |
149 | -------------------------------------------------------------------- | |
150 | else-- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER" then | |
151 | return " " | |
152 | -------------------------------------------------------------------- | |
153 | end | |
154 | end | |
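
------------------------------------------------------------------------
-- e.g. (sketch, traced by hand) some pairs and the spacing decided;
-- shown here by token text rather than by stoks/sinfos index:
--
--   '5' (number) then '..'  --> " "  ('5..' would lex as a malformed
--                                     number)
--   '..' then '.5' (number) --> " "  ('...5' would lex as '...' '5')
--   '<' then '='            --> " "  (would otherwise fuse into '<=')
--   ')' then 'end'          --> ""   (op and keyword can abut)
--   'local' then 'x'        --> " "  (keyword and name cannot abut)
------------------------------------------------------------------------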
155 | ||
156 | ------------------------------------------------------------------------ | |
157 | -- repack tokens, removing deletions caused by optimization process | |
158 | ------------------------------------------------------------------------ | |
159 | ||
160 | local function repack_tokens() | |
161 | local dtoks, dinfos, dtoklns = {}, {}, {} | |
162 | local j = 1 | |
163 | for i = 1, #stoks do | |
164 | local tok = stoks[i] | |
165 | if tok ~= "" then | |
166 | dtoks[j], dinfos[j], dtoklns[j] = tok, sinfos[i], stoklns[i] | |
167 | j = j + 1 | |
168 | end | |
169 | end | |
170 | stoks, sinfos, stoklns = dtoks, dinfos, dtoklns | |
171 | end | |
172 | ||
173 | ------------------------------------------------------------------------ | |
174 | -- number optimization | |
175 | -- * optimization using string formatting functions is one way of doing | |
176 | -- this, but here, we consider all cases and handle them separately | |
177 | -- (possibly an idiotic approach...) | |
178 | -- * scientific notation being generated is not in canonical form, this | |
179 | -- may or may not be a bad thing, feedback welcome | |
180 | -- * note: intermediate portions need to fit into a normal number range | |
181 | -- * optimizations can be divided based on number patterns: | |
182 | -- * hexadecimal: | |
183 | -- (1) no need to remove leading zeros, just skip to (2) | |
184 | -- (2) convert to integer if size equal or smaller | |
185 | -- * change if equal size -> lose the 'x' to reduce entropy | |
186 | -- (3) number is then processed as an integer | |
187 | -- (4) note: does not make 0[xX] consistent | |
188 | -- * integer: | |
189 | -- (1) note: includes anything with trailing ".", ".0", ... | |
190 | -- (2) remove useless fractional part, if present, e.g. 123.000 | |
191 | -- (3) remove leading zeros, e.g. 000123 | |
192 | -- (4) switch to scientific if shorter, e.g. 123000 -> 123e3 | |
193 | -- * with fraction: | |
194 | -- (1) split into digits dot digits | |
195 | -- (2) if no integer portion, take as zero (can omit later) | |
196 | -- (3) handle degenerate .000 case, after which the fractional part | |
197 | -- must be non-zero (if zero, it's matched as an integer) | |
198 | -- (4) remove trailing zeros for fractional portion | |
199 | -- (5) p.q where p > 0 and q > 0 cannot be shortened any more | |
200 | -- (6) otherwise p == 0 and the form is .q, e.g. .000123 | |
201 | -- (7) if scientific shorter, convert, e.g. .000123 -> 123e-6 | |
202 | -- * scientific: | |
203 | -- (1) split into (digits dot digits) [eE] ([+-] digits) | |
204 | -- (2) if significand has ".", shift it out so it becomes an integer | |
205 | -- (3) if significand is zero, just use zero | |
206 | -- (4) remove leading zeros for significand | |
207 | -- (5) shift out trailing zeros for significand | |
208 | -- (6) examine exponent and determine which format is best: | |
209 | -- integer, with fraction, scientific | |
210 | ------------------------------------------------------------------------ | |
211 | ||
212 | local function do_number(i) | |
213 | local before = sinfos[i] -- 'before' | |
214 | local z = before -- working representation | |
215 | local y -- 'after', if better | |
216 | -------------------------------------------------------------------- | |
217 | if match(z, "^0[xX]") then -- hexadecimal number | |
218 | local v = base.tostring(base.tonumber(z)) | |
219 | if #v <= #z then | |
220 | z = v -- change to integer, AND continue | |
221 | else | |
222 | return -- no change; stick to hex | |
223 | end | |
224 | end | |
225 | -------------------------------------------------------------------- | |
226 | if match(z, "^%d+%.?0*$") then -- integer or has useless frac | |
227 | z = match(z, "^(%d+)%.?0*$") -- int portion only | |
228 | if z + 0 > 0 then | |
229 | z = match(z, "^0*([1-9]%d*)$") -- remove leading zeros | |
230 | local v = #match(z, "0*$") | |
231 | local nv = base.tostring(v) | |
232 | if v > #nv + 1 then -- scientific is shorter | |
233 | z = sub(z, 1, #z - v).."e"..nv | |
234 | end | |
235 | y = z | |
236 | else | |
237 | y = "0" -- basic zero | |
238 | end | |
239 | -------------------------------------------------------------------- | |
240 | elseif not match(z, "[eE]") then -- number with fraction part | |
241 | local p, q = match(z, "^(%d*)%.(%d+)$") -- split | |
242 | if p == "" then p = 0 end -- int part zero | |
243 | if q + 0 == 0 and p == 0 then | |
244 | y = "0" -- degenerate .000 case | |
245 | else | |
246 | -- now, q > 0 holds and p is a number | |
247 | local v = #match(q, "0*$") -- remove trailing zeros | |
248 | if v > 0 then | |
249 | q = sub(q, 1, #q - v) | |
250 | end | |
251 | -- if p > 0, nothing else we can do to simplify p.q case | |
252 | if p + 0 > 0 then | |
253 | y = p.."."..q | |
254 | else | |
255 | y = "."..q -- tentative, e.g. .000123 | |
256 | local v = #match(q, "^0*") -- # leading spaces | |
257 | local w = #q - v -- # significant digits | |
258 | local nv = base.tostring(#q) | |
259 | -- e.g. compare 123e-6 versus .000123 | |
260 | if w + 2 + #nv < 1 + #q then | |
261 | y = sub(q, -w).."e-"..nv | |
262 | end | |
263 | end | |
264 | end | |
265 | -------------------------------------------------------------------- | |
266 | else -- scientific number | |
267 | local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$") | |
268 | ex = base.tonumber(ex) | |
269 | -- if got ".", shift out fractional portion of significand | |
270 | local p, q = match(sig, "^(%d*)%.(%d*)$") | |
271 | if p then | |
272 | ex = ex - #q | |
273 | sig = p..q | |
274 | end | |
275 | if sig + 0 == 0 then | |
276 | y = "0" -- basic zero | |
277 | else | |
278 | local v = #match(sig, "^0*") -- remove leading zeros | |
279 | sig = sub(sig, v + 1) | |
280 | v = #match(sig, "0*$") -- shift out trailing zeros | |
281 | if v > 0 then | |
282 | sig = sub(sig, 1, #sig - v) | |
283 | ex = ex + v | |
284 | end | |
285 | -- examine exponent and determine which format is best | |
286 | local nex = base.tostring(ex) | |
287 | if ex == 0 then -- it's just an integer | |
288 | y = sig | |
289 | elseif ex > 0 and (ex <= 1 + #nex) then -- a number | |
290 | y = sig..rep("0", ex) | |
291 | elseif ex < 0 and (ex >= -#sig) then -- fraction, e.g. .123 | |
292 | v = #sig + ex | |
293 | y = sub(sig, 1, v).."."..sub(sig, v + 1) | |
294 | elseif ex < 0 and (#nex >= -ex - #sig) then | |
295 | -- e.g. compare 1234e-5 versus .01234 | |
296 | -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig | |
297 | -- -> #nex >= -ex - #sig | |
298 | v = -ex - #sig | |
299 | y = "."..rep("0", v)..sig | |
300 | else -- non-canonical scientific representation | |
301 | y = sig.."e"..ex | |
302 | end | |
303 | end--if sig | |
304 | end | |
305 | -------------------------------------------------------------------- | |
306 | if y and y ~= sinfos[i] then | |
307 | if opt_details then | |
308 | print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y) | |
309 | opt_details = opt_details + 1 | |
310 | end | |
311 | sinfos[i] = y | |
312 | end | |
313 | end | |
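
------------------------------------------------------------------------
-- e.g. (sketch) rewrites obtained by tracing do_number by hand, one
-- per branch above; left is sinfos[i] before, right is after:
--
--   0x10      -> 16        hex converted, decimal form is no larger
--   123.000   -> 123       useless fractional part dropped
--   00123     -> 123       leading zeros dropped
--   1230000   -> 123e4     scientific form is shorter
--   0.500     -> .5        zero int part and trailing zeros dropped
--   .000123   -> 123e-6    scientific form is shorter
--   1.5e3     -> 1500      integer form is no longer than 'e' form
--   1.5e10    -> 15e9      significand shifted to an integer
------------------------------------------------------------------------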
314 | ||
315 | ------------------------------------------------------------------------ | |
316 | -- string optimization | |
317 | -- * note: works on well-formed strings only! | |
318 | -- * optimizations on characters can be summarized as follows: | |
319 | -- \a\b\f\n\r\t\v -- no change | |
320 | -- \\ -- no change | |
321 | -- \"\' -- depends on delim, other can remove \ | |
322 | -- \[\] -- remove \ | |
323 | -- \<char> -- general escape, remove \ | |
324 | -- \<eol> -- normalize the EOL only | |
325 | -- \ddd -- if \a\b\f\n\r\t\v, change to latter | |
326 | -- if other < ascii 32, keep ddd but zap leading zeros | |
327 | -- if >= ascii 32, translate it into the literal, then also | |
328 | -- do escapes for \\,\",\' cases | |
329 | -- <other> -- no change | |
330 | -- * switch delimiters if string becomes shorter | |
331 | ------------------------------------------------------------------------ | |
332 | ||
333 | local function do_string(I) | |
334 | local info = sinfos[I] | |
335 | local delim = sub(info, 1, 1) -- delimiter used | |
336 | local ndelim = (delim == "'") and '"' or "'" -- opposite " <-> ' | |
337 | local z = sub(info, 2, -2) -- actual string | |
338 | local i = 1 | |
339 | local c_delim, c_ndelim = 0, 0 -- "/' counts | |
340 | -------------------------------------------------------------------- | |
341 | while i <= #z do | |
342 | local c = sub(z, i, i) | |
343 | ---------------------------------------------------------------- | |
344 | if c == "\\" then -- escaped stuff | |
345 | local j = i + 1 | |
346 | local d = sub(z, j, j) | |
347 | local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true) | |
348 | ------------------------------------------------------------ | |
349 | if not p then -- \<char> -- remove \ | |
350 | z = sub(z, 1, i - 1)..sub(z, j) | |
351 | i = i + 1 | |
352 | ------------------------------------------------------------ | |
353 | elseif p <= 8 then -- \a\b\f\n\r\t\v\\ | |
354 | i = i + 2 -- no change | |
355 | ------------------------------------------------------------ | |
356 | elseif p <= 10 then -- \<eol> -- normalize EOL | |
357 | local eol = sub(z, j, j + 1) | |
358 | if eol == "\r\n" or eol == "\n\r" then | |
359 | z = sub(z, 1, i).."\n"..sub(z, j + 2) | |
360 | elseif p == 10 then -- \r case | |
361 | z = sub(z, 1, i).."\n"..sub(z, j + 1) | |
362 | end | |
363 | i = i + 2 | |
364 | ------------------------------------------------------------ | |
365 | elseif p <= 12 then -- \"\' -- remove \ for ndelim | |
366 | if d == delim then | |
367 | c_delim = c_delim + 1 | |
368 | i = i + 2 | |
369 | else | |
370 | c_ndelim = c_ndelim + 1 | |
371 | z = sub(z, 1, i - 1)..sub(z, j) | |
372 | i = i + 1 | |
373 | end | |
374 | ------------------------------------------------------------ | |
375 | else -- \ddd -- various steps | |
376 | local s = match(z, "^(%d%d?%d?)", j) | |
377 | j = i + 1 + #s -- skip to location | |
378 | local cv = s + 0 | |
379 | local cc = string.char(cv) | |
380 | local p = find("\a\b\f\n\r\t\v", cc, 1, true) | |
381 | if p then -- special escapes | |
382 | s = "\\"..sub("abfnrtv", p, p) | |
383 | elseif cv < 32 then -- normalized \ddd | |
384 | s = "\\"..cv | |
385 | elseif cc == delim then -- \<delim> | |
386 | s = "\\"..cc | |
387 | c_delim = c_delim + 1 | |
388 | elseif cc == "\\" then -- \\ | |
389 | s = "\\\\" | |
390 | else -- literal character | |
391 | s = cc | |
392 | if cc == ndelim then | |
393 | c_ndelim = c_ndelim + 1 | |
394 | end | |
395 | end | |
396 | z = sub(z, 1, i - 1)..s..sub(z, j) | |
397 | i = i + #s | |
398 | ------------------------------------------------------------ | |
399 | end--if p | |
400 | ---------------------------------------------------------------- | |
401 | else-- c ~= "\\" -- <other> -- no change | |
402 | i = i + 1 | |
403 | if c == ndelim then -- count ndelim, for switching delimiters | |
404 | c_ndelim = c_ndelim + 1 | |
405 | end | |
406 | ---------------------------------------------------------------- | |
407 | end--if c | |
408 | end--while | |
409 | -------------------------------------------------------------------- | |
410 | -- switching delimiters, a long-winded derivation: | |
411 | -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes | |
412 | -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes | |
413 | -- simplifying the condition (1)>(2) --> c_delim > c_ndelim | |
414 | if c_delim > c_ndelim then | |
415 | i = 1 | |
416 | while i <= #z do | |
417 | local p, q, r = find(z, "([\'\"])", i) | |
418 | if not p then break end | |
419 | if r == delim then -- \<delim> -> <delim> | |
420 | z = sub(z, 1, p - 2)..sub(z, p) | |
421 | i = p | |
422 | else-- r == ndelim -- <ndelim> -> \<ndelim> | |
423 | z = sub(z, 1, p - 1).."\\"..sub(z, p) | |
424 | i = p + 2 | |
425 | end | |
426 | end--while | |
427 | delim = ndelim -- actually change delimiters | |
428 | end | |
429 | -------------------------------------------------------------------- | |
430 | z = delim..z..delim | |
431 | if z ~= sinfos[I] then | |
432 | if opt_details then | |
433 | print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z) | |
434 | opt_details = opt_details + 1 | |
435 | end | |
436 | sinfos[I] = z | |
437 | end | |
438 | end | |
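
------------------------------------------------------------------------
-- e.g. (sketch) rewrites obtained by tracing do_string by hand; each
-- line shows the whole token, including its delimiters:
--
--   "\065\066"   -> "AB"          \ddd >= ascii 32 becomes the literal
--   "\027"       -> "\27"         \ddd < ascii 32 keeps ddd, zaps zeros
--   "it\'s"      -> "it's"        \' needs no escape inside "..."
--   "say \"hi\"" -> 'say "hi"'    c_delim > c_ndelim, so the
--                                 delimiters are switched
------------------------------------------------------------------------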
439 | ||
440 | ------------------------------------------------------------------------ | |
441 | -- long string optimization | |
442 | -- * note: warning flagged if trailing whitespace found, not trimmed | |
443 | -- * remove first optional newline | |
444 | -- * normalize embedded newlines | |
445 | -- * reduce '=' separators in delimiters if possible | |
446 | ------------------------------------------------------------------------ | |
447 | ||
448 | local function do_lstring(I) | |
449 | local info = sinfos[I] | |
450 | local delim1 = match(info, "^%[=*%[") -- cut out delimiters | |
451 | local sep = #delim1 | |
452 | local delim2 = sub(info, -sep, -1) | |
453 | local z = sub(info, sep + 1, -(sep + 1)) -- lstring without delims | |
454 | local y = "" | |
455 | local i = 1 | |
456 | -------------------------------------------------------------------- | |
457 | while true do | |
458 | local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i) | |
459 | -- deal with a single line | |
460 | local ln | |
461 | if not p then | |
462 | ln = sub(z, i) | |
463 | elseif p >= i then | |
464 | ln = sub(z, i, p - 1) | |
465 | end | |
466 | if ln ~= "" then | |
467 | -- flag a warning if there are trailing spaces, won't optimize! | |
468 | if match(ln, "%s+$") then | |
469 | warn.lstring = "trailing whitespace in long string near line "..stoklns[I] | |
470 | end | |
471 | y = y..ln | |
472 | end | |
473 | if not p then -- done if no more EOLs | |
474 | break | |
475 | end | |
476 | -- deal with line endings, normalize them | |
477 | i = p + 1 | |
478 | if p then | |
479 | if #s > 0 and r ~= s then -- skip CRLF or LFCR | |
480 | i = i + 1 | |
481 | end | |
482 | -- skip first newline, which can be safely deleted | |
483 | if not(i == 1 and i == p) then | |
484 | y = y.."\n" | |
485 | end | |
486 | end | |
487 | end--while | |
488 | -------------------------------------------------------------------- | |
489 | -- handle possible deletion of one or more '=' separators | |
490 | if sep >= 3 then | |
491 | local chk, okay = sep - 1 | |
492 | -- loop to test ending delimiter with less of '=' down to zero | |
493 | while chk >= 2 do | |
494 | local delim = "%]"..rep("=", chk - 2).."%]" | |
495 | if not match(y, delim) then okay = chk end | |
496 | chk = chk - 1 | |
497 | end | |
498 | if okay then -- change delimiters | |
499 | sep = rep("=", okay - 2) | |
500 | delim1, delim2 = "["..sep.."[", "]"..sep.."]" | |
501 | end | |
502 | end | |
503 | -------------------------------------------------------------------- | |
504 | sinfos[I] = delim1..y..delim2 | |
505 | end | |
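
------------------------------------------------------------------------
-- e.g. (sketch, traced by hand) the optional first newline disappears
-- and the '=' separators drop to the lowest level whose closing
-- delimiter does not occur in the body:
--
--   [==[<newline>hello]==]  ->  [[hello]]
------------------------------------------------------------------------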
506 | ||
507 | ------------------------------------------------------------------------ | |
508 | -- long comment optimization | |
509 | -- * note: does not remove first optional newline | |
510 | -- * trim trailing whitespace | |
511 | -- * normalize embedded newlines | |
512 | -- * reduce '=' separators in delimiters if possible | |
513 | ------------------------------------------------------------------------ | |
514 | ||
515 | local function do_lcomment(I) | |
516 | local info = sinfos[I] | |
517 | local delim1 = match(info, "^%-%-%[=*%[") -- cut out delimiters | |
518 | local sep = #delim1 | |
519 | local delim2 = sub(info, -sep, -1) | |
520 | local z = sub(info, sep + 1, -(sep - 1)) -- comment without delims | |
521 | local y = "" | |
522 | local i = 1 | |
523 | -------------------------------------------------------------------- | |
524 | while true do | |
525 | local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i) | |
526 | -- deal with a single line, extract and check trailing whitespace | |
527 | local ln | |
528 | if not p then | |
529 | ln = sub(z, i) | |
530 | elseif p >= i then | |
531 | ln = sub(z, i, p - 1) | |
532 | end | |
533 | if ln ~= "" then | |
534 | -- trim trailing whitespace if non-empty line | |
535 | local ws = match(ln, "%s*$") | |
536 | if #ws > 0 then ln = sub(ln, 1, -(ws + 1)) end | |
537 | y = y..ln | |
538 | end | |
539 | if not p then -- done if no more EOLs | |
540 | break | |
541 | end | |
542 | -- deal with line endings, normalize them | |
543 | i = p + 1 | |
544 | if p then | |
545 | if #s > 0 and r ~= s then -- skip CRLF or LFCR | |
546 | i = i + 1 | |
547 | end | |
548 | y = y.."\n" | |
549 | end | |
550 | end--while | |
551 | -------------------------------------------------------------------- | |
552 | -- handle possible deletion of one or more '=' separators | |
553 | sep = sep - 2 | |
554 | if sep >= 3 then | |
555 | local chk, okay = sep - 1 | |
556 | -- loop to test ending delimiter with less of '=' down to zero | |
557 | while chk >= 2 do | |
558 | local delim = "%]"..rep("=", chk - 2).."%]" | |
559 | if not match(y, delim) then okay = chk end | |
560 | chk = chk - 1 | |
561 | end | |
562 | if okay then -- change delimiters | |
563 | sep = rep("=", okay - 2) | |
564 | delim1, delim2 = "--["..sep.."[", "]"..sep.."]" | |
565 | end | |
566 | end | |
567 | -------------------------------------------------------------------- | |
568 | sinfos[I] = delim1..y..delim2 | |
569 | end | |
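
------------------------------------------------------------------------
-- e.g. (sketch, traced by hand) trailing whitespace is trimmed, CRLF
-- is normalized to LF, and the '=' separators are reduced:
--
--   --[==[ note <space><CRLF>more ]==]  ->  --[[ note<LF>more]]
------------------------------------------------------------------------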
570 | ||
571 | ------------------------------------------------------------------------ | |
572 | -- short comment optimization | |
573 | -- * trim trailing whitespace | |
574 | ------------------------------------------------------------------------ | |
575 | ||
576 | local function do_comment(i) | |
577 | local info = sinfos[i] | |
578 | local ws = match(info, "%s*$") -- just look from end of string | |
579 | if #ws > 0 then | |
580 | info = sub(info, 1, -(ws + 1)) -- trim trailing whitespace | |
581 | end | |
582 | sinfos[i] = info | |
583 | end | |
584 | ||
585 | ------------------------------------------------------------------------ | |
586 | -- returns true if string found in long comment | |
587 | -- * this is a feature to keep copyright or license texts | |
588 | ------------------------------------------------------------------------ | |
589 | ||
590 | local function keep_lcomment(opt_keep, info) | |
591 | if not opt_keep then return false end -- option not set | |
592 | local delim1 = match(info, "^%-%-%[=*%[") -- cut out delimiters | |
593 | local sep = #delim1 | |
594 | local delim2 = sub(info, -sep, -1) | |
595 | local z = sub(info, sep + 1, -(sep - 1)) -- comment without delims | |
596 | if find(z, opt_keep, 1, true) then -- try to match | |
597 | return true | |
598 | end | |
599 | end | |
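
------------------------------------------------------------------------
-- e.g. (sketch) the match is a plain (non-pattern) substring find on
-- the comment body, so a --keep value of "Copyright" retains typical
-- license headers:
--
--   keep_lcomment("Copyright", "--[[ Copyright (c) 2008 ]]")  --> true
--   keep_lcomment("Copyright", "--[[ internal notes ]]")      --> nil
--   keep_lcomment(nil, "--[[ anything ]]")                    --> false
------------------------------------------------------------------------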
600 | ||
601 | ------------------------------------------------------------------------ | |
602 | -- main entry point | |
603 | -- * currently, lexer processing has 2 passes | |
604 | -- * processing is done on a line-oriented basis, which is easier to | |
605 | -- grok due to the next point... | |
606 | -- * since there are various options that can be enabled or disabled, | |
607 | -- processing is a little messy or convoluted | |
608 | ------------------------------------------------------------------------ | |
609 | ||
610 | function optimize(option, toklist, semlist, toklnlist) | |
611 | -------------------------------------------------------------------- | |
612 | -- set option flags | |
613 | -------------------------------------------------------------------- | |
614 | local opt_comments = option["opt-comments"] | |
615 | local opt_whitespace = option["opt-whitespace"] | |
616 | local opt_emptylines = option["opt-emptylines"] | |
617 | local opt_eols = option["opt-eols"] | |
618 | local opt_strings = option["opt-strings"] | |
619 | local opt_numbers = option["opt-numbers"] | |
620 | local opt_keep = option.KEEP | |
621 | opt_details = option.DETAILS and 0 -- upvalues for details display | |
622 | print = print or base.print | |
623 | if opt_eols then -- forced settings, otherwise won't work properly | |
624 | opt_comments = true | |
625 | opt_whitespace = true | |
626 | opt_emptylines = true | |
627 | end | |
628 | -------------------------------------------------------------------- | |
629 | -- variable initialization | |
630 | -------------------------------------------------------------------- | |
631 | stoks, sinfos, stoklns -- set source lists | |
632 | = toklist, semlist, toklnlist | |
633 | local i = 1 -- token position | |
634 | local tok, info -- current token | |
635 | local prev -- position of last grammar token | |
636 | -- on same line (for TK_SPACE stuff) | |
637 | -------------------------------------------------------------------- | |
638 | -- changes a token, info pair | |
639 | -------------------------------------------------------------------- | |
640 | local function settoken(tok, info, I) | |
641 | I = I or i | |
642 | stoks[I] = tok or "" | |
643 | sinfos[I] = info or "" | |
644 | end | |
645 | -------------------------------------------------------------------- | |
646 | -- processing loop (PASS 1) | |
647 | -------------------------------------------------------------------- | |
648 | while true do | |
649 | tok, info = stoks[i], sinfos[i] | |
650 | ---------------------------------------------------------------- | |
651 | local atstart = atlinestart(i) -- set line begin flag | |
652 | if atstart then prev = nil end | |
653 | ---------------------------------------------------------------- | |
654 | if tok == "TK_EOS" then -- end of stream/pass | |
655 | break | |
656 | ---------------------------------------------------------------- | |
657 | elseif tok == "TK_KEYWORD" or -- keywords, identifiers, | |
658 | tok == "TK_NAME" or -- operators | |
659 | tok == "TK_OP" then | |
660 | -- TK_KEYWORD and TK_OP can't be optimized without a big | |
661 | -- optimization framework; it would be more of an optimizing | |
662 | -- compiler, not a source code compressor | |
663 | -- TK_NAME that are locals needs parser to analyze/optimize | |
664 | prev = i | |
665 | ---------------------------------------------------------------- | |
666 | elseif tok == "TK_NUMBER" then -- numbers | |
667 | if opt_numbers then | |
668 | do_number(i) -- optimize | |
669 | end | |
670 | prev = i | |
671 | ---------------------------------------------------------------- | |
672 | elseif tok == "TK_STRING" or -- strings, long strings | |
673 | tok == "TK_LSTRING" then | |
674 | if opt_strings then | |
675 | if tok == "TK_STRING" then | |
676 | do_string(i) -- optimize | |
677 | else | |
678 | do_lstring(i) -- optimize | |
679 | end | |
680 | end | |
681 | prev = i | |
682 | ---------------------------------------------------------------- | |
683 | elseif tok == "TK_COMMENT" then -- short comments | |
684 | if opt_comments then | |
685 | if i == 1 and sub(info, 1, 1) == "#" then | |
686 | -- keep shbang comment, trim whitespace | |
687 | do_comment(i) | |
688 | else | |
689 | -- safe to delete, as a TK_EOL (or TK_EOS) always follows | |
690 | settoken() -- remove entirely | |
691 | end | |
692 | elseif opt_whitespace then -- trim whitespace only | |
693 | do_comment(i) | |
694 | end | |
695 | ---------------------------------------------------------------- | |
696 | elseif tok == "TK_LCOMMENT" then -- long comments | |
697 | if keep_lcomment(opt_keep, info) then | |
698 | ------------------------------------------------------------ | |
699 | -- if --keep, we keep a long comment if <msg> is found; | |
700 | -- this is a feature to keep copyright or license texts | |
701 | if opt_whitespace then -- trim whitespace only | |
702 | do_lcomment(i) | |
703 | end | |
704 | prev = i | |
705 | elseif opt_comments then | |
706 | local eols = commenteols(info) | |
707 | ------------------------------------------------------------ | |
708 | -- prepare opt_emptylines case first, if a disposable token | |
709 | -- follows, current one is safe to dump, else keep a space; | |
710 | -- it is implied that the operation is safe for '-', because | |
711 | -- current is a TK_LCOMMENT, and must be separate from a '-' | |
712 | if is_faketoken[stoks[i + 1]] then | |
713 | settoken() -- remove entirely | |
714 | tok = "" | |
715 | else | |
716 | settoken("TK_SPACE", " ") | |
717 | end | |
718 | ------------------------------------------------------------ | |
719 | -- if there are embedded EOLs to keep and opt_emptylines is | |
720 | -- disabled, then switch the token into one or more EOLs | |
721 | if not opt_emptylines and eols > 0 then | |
722 | settoken("TK_EOL", rep("\n", eols)) | |
723 | end | |
724 | ------------------------------------------------------------ | |
725 | -- if optimizing whitespaces, force reinterpretation of the | |
726 | -- token to give a chance for the space to be optimized away | |
727 | if opt_whitespace and tok ~= "" then | |
728 | i = i - 1 -- to reinterpret | |
729 | end | |
730 | ------------------------------------------------------------ | |
731 | else -- disabled case | |
732 | if opt_whitespace then -- trim whitespace only | |
733 | do_lcomment(i) | |
734 | end | |
735 | prev = i | |
736 | end | |
737 | ---------------------------------------------------------------- | |
738 | elseif tok == "TK_EOL" then -- line endings | |
739 | if atstart and opt_emptylines then | |
740 | settoken() -- remove entirely | |
741 | elseif info == "\r\n" or info == "\n\r" then | |
742 | -- normalize the rest of the EOLs for CRLF/LFCR only | |
743 | -- (note that TK_LCOMMENT can change into several EOLs) | |
744 | settoken("TK_EOL", "\n") | |
745 | end | |
746 | ---------------------------------------------------------------- | |
747 | elseif tok == "TK_SPACE" then -- whitespace | |
748 | if opt_whitespace then | |
749 | if atstart or atlineend(i) then | |
750 | -- delete leading and trailing whitespace | |
751 | settoken() -- remove entirely | |
752 | else | |
753 | ------------------------------------------------------------ | |
754 | -- at this point, since leading whitespace have been removed, | |
755 | -- there should be a either a real token or a TK_LCOMMENT | |
756 | -- prior to hitting this whitespace; the TK_LCOMMENT case | |
757 | -- only happens if opt_comments is disabled; so prev ~= nil | |
758 | local ptok = stoks[prev] | |
759 | if ptok == "TK_LCOMMENT" then | |
760 | -- previous TK_LCOMMENT can abut with anything | |
761 | settoken() -- remove entirely | |
762 | else | |
763 | -- prev must be a grammar token; consecutive TK_SPACE | |
764 | -- tokens is impossible when optimizing whitespace | |
765 | local ntok = stoks[i + 1] | |
766 | if is_faketoken[ntok] then | |
767 | -- handle special case where a '-' cannot abut with | |
768 | -- either a short comment or a long comment | |
769 | if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and | |
770 | ptok == "TK_OP" and sinfos[prev] == "-" then | |
771 | -- keep token | |
772 | else | |
773 | settoken() -- remove entirely | |
774 | end | |
775 | else--is_realtoken | |
776 | -- check a pair of grammar tokens, if can abut, then | |
777 | -- delete space token entirely, otherwise keep one space | |
778 | local s = checkpair(prev, i + 1) | |
779 | if s == "" then | |
780 | settoken() -- remove entirely | |
781 | else | |
782 | settoken("TK_SPACE", " ") | |
783 | end | |
784 | end | |
785 | end | |
786 | ------------------------------------------------------------ | |
787 | end | |
788 | end | |
789 | ---------------------------------------------------------------- | |
790 | else | |
791 | error("unidentified token encountered") | |
792 | end | |
793 | ---------------------------------------------------------------- | |
794 | i = i + 1 | |
795 | end--while | |
796 | repack_tokens() | |
797 | -------------------------------------------------------------------- | |
798 | -- processing loop (PASS 2) | |
799 | -------------------------------------------------------------------- | |
800 | if opt_eols then | |
801 | i = 1 | |
802 | -- aggressive EOL removal only works with most non-grammar tokens | |
803 | -- optimized away because it is a rather simple scheme -- basically | |
804 | -- it just checks 'real' token pairs around EOLs | |
805 | if stoks[1] == "TK_COMMENT" then | |
806 | -- first comment still existing must be shbang, skip whole line | |
807 | i = 3 | |
808 | end | |
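    ------------------------------------------------------------------
    -- e.g. (sketch, traced by hand) an EOL survives only if checkpair
    -- says the grammar tokens around it need separation:
    --
    --   a=(1)<EOL>print(a)    ->  a=(1)print(a)  (')' can abut 'print')
    --   local a<EOL>print(a)  ->  unchanged      ('a' and 'print' are
    --                                             names, so the EOL
    --                                             acts as the space)
    ------------------------------------------------------------------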
    while true do
      tok, info = stoks[i], sinfos[i]
      --------------------------------------------------------------
      if tok == "TK_EOS" then  -- end of stream/pass
        break
      --------------------------------------------------------------
      elseif tok == "TK_EOL" then  -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then  -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" then
            settoken()  -- remove entirely
          end
        end
      end--if tok
      --------------------------------------------------------------
      i = i + 1
    end--while
    repack_tokens()
  end
  --------------------------------------------------------------------
  if opt_details and opt_details > 0 then print() end  -- spacing
  return stoks, sinfos, stoklns
end
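
------------------------------------------------------------------------
-- e.g. (sketch) how a driver feeds this module; the llex calls below
-- follow the companion lexer's interface as used by LuaSrcDiet's main
-- program -- treat the exact llex names as assumptions if your copy
-- differs:
--
--   local llex = require "llex"
--   local optlex = require "optlex"
--   llex.init(source, "@input.lua")  -- assumed lexer entry point
--   llex.llex()
--   local toklist, seminfolist, toklnlist
--     = llex.tok, llex.seminfo, llex.tokln
--   local option = {  -- enable a typical set of optimizations
--     ["opt-comments"] = true,
--     ["opt-whitespace"] = true,
--     ["opt-emptylines"] = true,
--     ["opt-numbers"] = true,
--     ["opt-strings"] = true,
--   }
--   toklist, seminfolist, toklnlist
--     = optlex.optimize(option, toklist, seminfolist, toklnlist)
--   local out = {}
--   for k = 1, #toklist do out[k] = seminfolist[k] end
--   io.write(table.concat(out))  -- reconstituted source
------------------------------------------------------------------------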