Fri, 17 Mar 2023 11:02:12 +0000
A range of fixes for Lua 5.2 support
1 | 1 | --[[-------------------------------------------------------------------- |
2 | ||
3 | optlex.lua: does lexer-based optimizations | |
4 | This file is part of LuaSrcDiet. | |
5 | ||
6 | Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net> | |
7 | The COPYRIGHT file describes the conditions | |
8 | under which this software may be distributed. | |
9 | ||
10 | See the ChangeLog for more information. | |
11 | ||
12 | ----------------------------------------------------------------------]] | |
13 | ||
14 | --[[-------------------------------------------------------------------- | |
15 | -- NOTES: | |
16 | -- * For more lexer-based optimization ideas, see the TODO items or | |
17 | -- look at technotes.txt. | |
18 | -- * TODO: general string delimiter conversion optimizer | |
19 | -- * TODO: (numbers) warn if overly significant digit | |
20 | ----------------------------------------------------------------------]] | |
21 | ||
local base = _G
local string = require "string"
local match = string.match
local sub = string.sub
local find = string.find
local rep = string.rep
local print          -- bound in optimize(); overridable via base.print

------------------------------------------------------------------------
-- variables and data structures
------------------------------------------------------------------------

-- error function, can override by setting own function into module
-- NOTE(review): with module() gone (Lua 5.2 port) this assigns the
-- global 'error' to itself; kept for compatibility with overriding code
error = base.error

-- table for warning flags; written by do_lstring (warn.lstring) and
-- intended to be read by the caller after optimize() returns
-- NOTE(review): this is a global, a leftover of the module() era
warn = {}

local stoks, sinfos, stoklns -- source lists: token types, token text
                             -- (semantic info), and source line numbers

local is_realtoken = {       -- significant (grammar) tokens
  TK_KEYWORD = true,
  TK_NAME = true,
  TK_NUMBER = true,
  TK_STRING = true,
  TK_LSTRING = true,
  TK_OP = true,
  TK_EOS = true,
}
local is_faketoken = {       -- whitespace (non-grammar) tokens
  TK_COMMENT = true,
  TK_LCOMMENT = true,
  TK_EOL = true,
  TK_SPACE = true,
}

local opt_details            -- counter for extra information display,
                             -- nil unless option.DETAILS is set
59 | ------------------------------------------------------------------------ | |
60 | -- true if current token is at the start of a line | |
61 | -- * skips over deleted tokens via recursion | |
62 | ------------------------------------------------------------------------ | |
63 | ||
-- Reports whether the token at position i begins a line.
-- Deleted tokens (type "") are transparent and are walked over.
local function atlinestart(i)
  while i > 1 do
    local prevtok = stoks[i - 1]
    if prevtok == "TK_EOL" then
      return true
    elseif prevtok ~= "" then
      return false
    end
    i = i - 1          -- skip a deleted token and look further left
  end
  return true          -- start of stream counts as start of line
end
73 | ||
74 | ------------------------------------------------------------------------ | |
75 | -- true if current token is at the end of a line | |
76 | -- * skips over deleted tokens via recursion | |
77 | ------------------------------------------------------------------------ | |
78 | ||
-- Reports whether the token at position i ends a line.
-- Deleted tokens (type "") are transparent and are walked over.
local function atlineend(i)
  local last = #stoks
  while i < last do
    local nexttok = stoks[i + 1]
    if nexttok == "TK_EOL" or nexttok == "TK_EOS" then
      return true
    elseif nexttok ~= "" then
      return false
    end
    i = i + 1          -- skip a deleted token and look further right
  end
  return true          -- end of stream counts as end of line
end
88 | ||
89 | ------------------------------------------------------------------------ | |
90 | -- counts comment EOLs inside a long comment | |
91 | -- * in order to keep line numbering, EOLs need to be reinserted | |
92 | ------------------------------------------------------------------------ | |
93 | ||
-- Counts the embedded end-of-line sequences inside a long comment;
-- a CRLF or LFCR pair counts as a single EOL.  The caller reinserts
-- this many newlines to preserve line numbering.
local function commenteols(lcomment)
  local sep = #match(lcomment, "^%-%-%[=*%[")    -- opening delim length
  local body = sub(lcomment, sep + 1, -(sep - 1)) -- strip both delims
  local count, pos = 0, 1
  repeat
    local p, _, r, s = find(body, "([\r\n])([\r\n]?)", pos)
    if p then
      count = count + 1
      pos = p + 1
      if #s > 0 and r ~= s then  -- CRLF or LFCR is one line ending
        pos = pos + 1
      end
    end
  until not p
  return count
end
109 | ||
110 | ------------------------------------------------------------------------ | |
111 | -- compares two tokens (i, j) and returns the whitespace required | |
112 | -- * important! see technotes.txt for more information | |
113 | -- * only two grammar/real tokens are being considered | |
114 | -- * if "", no separation is needed | |
115 | -- * if " ", then at least one whitespace (or EOL) is required | |
116 | ------------------------------------------------------------------------ | |
117 | ||
-- Compares two tokens (positions i, j) and returns the whitespace
-- required between them so that re-lexing the output keeps the same
-- token boundaries (see technotes.txt):
--   ""  -> no separation needed
--   " " -> at least one whitespace (or EOL) is required
-- Only grammar (real) tokens are considered here.
local function checkpair(i, j)
  local match = match
  local t1, t2 = stoks[i], stoks[j]
  --------------------------------------------------------------------
  if t1 == "TK_STRING" or t1 == "TK_LSTRING" or
     t2 == "TK_STRING" or t2 == "TK_LSTRING" then
    return ""            -- string delimiters separate by themselves
  --------------------------------------------------------------------
  elseif t1 == "TK_OP" or t2 == "TK_OP" then
    if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or
       (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then
      return ""          -- operator/word pairs always abut safely
    end
    if t1 == "TK_OP" and t2 == "TK_OP" then
      -- for TK_OP/TK_OP pairs, see notes in technotes.txt
      local op, op2 = sinfos[i], sinfos[j]
      if (match(op, "^%.%.?$") and match(op2, "^%.")) or
         (match(op, "^[~=<>]$") and op2 == "=") or
         (op == "[" and (op2 == "[" or op2 == "=")) or
         -- FIX: two '-' operators (e.g. "a - -b") must never abut,
         -- since "--" starts a comment and would swallow the rest
         (op == "-" and op2 == "-") then
        return " "
      end
      return ""
    end
    -- "TK_OP" + "TK_NUMBER" case: a dot-like operator next to a
    -- number would be mis-lexed as part of the number
    local op = sinfos[i]
    if t2 == "TK_OP" then op = sinfos[j] end
    if match(op, "^%.%.?%.?$") then
      return " "
    end
    return ""
  --------------------------------------------------------------------
  else-- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER"
    return " "           -- two words/numbers would fuse into one token
  --------------------------------------------------------------------
  end
end
154 | ||
155 | ------------------------------------------------------------------------ | |
156 | -- repack tokens, removing deletions caused by optimization process | |
157 | ------------------------------------------------------------------------ | |
158 | ||
-- Rebuilds the three source lists, dropping tokens that earlier
-- optimization steps marked as deleted (token type "").
local function repack_tokens()
  local ntoks, ninfos, nlns = {}, {}, {}
  local n = 0
  for i = 1, #stoks do
    local tok = stoks[i]
    if tok ~= "" then              -- keep only surviving tokens
      n = n + 1
      ntoks[n], ninfos[n], nlns[n] = tok, sinfos[i], stoklns[i]
    end
  end
  stoks, sinfos, stoklns = ntoks, ninfos, nlns
end
171 | ||
172 | ------------------------------------------------------------------------ | |
173 | -- number optimization | |
174 | -- * optimization using string formatting functions is one way of doing | |
175 | -- this, but here, we consider all cases and handle them separately | |
176 | -- (possibly an idiotic approach...) | |
177 | -- * scientific notation being generated is not in canonical form, this | |
178 | -- may or may not be a bad thing, feedback welcome | |
179 | -- * note: intermediate portions need to fit into a normal number range | |
180 | -- * optimizations can be divided based on number patterns: | |
181 | -- * hexadecimal: | |
182 | -- (1) no need to remove leading zeros, just skip to (2) | |
183 | -- (2) convert to integer if size equal or smaller | |
184 | -- * change if equal size -> lose the 'x' to reduce entropy | |
185 | -- (3) number is then processed as an integer | |
186 | -- (4) note: does not make 0[xX] consistent | |
187 | -- * integer: | |
188 | -- (1) note: includes anything with trailing ".", ".0", ... | |
189 | -- (2) remove useless fractional part, if present, e.g. 123.000 | |
190 | -- (3) remove leading zeros, e.g. 000123 | |
191 | -- (4) switch to scientific if shorter, e.g. 123000 -> 123e3 | |
192 | -- * with fraction: | |
193 | -- (1) split into digits dot digits | |
194 | -- (2) if no integer portion, take as zero (can omit later) | |
195 | -- (3) handle degenerate .000 case, after which the fractional part | |
196 | -- must be non-zero (if zero, it's matched as an integer) | |
197 | -- (4) remove trailing zeros for fractional portion | |
198 | -- (5) p.q where p > 0 and q > 0 cannot be shortened any more | |
199 | -- (6) otherwise p == 0 and the form is .q, e.g. .000123 | |
200 | -- (7) if scientific shorter, convert, e.g. .000123 -> 123e-6 | |
201 | -- * scientific: | |
202 | -- (1) split into (digits dot digits) [eE] ([+-] digits) | |
203 | -- (2) if significand has ".", shift it out so it becomes an integer | |
204 | -- (3) if significand is zero, just use zero | |
205 | -- (4) remove leading zeros for significand | |
206 | -- (5) shift out trailing zeros for significand | |
207 | -- (6) examine exponent and determine which format is best: | |
208 | -- integer, with fraction, scientific | |
209 | ------------------------------------------------------------------------ | |
210 | ||
-- Rewrites the number token at position i into its shortest equivalent
-- textual form (see the NOTES block above for the full case analysis).
-- Reads sinfos[i]/stoklns[i]; writes sinfos[i] only when a strictly
-- different (better) spelling was found.
local function do_number(i)
  local before = sinfos[i]          -- 'before'
  local z = before                  -- working representation
  local y                           -- 'after', if better
  --------------------------------------------------------------------
  if match(z, "^0[xX]") then        -- hexadecimal number
    local v = base.tostring(base.tonumber(z))
    if #v <= #z then
      z = v                         -- change to integer, AND continue
    else
      return                        -- no change; stick to hex
    end
  end
  --------------------------------------------------------------------
  if match(z, "^%d+%.?0*$") then    -- integer or has useless frac
    z = match(z, "^(%d+)%.?0*$")    -- int portion only
    if z + 0 > 0 then
      z = match(z, "^0*([1-9]%d*)$") -- remove leading zeros
      local v = #match(z, "0*$")     -- count of trailing zeros
      local nv = base.tostring(v)
      if v > #nv + 1 then           -- scientific is shorter
        z = sub(z, 1, #z - v).."e"..nv
      end
      y = z
    else
      y = "0"                       -- basic zero
    end
  --------------------------------------------------------------------
  elseif not match(z, "[eE]") then  -- number with fraction part
    local p, q = match(z, "^(%d*)%.(%d+)$") -- split
    if p == "" then p = 0 end       -- int part zero
    -- NOTE: p may still be the *string* "0" here, so "p == 0" below is
    -- false for it; such inputs (e.g. "0.000") were already handled by
    -- the integer case above, so q has a non-zero digit at this point
    if q + 0 == 0 and p == 0 then
      y = "0"                       -- degenerate .000 case
    else
      -- now, q > 0 holds and p is a number
      local v = #match(q, "0*$")    -- remove trailing zeros
      if v > 0 then
        q = sub(q, 1, #q - v)
      end
      -- if p > 0, nothing else we can do to simplify p.q case
      if p + 0 > 0 then
        y = p.."."..q
      else
        y = "."..q                  -- tentative, e.g. .000123
        local v = #match(q, "^0*")  -- # leading zeros
        local w = #q - v            -- # significant digits
        local nv = base.tostring(#q)
        -- e.g. compare 123e-6 versus .000123
        if w + 2 + #nv < 1 + #q then
          y = sub(q, -w).."e-"..nv
        end
      end
    end
  --------------------------------------------------------------------
  else -- scientific number
    local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$")
    ex = base.tonumber(ex)
    -- if got ".", shift out fractional portion of significand
    local p, q = match(sig, "^(%d*)%.(%d*)$")
    if p then
      ex = ex - #q
      sig = p..q
    end
    if sig + 0 == 0 then
      y = "0"                       -- basic zero
    else
      local v = #match(sig, "^0*")  -- remove leading zeros
      sig = sub(sig, v + 1)
      v = #match(sig, "0*$")        -- shift out trailing zeros
      if v > 0 then
        sig = sub(sig, 1, #sig - v)
        ex = ex + v
      end
      -- examine exponent and determine which format is best:
      -- integer, with fraction, or (non-canonical) scientific
      local nex = base.tostring(ex)
      if ex == 0 then               -- it's just an integer
        y = sig
      elseif ex > 0 and (ex <= 1 + #nex) then -- plain integer is shorter
        y = sig..rep("0", ex)
      elseif ex < 0 and (ex >= -#sig) then -- fraction, e.g. .123
        v = #sig + ex
        y = sub(sig, 1, v).."."..sub(sig, v + 1)
      elseif ex < 0 and (#nex >= -ex - #sig) then
        -- e.g. compare 1234e-5 versus .01234
        -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig
        -- -> #nex >= -ex - #sig
        v = -ex - #sig
        y = "."..rep("0", v)..sig
      else -- non-canonical scientific representation
        y = sig.."e"..ex
      end
    end--if sig
  end
  --------------------------------------------------------------------
  -- commit the change and optionally report it
  if y and y ~= sinfos[i] then
    if opt_details then
      print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y)
      opt_details = opt_details + 1
    end
    sinfos[i] = y
  end
end
313 | ||
314 | ------------------------------------------------------------------------ | |
315 | -- string optimization | |
316 | -- * note: works on well-formed strings only! | |
317 | -- * optimizations on characters can be summarized as follows: | |
318 | -- \a\b\f\n\r\t\v -- no change | |
319 | -- \\ -- no change | |
320 | -- \"\' -- depends on delim, other can remove \ | |
321 | -- \[\] -- remove \ | |
322 | -- \<char> -- general escape, remove \ | |
323 | -- \<eol> -- normalize the EOL only | |
324 | -- \ddd -- if \a\b\f\n\r\t\v, change to latter | |
325 | -- if other < ascii 32, keep ddd but zap leading zeros | |
326 | -- if >= ascii 32, translate it into the literal, then also | |
327 | -- do escapes for \\,\",\' cases | |
328 | -- <other> -- no change | |
329 | -- * switch delimiters if string becomes shorter | |
330 | ------------------------------------------------------------------------ | |
331 | ||
-- Optimizes the short-string token at position I in place (sinfos[I]).
-- Works on well-formed strings only.  Escapes are canonicalized as
-- described in the comment block above, and the quote delimiter is
-- switched (" <-> ') when that makes the token shorter.
local function do_string(I)
  local info = sinfos[I]
  local delim = sub(info, 1, 1)                  -- delimiter used
  local ndelim = (delim == "'") and '"' or "'"   -- opposite " <-> '
  local z = sub(info, 2, -2)                     -- actual string
  local i = 1
  local c_delim, c_ndelim = 0, 0                 -- "/' occurrence counts
  --------------------------------------------------------------------
  while i <= #z do
    local c = sub(z, i, i)
    ----------------------------------------------------------------
    if c == "\\" then -- escaped stuff
      local j = i + 1
      local d = sub(z, j, j)
      local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true)
      ------------------------------------------------------------
      if not p then -- \<char> -- remove \
        z = sub(z, 1, i - 1)..sub(z, j)
        i = i + 1
      ------------------------------------------------------------
      elseif p <= 8 then -- \a\b\f\n\r\t\v\\
        i = i + 2 -- no change
      ------------------------------------------------------------
      elseif p <= 10 then -- \<eol> -- normalize EOL to \<newline>
        local eol = sub(z, j, j + 1)
        if eol == "\r\n" or eol == "\n\r" then
          z = sub(z, 1, i).."\n"..sub(z, j + 2)
        elseif p == 10 then -- \r case
          z = sub(z, 1, i).."\n"..sub(z, j + 1)
        end
        i = i + 2
      ------------------------------------------------------------
      elseif p <= 12 then -- \"\' -- remove \ for the non-delimiter
        if d == delim then
          c_delim = c_delim + 1
          i = i + 2
        else
          c_ndelim = c_ndelim + 1
          z = sub(z, 1, i - 1)..sub(z, j)
          i = i + 1
        end
      ------------------------------------------------------------
      else -- \ddd -- various steps
        local s = match(z, "^(%d%d?%d?)", j)
        j = i + 1 + #s                 -- skip to location after digits
        local cv = s + 0               -- decimal character value
        local cc = string.char(cv)
        local p = find("\a\b\f\n\r\t\v", cc, 1, true)
        if p then -- control char with a named escape
          s = "\\"..sub("abfnrtv", p, p)
        elseif cv < 32 then -- normalized \ddd
          -- FIX: if a digit follows the escape, \ddd cannot drop its
          -- leading zeros -- "\0065" must not become "\65" ("A")
          if match(sub(z, j, j), "%d") then
            s = "\\"..s                -- keep digits exactly as-is
          else
            s = "\\"..cv               -- zap leading zeros
          end
        elseif cc == delim then -- \<delim>
          s = "\\"..cc
          c_delim = c_delim + 1
        elseif cc == "\\" then -- \\
          s = "\\\\"
        else -- printable: use the literal character
          s = cc
          if cc == ndelim then
            c_ndelim = c_ndelim + 1
          end
        end
        z = sub(z, 1, i - 1)..s..sub(z, j)
        i = i + #s
      ------------------------------------------------------------
      end--if p
    ----------------------------------------------------------------
    else-- c ~= "\\" -- <other> -- no change
      i = i + 1
      if c == ndelim then -- count ndelim, for switching delimiters
        c_ndelim = c_ndelim + 1
      end
    ----------------------------------------------------------------
    end--if c
  end--while
  --------------------------------------------------------------------
  -- switching delimiters, a long-winded derivation:
  -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes
  -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes
  -- simplifying the condition (1)>(2) --> c_delim > c_ndelim
  if c_delim > c_ndelim then
    i = 1
    while i <= #z do
      local p, q, r = find(z, "([\'\"])", i)
      if not p then break end
      if r == delim then -- \<delim> -> <delim> (drop the backslash)
        z = sub(z, 1, p - 2)..sub(z, p)
        i = p
      else-- r == ndelim -- <ndelim> -> \<ndelim>
        z = sub(z, 1, p - 1).."\\"..sub(z, p)
        i = p + 2
      end
    end--while
    delim = ndelim -- actually change delimiters
  end
  --------------------------------------------------------------------
  z = delim..z..delim
  if z ~= sinfos[I] then
    if opt_details then
      print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z)
      opt_details = opt_details + 1
    end
    sinfos[I] = z
  end
end
438 | ||
439 | ------------------------------------------------------------------------ | |
440 | -- long string optimization | |
441 | -- * note: warning flagged if trailing whitespace found, not trimmed | |
442 | -- * remove first optional newline | |
443 | -- * normalize embedded newlines | |
444 | -- * reduce '=' separators in delimiters if possible | |
445 | ------------------------------------------------------------------------ | |
446 | ||
-- Optimizes the long-string token at position I in place (sinfos[I]):
-- normalizes embedded line endings to "\n" and reduces '=' separators
-- in the delimiters where possible.  Sets warn.lstring (but does NOT
-- trim) when trailing whitespace is found, since trimming would change
-- the string's value.
local function do_lstring(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%[=*%[") -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)    -- closing delim: same length
  local z = sub(info, sep + 1, -(sep + 1)) -- lstring without delims
  local y = ""                          -- rebuilt string body
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- flag a warning if there are trailing spaces, won't optimize!
      if match(ln, "%s+$") then
        warn.lstring = "trailing whitespace in long string near line "..stoklns[I]
      end
      y = y..ln
    end
    if not p then -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then -- skip CRLF or LFCR
        i = i + 1
      end
      -- skip first newline, which can be safely deleted
      -- NOTE(review): this condition is always true -- i was just set
      -- to p + 1, so "i == 1 and i == p" can never hold; the first
      -- newline is therefore always kept.  That is safe (keeping it
      -- never changes the string's value), just a missed optimization.
      if not(i == 1 and i == p) then
        y = y.."\n"
      end
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    -- NOTE(review): this only checks for the closing pattern *inside*
    -- y; if y ends with "]" (or "]=..."), the shortened closing
    -- delimiter could merge with it and close early -- TODO confirm
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then -- change delimiters to the shortest safe level
      sep = rep("=", okay - 2)
      delim1, delim2 = "["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end
505 | ||
506 | ------------------------------------------------------------------------ | |
507 | -- long comment optimization | |
508 | -- * note: does not remove first optional newline | |
509 | -- * trim trailing whitespace | |
510 | -- * normalize embedded newlines | |
511 | -- * reduce '=' separators in delimiters if possible | |
512 | ------------------------------------------------------------------------ | |
513 | ||
-- Optimizes the long-comment token at position I in place (sinfos[I]):
-- trims trailing whitespace on each line, normalizes embedded line
-- endings to "\n", and reduces '=' separators in the delimiters where
-- possible.  The first optional newline is not removed.
local function do_lcomment(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%-%-%[=*%[") -- cut out delimiters
  local sep = #delim1
  -- FIX: the closing delimiter has no "--", so it is (sep - 2) chars
  -- long; taking the last 'sep' chars duplicated two content bytes
  local delim2 = sub(info, -(sep - 2), -1)
  local z = sub(info, sep + 1, -(sep - 1))  -- comment without delims
  local y = ""                              -- rebuilt comment body
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line, extract and check trailing whitespace
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- trim trailing whitespace if non-empty line
      local ws = match(ln, "%s*$")
      -- FIX: trim by the run's *length* (#ws); "ws + 1" attempted
      -- arithmetic on the whitespace string and raised a runtime error
      if #ws > 0 then ln = sub(ln, 1, -(#ws + 1)) end
      y = y..ln
    end
    if not p then -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then -- skip CRLF or LFCR
        i = i + 1
      end
      y = y.."\n"
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  sep = sep - 2                     -- level counts exclude the "--"
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      -- FIX: also reject a level whose closing delimiter would merge
      -- with a "]=*" at the very end of y and close the comment early
      if not match(y, delim) and
         not match(y, "%]"..rep("=", chk - 2).."$") then
        okay = chk
      end
      chk = chk - 1
    end
    if okay then -- change delimiters to the shortest safe level
      sep = rep("=", okay - 2)
      delim1, delim2 = "--["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end
569 | ||
570 | ------------------------------------------------------------------------ | |
571 | -- short comment optimization | |
572 | -- * trim trailing whitespace | |
573 | ------------------------------------------------------------------------ | |
574 | ||
-- Trims trailing whitespace from the short-comment token at position i,
-- updating sinfos[i] in place.
local function do_comment(i)
  local info = sinfos[i]
  local ws = match(info, "%s*$")    -- trailing whitespace run
  -- FIX: trim by the run's *length* (#ws); "ws + 1" attempted
  -- arithmetic on the whitespace string and raised a runtime error
  if #ws > 0 then
    info = sub(info, 1, -(#ws + 1)) -- trim trailing whitespace
  end
  sinfos[i] = info
end
583 | ||
584 | ------------------------------------------------------------------------ | |
585 | -- returns true if string found in long comment | |
586 | -- * this is a feature to keep copyright or license texts | |
587 | ------------------------------------------------------------------------ | |
588 | ||
-- Returns true if the plain-text marker opt_keep is found inside the
-- long comment info (a feature to keep copyright or license texts);
-- returns false when the option is unset, nil otherwise.
local function keep_lcomment(opt_keep, info)
  if not opt_keep then return false end     -- option not set
  local sep = #match(info, "^%-%-%[=*%[")   -- opening delimiter length
  local z = sub(info, sep + 1, -(sep - 1))  -- comment without delims
  if find(z, opt_keep, 1, true) then        -- plain-text match only
    return true
  end
end
599 | ||
600 | ------------------------------------------------------------------------ | |
601 | -- main entry point | |
602 | -- * currently, lexer processing has 2 passes | |
603 | -- * processing is done on a line-oriented basis, which is easier to | |
604 | -- grok due to the next point... | |
605 | -- * since there are various options that can be enabled or disabled, | |
606 | -- processing is a little messy or convoluted | |
607 | ------------------------------------------------------------------------ | |
608 | ||
function optimize(option, toklist, semlist, toklnlist)
  --------------------------------------------------------------------
  -- set option flags
  --------------------------------------------------------------------
  local opt_comments = option["opt-comments"]
  local opt_whitespace = option["opt-whitespace"]
  local opt_emptylines = option["opt-emptylines"]
  local opt_eols = option["opt-eols"]
  local opt_strings = option["opt-strings"]
  local opt_numbers = option["opt-numbers"]
  local opt_keep = option.KEEP
  opt_details = option.DETAILS and 0  -- upvalues for details display
  print = print or base.print
  if opt_eols then  -- forced settings, otherwise won't work properly
    opt_comments = true
    opt_whitespace = true
    opt_emptylines = true
  end
  --------------------------------------------------------------------
  -- variable initialization
  --------------------------------------------------------------------
  stoks, sinfos, stoklns              -- set source lists
    = toklist, semlist, toklnlist
  local i = 1                         -- token position
  local tok, info                     -- current token
  local prev                          -- position of last grammar token
                                      -- on same line (for TK_SPACE stuff)
  --------------------------------------------------------------------
  -- changes a token, info pair; with no arguments it marks the token
  -- as deleted (type ""); repack_tokens() removes deleted slots later
  --------------------------------------------------------------------
  local function settoken(tok, info, I)
    I = I or i
    stoks[I] = tok or ""
    sinfos[I] = info or ""
  end
  --------------------------------------------------------------------
  -- processing loop (PASS 1)
  --------------------------------------------------------------------
  while true do
    tok, info = stoks[i], sinfos[i]
    ----------------------------------------------------------------
    local atstart = atlinestart(i)    -- set line begin flag
    if atstart then prev = nil end
    ----------------------------------------------------------------
    if tok == "TK_EOS" then           -- end of stream/pass
      break
    ----------------------------------------------------------------
    elseif tok == "TK_KEYWORD" or     -- keywords, identifiers,
           tok == "TK_NAME" or       -- operators
           tok == "TK_OP" then
      -- TK_KEYWORD and TK_OP can't be optimized without a big
      -- optimization framework; it would be more of an optimizing
      -- compiler, not a source code compressor
      -- TK_NAME that are locals needs parser to analyze/optimize
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_NUMBER" then    -- numbers
      if opt_numbers then
        do_number(i)                  -- optimize
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_STRING" or      -- strings, long strings
           tok == "TK_LSTRING" then
      if opt_strings then
        if tok == "TK_STRING" then
          do_string(i)                -- optimize
        else
          do_lstring(i)               -- optimize
        end
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_COMMENT" then   -- short comments
      if opt_comments then
        if i == 1 and sub(info, 1, 1) == "#" then
          -- keep shbang comment, trim whitespace
          do_comment(i)
        else
          -- safe to delete, as a TK_EOL (or TK_EOS) always follows
          settoken()                  -- remove entirely
        end
      elseif opt_whitespace then      -- trim whitespace only
        do_comment(i)
      end
    ----------------------------------------------------------------
    elseif tok == "TK_LCOMMENT" then  -- long comments
      if keep_lcomment(opt_keep, info) then
        ------------------------------------------------------------
        -- if --keep, we keep a long comment if <msg> is found;
        -- this is a feature to keep copyright or license texts
        if opt_whitespace then        -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      elseif opt_comments then
        local eols = commenteols(info)
        ------------------------------------------------------------
        -- prepare opt_emptylines case first, if a disposable token
        -- follows, current one is safe to dump, else keep a space;
        -- it is implied that the operation is safe for '-', because
        -- current is a TK_LCOMMENT, and must be separate from a '-'
        if is_faketoken[stoks[i + 1]] then
          settoken()                  -- remove entirely
          tok = ""
        else
          settoken("TK_SPACE", " ")
        end
        ------------------------------------------------------------
        -- if there are embedded EOLs to keep and opt_emptylines is
        -- disabled, then switch the token into one or more EOLs
        if not opt_emptylines and eols > 0 then
          settoken("TK_EOL", rep("\n", eols))
        end
        ------------------------------------------------------------
        -- if optimizing whitespaces, force reinterpretation of the
        -- token to give a chance for the space to be optimized away
        if opt_whitespace and tok ~= "" then
          i = i - 1                   -- to reinterpret
        end
      ------------------------------------------------------------
      else -- disabled case
        if opt_whitespace then        -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      end
    ----------------------------------------------------------------
    elseif tok == "TK_EOL" then       -- line endings
      if atstart and opt_emptylines then
        settoken()                    -- remove entirely
      elseif info == "\r\n" or info == "\n\r" then
        -- normalize the rest of the EOLs for CRLF/LFCR only
        -- (note that TK_LCOMMENT can change into several EOLs)
        settoken("TK_EOL", "\n")
      end
    ----------------------------------------------------------------
    elseif tok == "TK_SPACE" then     -- whitespace
      if opt_whitespace then
        if atstart or atlineend(i) then
          -- delete leading and trailing whitespace
          settoken()                  -- remove entirely
        else
          ------------------------------------------------------------
          -- at this point, since leading whitespace have been removed,
          -- there should be a either a real token or a TK_LCOMMENT
          -- prior to hitting this whitespace; the TK_LCOMMENT case
          -- only happens if opt_comments is disabled; so prev ~= nil
          local ptok = stoks[prev]
          if ptok == "TK_LCOMMENT" then
            -- previous TK_LCOMMENT can abut with anything
            settoken()                -- remove entirely
          else
            -- prev must be a grammar token; consecutive TK_SPACE
            -- tokens is impossible when optimizing whitespace
            local ntok = stoks[i + 1]
            if is_faketoken[ntok] then
              -- handle special case where a '-' cannot abut with
              -- either a short comment or a long comment
              if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and
                 ptok == "TK_OP" and sinfos[prev] == "-" then
                -- keep token
              else
                settoken()            -- remove entirely
              end
            else--is_realtoken
              -- check a pair of grammar tokens, if can abut, then
              -- delete space token entirely, otherwise keep one space
              local s = checkpair(prev, i + 1)
              if s == "" then
                settoken()            -- remove entirely
              else
                settoken("TK_SPACE", " ")
              end
            end
          end
          ------------------------------------------------------------
        end
      end
    ----------------------------------------------------------------
    else
      error("unidentified token encountered")
    end
    ----------------------------------------------------------------
    i = i + 1
  end--while
  repack_tokens()
  --------------------------------------------------------------------
  -- processing loop (PASS 2)
  --------------------------------------------------------------------
  if opt_eols then
    i = 1
    -- aggressive EOL removal only works with most non-grammar tokens
    -- optimized away because it is a rather simple scheme -- basically
    -- it just checks 'real' token pairs around EOLs
    if stoks[1] == "TK_COMMENT" then
      -- first comment still existing must be shbang, skip whole line
      -- (token 1 is the comment, token 2 its TK_EOL)
      i = 3
    end
    while true do
      tok, info = stoks[i], sinfos[i]
      --------------------------------------------------------------
      if tok == "TK_EOS" then         -- end of stream/pass
        break
      --------------------------------------------------------------
      elseif tok == "TK_EOL" then     -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" then
            settoken()                -- remove entirely
          end
        end
      end--if tok
      --------------------------------------------------------------
      i = i + 1
    end--while
    repack_tokens()
  end
  --------------------------------------------------------------------
  if opt_details and opt_details > 0 then print() end -- spacing
  return stoks, sinfos, stoklns
end
93
07c10f9ba77c
minify: Return _M from modules that weren't
Matthew Wild <mwild1@gmail.com>
parents:
1
diff
changeset
|
832 | |
99
2b6416334a25
A range of fixes for Lua 5.2 support
Matthew Wild <mwild1@gmail.com>
parents:
93
diff
changeset
|
833 | return {optimize = optimize} |