|
1 --[[-------------------------------------------------------------------- |
|
2 |
|
3 optlex.lua: does lexer-based optimizations |
|
4 This file is part of LuaSrcDiet. |
|
5 |
|
6 Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net> |
|
7 The COPYRIGHT file describes the conditions |
|
8 under which this software may be distributed. |
|
9 |
|
10 See the ChangeLog for more information. |
|
11 |
|
12 ----------------------------------------------------------------------]] |
|
13 |
|
14 --[[-------------------------------------------------------------------- |
|
15 -- NOTES: |
|
16 -- * For more lexer-based optimization ideas, see the TODO items or |
|
17 -- look at technotes.txt. |
|
18 -- * TODO: general string delimiter conversion optimizer |
|
19 -- * TODO: (numbers) warn if overly significant digit |
|
20 ----------------------------------------------------------------------]] |
|
21 |
|
-- cache the global environment: module() below replaces the chunk's
-- environment, so globals must be reached through 'base' afterwards
local base = _G
local string = require "string"
-- NOTE: module() is the Lua 5.0/5.1 module idiom (deprecated in 5.2+);
-- everything assigned to a global name below becomes a module field
module "optlex"
-- localized string functions for the hot loops below
local match = string.match
local sub = string.sub
local find = string.find
local rep = string.rep
-- forward declaration; assigned inside optimize() (option.PRINT or base.print)
local print
|
30 |
|
------------------------------------------------------------------------
-- variables and data structures
------------------------------------------------------------------------

-- error function, can override by setting own function into module
error = base.error

-- table for warning flags; do_lstring() sets warn.lstring when a long
-- string contains trailing whitespace that cannot be optimized away
warn = {}

-- source lists shared by all helpers: token types, token text/info,
-- and token line numbers; bound in optimize()
local stoks, sinfos, stoklns

local is_realtoken = {          -- significant (grammar) tokens
  TK_KEYWORD = true,
  TK_NAME = true,
  TK_NUMBER = true,
  TK_STRING = true,
  TK_LSTRING = true,
  TK_OP = true,
  TK_EOS = true,
}
local is_faketoken = {          -- whitespace (non-grammar) tokens
  TK_COMMENT = true,
  TK_LCOMMENT = true,
  TK_EOL = true,
  TK_SPACE = true,
}

-- when option.DETAILS is set this holds a count of reported
-- optimizations (starts at 0); otherwise nil/false (reporting disabled)
local opt_details
|
60 ------------------------------------------------------------------------ |
|
61 -- true if current token is at the start of a line |
|
62 -- * skips over deleted tokens via recursion |
|
63 ------------------------------------------------------------------------ |
|
64 |
|
-- Returns true when the token at position i begins a line.
-- Deleted tokens (type "") are transparent: walk backwards past them
-- iteratively instead of recursing.
local function atlinestart(i)
  while i > 1 do
    local prevtok = stoks[i - 1]
    if prevtok == "TK_EOL" then
      return true
    elseif prevtok ~= "" then
      return false
    end
    i = i - 1                   -- skip a deleted token
  end
  return true                   -- fell off the front of the stream
end
|
74 |
|
75 ------------------------------------------------------------------------ |
|
76 -- true if current token is at the end of a line |
|
77 -- * skips over deleted tokens via recursion |
|
78 ------------------------------------------------------------------------ |
|
79 |
|
-- Returns true when the token at position i ends a line.
-- Deleted tokens (type "") are transparent: walk forwards past them
-- iteratively instead of recursing.
local function atlineend(i)
  while i < #stoks do
    local nexttok = stoks[i + 1]
    if nexttok == "TK_EOL" or nexttok == "TK_EOS" then
      return true
    elseif nexttok ~= "" then
      return false
    end
    i = i + 1                   -- skip a deleted token
  end
  return true                   -- fell off the end of the stream
end
|
89 |
|
90 ------------------------------------------------------------------------ |
|
91 -- counts comment EOLs inside a long comment |
|
92 -- * in order to keep line numbering, EOLs need to be reinserted |
|
93 ------------------------------------------------------------------------ |
|
94 |
|
-- Counts the number of line endings embedded in a long comment,
-- treating CRLF / LFCR pairs as a single ending. The count is used
-- to reinsert TK_EOLs so line numbering is preserved.
local function commenteols(lcomment)
  -- strip the "--[=*[" header and the matching "]=*]" trailer
  local sep = #match(lcomment, "^%-%-%[=*%[")
  local body = sub(lcomment, sep + 1, -(sep - 1))
  local count, pos = 0, 1
  repeat
    local p, _, first, second = find(body, "([\r\n])([\r\n]?)", pos)
    if p then
      count = count + 1
      if second ~= "" and first ~= second then
        pos = p + 2             -- CRLF or LFCR: consume both characters
      else
        pos = p + 1             -- lone EOL (or doubled same char)
      end
    end
  until not p
  return count
end
|
110 |
|
111 ------------------------------------------------------------------------ |
|
112 -- compares two tokens (i, j) and returns the whitespace required |
|
113 -- * important! see technotes.txt for more information |
|
114 -- * only two grammar/real tokens are being considered |
|
115 -- * if "", no separation is needed |
|
116 -- * if " ", then at least one whitespace (or EOL) is required |
|
117 ------------------------------------------------------------------------ |
|
118 |
|
-- Compares two grammar tokens at positions (i, j) and returns the
-- whitespace required between them: "" when they may abut, " " when at
-- least one space (or EOL) must separate them to keep the lexer from
-- merging them into a different token. See technotes.txt for details.
local function checkpair(i, j)
  local match = match
  local t1, t2 = stoks[i], stoks[j]
  --------------------------------------------------------------------
  -- a string or long string can abut with anything
  if t1 == "TK_STRING" or t1 == "TK_LSTRING" or
     t2 == "TK_STRING" or t2 == "TK_LSTRING" then
    return ""
  --------------------------------------------------------------------
  elseif t1 == "TK_OP" or t2 == "TK_OP" then
    -- an operator can abut with a keyword or name on either side
    if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or
       (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then
      return ""
    end
    if t1 == "TK_OP" and t2 == "TK_OP" then
      -- for TK_OP/TK_OP pairs, see notes in technotes.txt
      -- dangerous pairs: "."/".." before ".", relational ops before
      -- "=" (would form ==, ~=, <=, >=), "[" before "[" or "="
      -- (would open a long-bracket)
      local op, op2 = sinfos[i], sinfos[j]
      if (match(op, "^%.%.?$") and match(op2, "^%.")) or
         (match(op, "^[~=<>]$") and op2 == "=") or
         (op == "[" and (op2 == "[" or op2 == "=")) then
        return " "
      end
      return ""
    end
    -- "TK_OP" + "TK_NUMBER" case: a dot-operator next to a number
    -- could be absorbed into the number literal, so keep a space
    local op = sinfos[i]
    if t2 == "TK_OP" then op = sinfos[j] end
    if match(op, "^%.%.?%.?$") then
      return " "
    end
    return ""
  --------------------------------------------------------------------
  else-- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER" then
    -- two word-like tokens must always be separated
    return " "
  --------------------------------------------------------------------
  end
end
|
155 |
|
156 ------------------------------------------------------------------------ |
|
157 -- repack tokens, removing deletions caused by optimization process |
|
158 ------------------------------------------------------------------------ |
|
159 |
|
-- Rebuilds the three source lists, dropping tokens that earlier passes
-- marked as deleted (token type set to ""), and rebinds the lists.
local function repack_tokens()
  local ntoks, ninfos, nlns = {}, {}, {}
  local n = 0
  for i = 1, #stoks do
    local t = stoks[i]
    if t ~= "" then             -- keep only surviving tokens
      n = n + 1
      ntoks[n], ninfos[n], nlns[n] = t, sinfos[i], stoklns[i]
    end
  end
  stoks, sinfos, stoklns = ntoks, ninfos, nlns
end
|
172 |
|
173 ------------------------------------------------------------------------ |
|
174 -- number optimization |
|
175 -- * optimization using string formatting functions is one way of doing |
|
176 -- this, but here, we consider all cases and handle them separately |
|
177 -- (possibly an idiotic approach...) |
|
178 -- * scientific notation being generated is not in canonical form, this |
|
179 -- may or may not be a bad thing, feedback welcome |
|
180 -- * note: intermediate portions need to fit into a normal number range |
|
181 -- * optimizations can be divided based on number patterns: |
|
182 -- * hexadecimal: |
|
183 -- (1) no need to remove leading zeros, just skip to (2) |
|
184 -- (2) convert to integer if size equal or smaller |
|
185 -- * change if equal size -> lose the 'x' to reduce entropy |
|
186 -- (3) number is then processed as an integer |
|
187 -- (4) note: does not make 0[xX] consistent |
|
188 -- * integer: |
|
189 -- (1) note: includes anything with trailing ".", ".0", ... |
|
190 -- (2) remove useless fractional part, if present, e.g. 123.000 |
|
191 -- (3) remove leading zeros, e.g. 000123 |
|
192 -- (4) switch to scientific if shorter, e.g. 123000 -> 123e3 |
|
193 -- * with fraction: |
|
194 -- (1) split into digits dot digits |
|
195 -- (2) if no integer portion, take as zero (can omit later) |
|
196 -- (3) handle degenerate .000 case, after which the fractional part |
|
197 -- must be non-zero (if zero, it's matched as an integer) |
|
198 -- (4) remove trailing zeros for fractional portion |
|
199 -- (5) p.q where p > 0 and q > 0 cannot be shortened any more |
|
200 -- (6) otherwise p == 0 and the form is .q, e.g. .000123 |
|
201 -- (7) if scientific shorter, convert, e.g. .000123 -> 123e-6 |
|
202 -- * scientific: |
|
203 -- (1) split into (digits dot digits) [eE] ([+-] digits) |
|
204 -- (2) if significand has ".", shift it out so it becomes an integer |
|
205 -- (3) if significand is zero, just use zero |
|
206 -- (4) remove leading zeros for significand |
|
207 -- (5) shift out trailing zeros for significand |
|
208 -- (6) examine exponent and determine which format is best: |
|
209 -- integer, with fraction, scientific |
|
210 ------------------------------------------------------------------------ |
|
211 |
|
-- Optimizes a TK_NUMBER token in place (sinfos[i]) by rewriting the
-- literal into the shortest equivalent form; reports the change when
-- opt_details is enabled. See the banner above for the full strategy.
local function do_number(i)
  local before = sinfos[i]              -- 'before'
  local z = before                      -- working representation
  local y                               -- 'after', if better
  --------------------------------------------------------------------
  if match(z, "^0[xX]") then            -- hexadecimal number
    -- round-trip through tonumber/tostring to get the decimal form
    local v = base.tostring(base.tonumber(z))
    if #v <= #z then
      z = v                             -- change to integer, AND continue
    else
      return                            -- no change; stick to hex
    end
  end
  --------------------------------------------------------------------
  if match(z, "^%d+%.?0*$") then        -- integer or has useless frac
    z = match(z, "^(%d+)%.?0*$")        -- int portion only
    if z + 0 > 0 then
      z = match(z, "^0*([1-9]%d*)$")    -- remove leading zeros
      local v = #match(z, "0*$")        -- count of trailing zeros
      local nv = base.tostring(v)
      if v > #nv + 1 then               -- scientific is shorter
        z = sub(z, 1, #z - v).."e"..nv
      end
      y = z
    else
      y = "0"                           -- basic zero
    end
  --------------------------------------------------------------------
  elseif not match(z, "[eE]") then      -- number with fraction part
    local p, q = match(z, "^(%d*)%.(%d+)$")  -- split
    if p == "" then p = 0 end           -- int part zero
    if q + 0 == 0 and p == 0 then
      y = "0"                           -- degenerate .000 case
    else
      -- now, q > 0 holds and p is a number
      local v = #match(q, "0*$")        -- remove trailing zeros
      if v > 0 then
        q = sub(q, 1, #q - v)
      end
      -- if p > 0, nothing else we can do to simplify p.q case
      if p + 0 > 0 then
        y = p.."."..q
      else
        y = "."..q                      -- tentative, e.g. .000123
        local v = #match(q, "^0*")      -- # of leading zeros
        local w = #q - v                -- # of significant digits
        local nv = base.tostring(#q)    -- exponent magnitude as text
        -- e.g. compare 123e-6 versus .000123
        if w + 2 + #nv < 1 + #q then
          y = sub(q, -w).."e-"..nv
        end
      end
    end
  --------------------------------------------------------------------
  else                                  -- scientific number
    local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$")
    ex = base.tonumber(ex)
    -- if got ".", shift out fractional portion of significand
    local p, q = match(sig, "^(%d*)%.(%d*)$")
    if p then
      ex = ex - #q                      -- compensate exponent for shift
      sig = p..q
    end
    if sig + 0 == 0 then
      y = "0"                           -- basic zero
    else
      local v = #match(sig, "^0*")      -- remove leading zeros
      sig = sub(sig, v + 1)
      v = #match(sig, "0*$")            -- shift out trailing zeros
      if v > 0 then
        sig = sub(sig, 1, #sig - v)
        ex = ex + v                     -- compensate exponent
      end
      -- examine exponent and determine which format is best:
      -- plain integer, decimal fraction, or scientific notation
      local nex = base.tostring(ex)
      if ex == 0 then                   -- it's just an integer
        y = sig
      elseif ex > 0 and (ex <= 1 + #nex) then  -- a number
        y = sig..rep("0", ex)
      elseif ex < 0 and (ex >= -#sig) then  -- fraction, e.g. .123
        v = #sig + ex
        y = sub(sig, 1, v).."."..sub(sig, v + 1)
      elseif ex < 0 and (#nex >= -ex - #sig) then
        -- e.g. compare 1234e-5 versus .01234
        -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig
        -- -> #nex >= -ex - #sig
        v = -ex - #sig
        y = "."..rep("0", v)..sig
      else -- non-canonical scientific representation
        y = sig.."e"..ex
      end
    end--if sig
  end
  --------------------------------------------------------------------
  -- commit only if genuinely different; report when DETAILS is on
  if y and y ~= sinfos[i] then
    if opt_details then
      print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y)
      opt_details = opt_details + 1
    end
    sinfos[i] = y
  end
end
|
314 |
|
315 ------------------------------------------------------------------------ |
|
316 -- string optimization |
|
317 -- * note: works on well-formed strings only! |
|
318 -- * optimizations on characters can be summarized as follows: |
|
319 -- \a\b\f\n\r\t\v -- no change |
|
320 -- \\ -- no change |
|
321 -- \"\' -- depends on delim, other can remove \ |
|
322 -- \[\] -- remove \ |
|
323 -- \<char> -- general escape, remove \ |
|
324 -- \<eol> -- normalize the EOL only |
|
325 -- \ddd -- if \a\b\f\n\r\t\v, change to latter |
|
326 -- if other < ascii 32, keep ddd but zap leading zeros |
|
327 -- if >= ascii 32, translate it into the literal, then also |
|
328 -- do escapes for \\,\",\' cases |
|
329 -- <other> -- no change |
|
330 -- * switch delimiters if string becomes shorter |
|
331 ------------------------------------------------------------------------ |
|
332 |
|
-- Optimizes a TK_STRING token in place (sinfos[I]): simplifies escape
-- sequences, normalizes escaped EOLs, and switches the quote delimiter
-- when that yields a shorter literal. Works on well-formed strings only.
local function do_string(I)
  local info = sinfos[I]
  local delim = sub(info, 1, 1)                 -- delimiter used
  local ndelim = (delim == "'") and '"' or "'"  -- opposite " <-> '
  local z = sub(info, 2, -2)                    -- actual string
  local i = 1
  local c_delim, c_ndelim = 0, 0                -- "/' counts
  --------------------------------------------------------------------
  while i <= #z do
    local c = sub(z, i, i)
    ----------------------------------------------------------------
    if c == "\\" then                   -- escaped stuff
      local j = i + 1
      local d = sub(z, j, j)
      -- p indexes into this list: 1-7 = a b f n r t v, 8 = backslash,
      -- 9-10 = LF CR, 11-12 = " ', 13+ = decimal digit (\ddd form)
      local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true)
      ------------------------------------------------------------
      if not p then                     -- \<char> -- remove \
        z = sub(z, 1, i - 1)..sub(z, j)
        i = i + 1
      ------------------------------------------------------------
      elseif p <= 8 then                -- \a\b\f\n\r\t\v\\
        i = i + 2                       -- no change
      ------------------------------------------------------------
      elseif p <= 10 then               -- \<eol> -- normalize EOL
        local eol = sub(z, j, j + 1)
        if eol == "\r\n" or eol == "\n\r" then
          -- two-character EOL: collapse to an escaped LF
          z = sub(z, 1, i).."\n"..sub(z, j + 2)
        elseif p == 10 then             -- \r case
          z = sub(z, 1, i).."\n"..sub(z, j + 1)
        end
        i = i + 2
      ------------------------------------------------------------
      elseif p <= 12 then               -- \"\' -- remove \ for ndelim
        if d == delim then
          c_delim = c_delim + 1         -- must stay escaped
          i = i + 2
        else
          c_ndelim = c_ndelim + 1
          z = sub(z, 1, i - 1)..sub(z, j)  -- drop the backslash
          i = i + 1
        end
      ------------------------------------------------------------
      else                              -- \ddd -- various steps
        local s = match(z, "^(%d%d?%d?)", j)
        j = i + 1 + #s                  -- skip to location
        local cv = s + 0                -- numeric character value
        local cc = string.char(cv)
        -- prefer a named escape when one exists for this character
        local p = find("\a\b\f\n\r\t\v", cc, 1, true)
        if p then                       -- special escapes
          s = "\\"..sub("abfnrtv", p, p)
        elseif cv < 32 then             -- normalized \ddd (zap leading 0s)
          s = "\\"..cv
        elseif cc == delim then         -- \<delim>
          s = "\\"..cc
          c_delim = c_delim + 1
        elseif cc == "\\" then          -- \\
          s = "\\\\"
        else                            -- literal character
          s = cc
          if cc == ndelim then
            c_ndelim = c_ndelim + 1
          end
        end
        z = sub(z, 1, i - 1)..s..sub(z, j)
        i = i + #s
      ------------------------------------------------------------
      end--if p
    ----------------------------------------------------------------
    else-- c ~= "\\" -- <other> -- no change
      i = i + 1
      if c == ndelim then  -- count ndelim, for switching delimiters
        c_ndelim = c_ndelim + 1
      end
    ----------------------------------------------------------------
    end--if c
  end--while
  --------------------------------------------------------------------
  -- switching delimiters, a long-winded derivation:
  -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes
  -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes
  -- simplifying the condition (1)>(2) --> c_delim > c_ndelim
  if c_delim > c_ndelim then
    i = 1
    while i <= #z do
      local p, q, r = find(z, "([\'\"])", i)
      if not p then break end
      if r == delim then                -- \<delim> -> <delim>
        z = sub(z, 1, p - 2)..sub(z, p) -- drop the preceding backslash
        i = p
      else-- r == ndelim                -- <ndelim> -> \<ndelim>
        z = sub(z, 1, p - 1).."\\"..sub(z, p)
        i = p + 2
      end
    end--while
    delim = ndelim                      -- actually change delimiters
  end
  --------------------------------------------------------------------
  -- commit only if genuinely different; report when DETAILS is on
  z = delim..z..delim
  if z ~= sinfos[I] then
    if opt_details then
      print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z)
      opt_details = opt_details + 1
    end
    sinfos[I] = z
  end
end
|
439 |
|
440 ------------------------------------------------------------------------ |
|
441 -- long string optimization |
|
442 -- * note: warning flagged if trailing whitespace found, not trimmed |
|
443 -- * remove first optional newline |
|
444 -- * normalize embedded newlines |
|
445 -- * reduce '=' separators in delimiters if possible |
|
446 ------------------------------------------------------------------------ |
|
447 |
|
-- Optimizes a TK_LSTRING token in place (sinfos[I]): normalizes
-- embedded EOLs to LF and reduces '=' separators in the delimiters
-- when the body allows it. Trailing whitespace is NOT trimmed (it
-- would change the string's value); a warning flag is set instead.
local function do_lstring(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%[=*%[")         -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)            -- same length as delim1
  local z = sub(info, sep + 1, -(sep + 1))      -- lstring without delims
  local y = ""                                  -- rebuilt body
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line
    local ln
    if not p then
      ln = sub(z, i)                    -- last (or only) line
    elseif p >= i then
      ln = sub(z, i, p - 1)             -- line up to the EOL
    end
    if ln ~= "" then
      -- flag a warning if there are trailing spaces, won't optimize!
      if match(ln, "%s+$") then
        warn.lstring = "trailing whitespace in long string near line "..stoklns[I]
      end
      y = y..ln
    end
    if not p then  -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then  -- skip CRLF or LFCR
        i = i + 1
      end
      -- skip first newline, which can be safely deleted
      -- NOTE(review): after "i = p + 1" above, i can never equal p,
      -- so this condition is always true and the first newline is
      -- never actually skipped here -- confirm against upstream intent
      if not(i == 1 and i == p) then
        y = y.."\n"
      end
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators:
  -- try each shorter closing delimiter; it is usable only if the
  -- body never contains that "]=*]" pattern
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then  -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end
|
506 |
|
507 ------------------------------------------------------------------------ |
|
508 -- long comment optimization |
|
509 -- * note: does not remove first optional newline |
|
510 -- * trim trailing whitespace |
|
511 -- * normalize embedded newlines |
|
512 -- * reduce '=' separators in delimiters if possible |
|
513 ------------------------------------------------------------------------ |
|
514 |
|
-- Optimizes a TK_LCOMMENT token in place (sinfos[I]): trims trailing
-- whitespace per line, normalizes embedded EOLs to LF, and reduces '='
-- separators in the delimiters when the body allows it.
local function do_lcomment(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%-%-%[=*%[")     -- cut out delimiters
  local sep = #delim1
  -- FIX: the closing delimiter lacks the leading "--" of the opening
  -- one, so it is only (sep - 2) characters long; taking sep characters
  -- would pull two body characters into delim2 and duplicate them when
  -- the comment is reassembled below
  local delim2 = sub(info, -(sep - 2), -1)
  local z = sub(info, sep + 1, -(sep - 1))      -- comment without delims
  local y = ""                                  -- rebuilt body
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line, extract and check trailing whitespace
    local ln
    if not p then
      ln = sub(z, i)                    -- last (or only) line
    elseif p >= i then
      ln = sub(z, i, p - 1)             -- line up to the EOL
    end
    if ln ~= "" then
      -- trim trailing whitespace if non-empty line
      local ws = match(ln, "%s*$")
      -- FIX: use the run's length (#ws) for the negative index;
      -- arithmetic on the whitespace string itself ("ws + 1") raises
      -- a runtime error because it cannot be coerced to a number
      if #ws > 0 then ln = sub(ln, 1, -(#ws + 1)) end
      y = y..ln
    end
    if not p then  -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then  -- skip CRLF or LFCR
        i = i + 1
      end
      y = y.."\n"
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators; a shorter
  -- closing delimiter is usable only if the body never contains it
  sep = sep - 2  -- adjust for the "--" prefix of the opening delimiter
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then  -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "--["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end
|
570 |
|
571 ------------------------------------------------------------------------ |
|
572 -- short comment optimization |
|
573 -- * trim trailing whitespace |
|
574 ------------------------------------------------------------------------ |
|
575 |
|
-- Trims trailing whitespace from a short comment token in place
-- (sinfos[i]); used for shbang lines and when only whitespace
-- optimization is enabled.
local function do_comment(i)
  local info = sinfos[i]
  local ws = match(info, "%s*$")        -- whitespace run at end (may be "")
  if #ws > 0 then
    -- FIX: the negative end index must use the run's length (#ws);
    -- arithmetic on the whitespace string itself ("ws + 1") raises a
    -- runtime error because it cannot be coerced to a number
    info = sub(info, 1, -(#ws + 1))     -- trim trailing whitespace
  end
  sinfos[i] = info
end
|
584 |
|
585 ------------------------------------------------------------------------ |
|
586 -- returns true if string found in long comment |
|
587 -- * this is a feature to keep copyright or license texts |
|
588 ------------------------------------------------------------------------ |
|
589 |
|
-- Returns true if opt_keep (a plain substring, e.g. a copyright or
-- license marker) occurs in the body of the given long comment; this
-- decides whether the comment survives --opt-comments.
-- Returns false when the option is unset, nil when not found.
local function keep_lcomment(opt_keep, info)
  if not opt_keep then return false end           -- option not set
  local delim1 = match(info, "^%-%-%[=*%[")       -- cut out delimiter
  local sep = #delim1
  -- (removed unused 'delim2' local -- only the body is examined here)
  local z = sub(info, sep + 1, -(sep - 1))        -- comment without delims
  if find(z, opt_keep, 1, true) then              -- plain-text match
    return true
  end
end
|
600 |
|
601 ------------------------------------------------------------------------ |
|
602 -- main entry point |
|
603 -- * currently, lexer processing has 2 passes |
|
604 -- * processing is done on a line-oriented basis, which is easier to |
|
605 -- grok due to the next point... |
|
606 -- * since there are various options that can be enabled or disabled, |
|
607 -- processing is a little messy or convoluted |
|
608 ------------------------------------------------------------------------ |
|
609 |
|
-- Main entry point: optimizes the token stream produced by the lexer.
-- option: table of "opt-*" boolean flags plus KEEP/DETAILS
-- toklist, semlist, toklnlist: parallel lists of token types, token
--   text, and token line numbers; modified in place and also returned.
-- Pass 1 handles each token kind; pass 2 (only with opt-eols)
-- aggressively removes line endings between abuttable grammar tokens.
function optimize(option, toklist, semlist, toklnlist)
  --------------------------------------------------------------------
  -- set option flags
  --------------------------------------------------------------------
  local opt_comments = option["opt-comments"]
  local opt_whitespace = option["opt-whitespace"]
  local opt_emptylines = option["opt-emptylines"]
  local opt_eols = option["opt-eols"]
  local opt_strings = option["opt-strings"]
  local opt_numbers = option["opt-numbers"]
  local opt_keep = option.KEEP
  opt_details = option.DETAILS and 0  -- upvalues for details display
  print = print or base.print
  if opt_eols then  -- forced settings, otherwise won't work properly
    opt_comments = true
    opt_whitespace = true
    opt_emptylines = true
  end
  --------------------------------------------------------------------
  -- variable initialization
  --------------------------------------------------------------------
  stoks, sinfos, stoklns                -- set source lists
    = toklist, semlist, toklnlist
  local i = 1                           -- token position
  local tok, info                       -- current token
  local prev    -- position of last grammar token
                -- on same line (for TK_SPACE stuff)
  --------------------------------------------------------------------
  -- changes a token, info pair; with no arguments, marks the token
  -- at position I (default: current i) as deleted ("")
  --------------------------------------------------------------------
  local function settoken(tok, info, I)
    I = I or i
    stoks[I] = tok or ""
    sinfos[I] = info or ""
  end
  --------------------------------------------------------------------
  -- processing loop (PASS 1)
  --------------------------------------------------------------------
  while true do
    tok, info = stoks[i], sinfos[i]
    ----------------------------------------------------------------
    local atstart = atlinestart(i)      -- set line begin flag
    if atstart then prev = nil end
    ----------------------------------------------------------------
    if tok == "TK_EOS" then             -- end of stream/pass
      break
    ----------------------------------------------------------------
    elseif tok == "TK_KEYWORD" or       -- keywords, identifiers,
           tok == "TK_NAME" or          -- operators
           tok == "TK_OP" then
      -- TK_KEYWORD and TK_OP can't be optimized without a big
      -- optimization framework; it would be more of an optimizing
      -- compiler, not a source code compressor
      -- TK_NAME that are locals needs parser to analyze/optimize
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_NUMBER" then      -- numbers
      if opt_numbers then
        do_number(i)                    -- optimize
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_STRING" or        -- strings, long strings
           tok == "TK_LSTRING" then
      if opt_strings then
        if tok == "TK_STRING" then
          do_string(i)                  -- optimize
        else
          do_lstring(i)                 -- optimize
        end
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_COMMENT" then     -- short comments
      if opt_comments then
        if i == 1 and sub(info, 1, 1) == "#" then
          -- keep shbang comment, trim whitespace
          do_comment(i)
        else
          -- safe to delete, as a TK_EOL (or TK_EOS) always follows
          settoken()                    -- remove entirely
        end
      elseif opt_whitespace then        -- trim whitespace only
        do_comment(i)
      end
    ----------------------------------------------------------------
    elseif tok == "TK_LCOMMENT" then    -- long comments
      if keep_lcomment(opt_keep, info) then
        ------------------------------------------------------------
        -- if --keep, we keep a long comment if <msg> is found;
        -- this is a feature to keep copyright or license texts
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      elseif opt_comments then
        local eols = commenteols(info)
        ------------------------------------------------------------
        -- prepare opt_emptylines case first, if a disposable token
        -- follows, current one is safe to dump, else keep a space;
        -- it is implied that the operation is safe for '-', because
        -- current is a TK_LCOMMENT, and must be separate from a '-'
        if is_faketoken[stoks[i + 1]] then
          settoken()                    -- remove entirely
          tok = ""
        else
          settoken("TK_SPACE", " ")
        end
        ------------------------------------------------------------
        -- if there are embedded EOLs to keep and opt_emptylines is
        -- disabled, then switch the token into one or more EOLs
        if not opt_emptylines and eols > 0 then
          settoken("TK_EOL", rep("\n", eols))
        end
        ------------------------------------------------------------
        -- if optimizing whitespaces, force reinterpretation of the
        -- token to give a chance for the space to be optimized away
        if opt_whitespace and tok ~= "" then
          i = i - 1                     -- to reinterpret
        end
        ------------------------------------------------------------
      else                              -- disabled case
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      end
    ----------------------------------------------------------------
    elseif tok == "TK_EOL" then         -- line endings
      if atstart and opt_emptylines then
        settoken()                      -- remove entirely
      elseif info == "\r\n" or info == "\n\r" then
        -- normalize the rest of the EOLs for CRLF/LFCR only
        -- (note that TK_LCOMMENT can change into several EOLs)
        settoken("TK_EOL", "\n")
      end
    ----------------------------------------------------------------
    elseif tok == "TK_SPACE" then       -- whitespace
      if opt_whitespace then
        if atstart or atlineend(i) then
          -- delete leading and trailing whitespace
          settoken()                    -- remove entirely
        else
          ------------------------------------------------------------
          -- at this point, since leading whitespace have been removed,
          -- there should be a either a real token or a TK_LCOMMENT
          -- prior to hitting this whitespace; the TK_LCOMMENT case
          -- only happens if opt_comments is disabled; so prev ~= nil
          local ptok = stoks[prev]
          if ptok == "TK_LCOMMENT" then
            -- previous TK_LCOMMENT can abut with anything
            settoken()                  -- remove entirely
          else
            -- prev must be a grammar token; consecutive TK_SPACE
            -- tokens is impossible when optimizing whitespace
            local ntok = stoks[i + 1]
            if is_faketoken[ntok] then
              -- handle special case where a '-' cannot abut with
              -- either a short comment or a long comment
              if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and
                 ptok == "TK_OP" and sinfos[prev] == "-" then
                -- keep token
              else
                settoken()              -- remove entirely
              end
            else--is_realtoken
              -- check a pair of grammar tokens, if can abut, then
              -- delete space token entirely, otherwise keep one space
              local s = checkpair(prev, i + 1)
              if s == "" then
                settoken()              -- remove entirely
              else
                settoken("TK_SPACE", " ")
              end
            end
          end
          ------------------------------------------------------------
        end
      end
    ----------------------------------------------------------------
    else
      error("unidentified token encountered")
    end
    ----------------------------------------------------------------
    i = i + 1
  end--while
  repack_tokens()
  --------------------------------------------------------------------
  -- processing loop (PASS 2)
  --------------------------------------------------------------------
  if opt_eols then
    i = 1
    -- aggressive EOL removal only works with most non-grammar tokens
    -- optimized away because it is a rather simple scheme -- basically
    -- it just checks 'real' token pairs around EOLs
    if stoks[1] == "TK_COMMENT" then
      -- first comment still existing must be shbang, skip whole line
      i = 3
    end
    while true do
      tok, info = stoks[i], sinfos[i]
      --------------------------------------------------------------
      if tok == "TK_EOS" then           -- end of stream/pass
        break
      --------------------------------------------------------------
      elseif tok == "TK_EOL" then       -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then  -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" then
            settoken()                  -- remove entirely
          end
        end
      end--if tok
      --------------------------------------------------------------
      i = i + 1
    end--while
    repack_tokens()
  end
  --------------------------------------------------------------------
  if opt_details and opt_details > 0 then print() end  -- spacing
  return stoks, sinfos, stoklns
end