|
1 --[[-------------------------------------------------------------------- |
|
2 |
|
3 llex.lua: Lua 5.1 lexical analyzer in Lua |
|
4 This file is part of LuaSrcDiet, based on Yueliang material. |
|
5 |
|
6 Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net> |
|
7 The COPYRIGHT file describes the conditions |
|
8 under which this software may be distributed. |
|
9 |
|
10 See the ChangeLog for more information. |
|
11 |
|
12 ----------------------------------------------------------------------]] |
|
13 |
|
14 --[[-------------------------------------------------------------------- |
|
15 -- NOTES: |
|
16 -- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0, |
|
17 -- with significant modifications to handle LuaSrcDiet's needs: |
|
18 -- (1) llex.error is an optional error function handler |
|
--       (2) seminfo for strings includes their delimiters and no
--           translation operations are performed on them
|
-- * ADDED shbang handling to support executable scripts
|
22 -- * NO localized decimal point replacement magic |
|
23 -- * NO limit to number of lines |
|
24 -- * NO support for compatible long strings (LUA_COMPAT_LSTR) |
|
25 -- * Please read technotes.txt for more technical details. |
|
26 ----------------------------------------------------------------------]] |
|
27 |
|
28 local base = _G |
|
29 local string = require "string" |
|
30 module "llex" |
|
31 |
|
32 local find = string.find |
|
33 local match = string.match |
|
34 local sub = string.sub |
|
35 |
|
36 ---------------------------------------------------------------------- |
|
37 -- initialize keyword list, variables |
|
38 ---------------------------------------------------------------------- |
|
39 |
|
-- Reserved-word lookup set: kw[word] == true for each of the 21
-- Lua 5.1 keywords, so the main lexer can classify an identifier
-- with a single table index.
local kw = {
  ["and"]      = true, ["break"]  = true, ["do"]    = true,
  ["else"]     = true, ["elseif"] = true, ["end"]   = true,
  ["false"]    = true, ["for"]    = true, ["function"] = true,
  ["if"]       = true, ["in"]     = true, ["local"] = true,
  ["nil"]      = true, ["not"]    = true, ["or"]    = true,
  ["repeat"]   = true, ["return"] = true, ["then"]  = true,
  ["true"]     = true, ["until"]  = true, ["while"] = true,
}
|
46 |
|
47 -- NOTE: see init() for module variables (externally visible): |
|
48 -- tok, seminfo, tokln |
|
49 |
|
-- File-local lexer state shared by every routine below; z and
-- sourceid are set once per init() call, the rest mutate as llex()
-- advances through the source.
local z,          -- source stream (full source text)
      sourceid,   -- name of source, used by chunkid()/errorline()
      I,          -- position of lexer (current read index into z)
      buff,       -- buffer for strings: llex() stores the token's
                  -- start index here; read_long_string() briefly
                  -- replaces it with the extracted token text
      ln          -- line number (1-based, maintained by inclinenumber)
|
55 |
|
56 ---------------------------------------------------------------------- |
|
57 -- add information to token listing |
|
58 ---------------------------------------------------------------------- |
|
59 |
|
-- Appends one lexed token to the three parallel, externally visible
-- result lists: its kind (token), its semantic text (info), and the
-- line number it was found on.
local function addtoken(token, info)
  local n = #tok + 1
  tok[n], seminfo[n], tokln[n] = token, info, ln
end
|
66 |
|
67 ---------------------------------------------------------------------- |
|
68 -- handles line number incrementation and end-of-line characters |
|
69 ---------------------------------------------------------------------- |
|
70 |
|
-- Consumes one end-of-line sequence starting at position i ("\n",
-- "\r", "\r\n" or "\n\r" -- a mixed CR/LF pair counts as a single
-- line break), bumps the line counter, and, when is_tok is set,
-- records a TK_EOL token carrying the exact newline characters seen.
-- Returns (and stores in I) the position just past the line end.
local function inclinenumber(i, is_tok)
  local sub = sub
  local eol = sub(z, i, i)        -- first newline char: '\n' or '\r'
  i = i + 1
  local nxt = sub(z, i, i)
  if (nxt == "\n" or nxt == "\r") and nxt ~= eol then
    i = i + 1                     -- swallow the paired CR/LF
    eol = eol..nxt
  end
  if is_tok then addtoken("TK_EOL", eol) end
  ln = ln + 1
  I = i
  return i
end
|
85 |
|
86 ---------------------------------------------------------------------- |
|
87 -- initialize lexer for given source _z and source name _sourceid |
|
88 ---------------------------------------------------------------------- |
|
89 |
|
-- Prepares the lexer for a fresh run over source _z (named
-- _sourceid for error reporting).  Resets the position and line
-- counters and the three externally visible result lists.  If the
-- source opens with a shbang line ("#..."), that line is consumed
-- here as a TK_COMMENT token so executable scripts lex cleanly.
function init(_z, _sourceid)
  z = _z                        -- source text
  sourceid = _sourceid          -- name of source
  I = 1                         -- lexer's position in source
  ln = 1                        -- line number
  tok = {}                      -- lexed token list*
  seminfo = {}                  -- lexed semantic information list*
  tokln = {}                    -- line numbers for messages*
  -- (*) externally visible thru' module
  --------------------------------------------------------------------
  -- initial processing (shbang handling)
  --------------------------------------------------------------------
  local shbang, eol = match(z, "^(#[^\r\n]*)(\r?\n?)")
  if shbang then                -- skip first line
    I = I + #shbang
    addtoken("TK_COMMENT", shbang)
    if #eol > 0 then inclinenumber(I, true) end
  end
end
|
109 |
|
110 ---------------------------------------------------------------------- |
|
111 -- returns a chunk name or id, no truncation for long names |
|
112 ---------------------------------------------------------------------- |
|
113 |
|
-- Returns a human-readable name for the current chunk: sources
-- named "@file" or "=file" yield "file" (no length truncation);
-- any other (or missing) sourceid is reported as "[string]".
function chunkid()
  local id = sourceid
  if not (id and match(id, "^[=@]")) then
    return "[string]"
  end
  return sub(id, 2)             -- strip the '@'/'=' prefix
end
|
120 |
|
121 ---------------------------------------------------------------------- |
|
122 -- formats error message and throws error |
|
123 -- * a simplified version, does not report what token was responsible |
|
124 ---------------------------------------------------------------------- |
|
125 |
|
-- Raises a lexer error of the form "chunkname:line: message".
-- `line` defaults to the current line number.  Dispatches through
-- the module-level `error` slot when a caller has installed a
-- custom handler there, falling back to the standard error().
function errorline(s, line)
  local handler = error or base.error
  handler(string.format("%s:%d: %s", chunkid(), line or ln, s))
end
local errorline = errorline
|
131 |
|
132 ------------------------------------------------------------------------ |
|
133 -- count separators ("=") in a long string delimiter |
|
134 ------------------------------------------------------------------------ |
|
135 |
|
-- Counts the '='s in a long-bracket delimiter whose first character
-- (a '[' or ']') sits at position i, advancing I past the '=' run.
-- Returns the separator count when the run is closed by a matching
-- bracket character; otherwise a negative encoding: -1 for a lone
-- bracket, -(count)-1 when '='s were seen but left unmatched.
local function skip_sep(i)
  local bracket = sub(z, i, i)        -- the opening '[' or ']'
  local eqs = match(z, "=*", i + 1)   -- run of '=' separators
  i = i + 1 + #eqs
  I = i
  if sub(z, i, i) == bracket then
    return #eqs
  end
  return -#eqs - 1
end
|
145 |
|
146 ---------------------------------------------------------------------- |
|
147 -- reads a long string or long comment |
|
148 ---------------------------------------------------------------------- |
|
149 |
|
-- Reads the body of a long string or long comment whose opening
-- delimiter (with `sep` '='s) has already been consumed; the
-- file-local `buff` holds the index of the token's first character.
-- Returns the complete token text, both delimiters included (no
-- translation is performed).  Raises via errorline() when the stream
-- ends before a matching closing bracket.
--
-- Fixes vs. previous revision: removed the dead local `j`, and
-- removed `buff = buff.."\n"` in the newline branch -- buff holds a
-- numeric start index there, and concatenating onto it only worked
-- because string.sub coerces a string like "12\n" back to 12.
local function read_long_string(is_str, sep)
  local i = I + 1                 -- skip 2nd '['
  local sub = sub
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)          -- skip it
  end
  while true do
    local p, _, r = find(z, "([\r\n%]])", i) -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then                    -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)          -- start index -> token text
        I = I + 1                       -- skip 2nd ']'
        return buff
      end
      i = I                             -- ']' was not ours; resume
    else                                -- newline: count it, move on
      i = inclinenumber(i)
    end
  end--while
end
|
178 |
|
179 ---------------------------------------------------------------------- |
|
180 -- reads a string |
|
181 ---------------------------------------------------------------------- |
|
182 |
|
local function read_string(del)
  -- Reads a single- or double-quoted string whose opening delimiter
  -- `del` has already been consumed; the file-local `buff` holds the
  -- index of that opening quote.  Escape sequences are validated but
  -- NOT translated: the raw source text, both quotes included, is
  -- returned.  Raises via errorline() on an unescaped newline, an
  -- oversized \ddd escape, or end of stream.
  local i = I
  local find = find
  local sub = sub
  while true do
    -- jump straight to the next interesting character: newline,
    -- backslash, or either kind of quote
    local p, q, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then -- handle escapes
        i = i + 1
        r = sub(z, i, i)                -- the escaped character
        if r == "" then break end -- (EOZ error)
        -- positions 1-7 are single-char escapes (\a..\v); positions
        -- 8-9 are literal newline chars, i.e. an escaped line break
        p = find("abfnrtv\n\r", r, 1, true)
        ------------------------------------------------------
        if p then -- special escapes
          if p > 7 then
            i = inclinenumber(i)        -- escaped real newline
          else
            i = i + 1
          end
        ------------------------------------------------------
        elseif find(r, "%D") then -- other non-digits
          i = i + 1
        ------------------------------------------------------
        else -- \xxx sequence
          local p, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          -- s is a 1-3 digit string, coerced to a number by the
          -- comparison; s + 1 > 256 mirrors C's c > UCHAR_MAX
          if s + 1 > 256 then -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        ------------------------------------------------------
        end--if p
      else
        i = i + 1
        if r == del then -- ending delimiter
          I = i
          return sub(z, buff, i - 1) -- return string
        end
        -- the other quote kind is ordinary content; keep scanning
      end--if r
    else
      break -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end
|
231 |
|
232 ------------------------------------------------------------------------ |
|
233 -- main lexer function |
|
234 ------------------------------------------------------------------------ |
|
235 |
|
function llex()
  -- Main lexer loop: tokenizes all of z (set by init()) into the
  -- tok/seminfo/tokln lists, finishing with a TK_EOS entry.  Unlike
  -- a conventional lexer, whitespace, newlines and comments are kept
  -- as tokens (TK_SPACE/TK_EOL/TK_COMMENT/TK_LCOMMENT) so the
  -- original source can be reconstructed exactly from seminfo.
  local find = find
  local match = match
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do--inner
      ----------------------------------------------------------------
      -- identifier or reserved word: [_%a][_%w]*
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r) -- reserved word (keyword)
        else
          addtoken("TK_NAME", r) -- identifier
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      -- number: optional leading dot must be followed by a digit;
      -- actual validation is delegated to tonumber() below
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
        i = q + 1
        if #r == 1 then -- optional exponent
          if match(z, "^[%+%-]", i) then -- optional sign
            i = i + 1
          end
        end
        -- sweep up any trailing alphanumerics too (covers hex digits
        -- after "0x" as well as malformed suffixes); tonumber decides
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q) -- string equivalent
        if not base.tonumber(v) then -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      ----------------------------------------------------------------
      -- run of non-newline whitespace, or a single newline
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then -- newline
          inclinenumber(i, true)
        else
          I = q + 1 -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      -- punctuation: comments, strings, brackets, operators
      local r = match(z, "^%p", i)
      if r then
        buff = i -- remember token start for sub() extraction later
        -- p indexes into "-[\"'.=<>~": 1='-', 2='[', 3/4=quotes,
        -- 5='.', 6-9 = chars that may take a trailing '='
        local p = find("-[\"\'.=<>~", r, 1, true)
        if p then
          -- two-level if block for punctuation/symbols
          --------------------------------------------------------
          if p <= 2 then
            if p == 1 then -- minus
              -- c is "" for a plain "--" comment, "[" if a long
              -- bracket may follow, nil if not a comment at all
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          --------------------------------------------------------
          elseif p <= 5 then
            if p < 5 then -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i) -- .|..|... dots
            -- (fall through)
          --------------------------------------------------------
          else -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r) -- for other symbols, fall through
        break -- (continue)
      end
      ----------------------------------------------------------------
      -- anything else: a lone non-punct char, or end of stream
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r) -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "") -- end of stream,
      return -- exit here
      ----------------------------------------------------------------
    end--while inner
  end--while outer
end
|
354 |
|
-- module("llex") installed the module table as this chunk's
-- environment; return it so `require "llex"` yields the table
-- directly as well as registering it globally
return base.getfenv()