uglify: New specialised Lua 'compression' filter

author     Matthew Wild <mwild1@gmail.com>
date       Sat, 25 Jul 2009 16:44:47 +0100
changeset  8:f62f83d9dc43
parent     7:0db12f8037f7
child      9:875ff34ab96c

files      squishy
           uglify/llex.lua
           uglify/squish.uglify.lua
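
For context: the "compression" this filter performs is a keyword-packing trick. At squish time every Lua keyword token is replaced with a single unused byte at or above a chosen base_char, and the output file is prefixed with a small prettify() function that maps those bytes back to keywords before the code is handed to loadstring(). A minimal sketch of the mapping and its inverse, assuming base_char stays at 128 (illustrative only, not the committed code):

    -- keyword -> single byte, and the inverse used by the emitted prettify() stub
    local base_char = 128
    local keywords = { "and", "break", "do", "else", "elseif", "end", "false",
                       "for", "function", "if", "in", "local", "nil", "not",
                       "or", "repeat", "return", "then", "true", "until", "while" }

    local keyword_to_char = {}
    for i, kw in ipairs(keywords) do
      keyword_to_char[kw] = string.char(base_char + i)   -- one byte per keyword
    end

    local function prettify(code)
      -- expand every packed byte back into the keyword it stands for
      return (code:gsub("["..string.char(base_char + 1).."-"
                           ..string.char(base_char + #keywords).."]",
                        function (c) return keywords[c:byte() - base_char] end))
    end

    assert(prettify(keyword_to_char["function"].." f() "..keyword_to_char["end"])
           == "function f() end")
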
--- a/squishy	Sat Jul 25 16:43:52 2009 +0100
+++ b/squishy	Sat Jul 25 16:44:47 2009 +0100
@@ -19,3 +19,10 @@
 
 	Main "minify/squish.minify.lua"
 end
+
+-- Compress Lua scripts (an excellent hack :) )
+if opts.with_uglify then
+	Module "llex"		"uglify/llex.lua"
+	
+	Main "uglify/squish.uglify.lua"
+end
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/uglify/llex.lua	Sat Jul 25 16:44:47 2009 +0100
@@ -0,0 +1,355 @@
+--[[--------------------------------------------------------------------
+
+  llex.lua: Lua 5.1 lexical analyzer in Lua
+  This file is part of LuaSrcDiet, based on Yueliang material.
+
+  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
+  The COPYRIGHT file describes the conditions
+  under which this software may be distributed.
+
+  See the ChangeLog for more information.
+
+----------------------------------------------------------------------]]
+
+--[[--------------------------------------------------------------------
+-- NOTES:
+-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
+--   with significant modifications to handle LuaSrcDiet's needs:
+--   (1) llex.error is an optional error function handler
+--   (2) seminfo for strings include their delimiters and no
+--       translation operations are performed on them
+-- * ADDED shbang handling to support executable scripts
+-- * NO localized decimal point replacement magic
+-- * NO limit to number of lines
+-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
+-- * Please read technotes.txt for more technical details.
+----------------------------------------------------------------------]]
+
+local base = _G
+local string = require "string"
+module "llex"
+
+local find = string.find
+local match = string.match
+local sub = string.sub
+
+----------------------------------------------------------------------
+-- initialize keyword list, variables
+----------------------------------------------------------------------
+
+local kw = {}
+for v in string.gmatch([[
+and break do else elseif end false for function if in
+local nil not or repeat return then true until while]], "%S+") do
+  kw[v] = true
+end
+
+-- NOTE: see init() for module variables (externally visible):
+--       tok, seminfo, tokln
+
+local z,                -- source stream
+      sourceid,         -- name of source
+      I,                -- position of lexer
+      buff,             -- buffer for strings
+      ln                -- line number
+
+----------------------------------------------------------------------
+-- add information to token listing
+----------------------------------------------------------------------
+
+local function addtoken(token, info)
+  local i = #tok + 1
+  tok[i] = token
+  seminfo[i] = info
+  tokln[i] = ln
+end
+
+----------------------------------------------------------------------
+-- handles line number incrementation and end-of-line characters
+----------------------------------------------------------------------
+
+local function inclinenumber(i, is_tok)
+  local sub = sub
+  local old = sub(z, i, i)
+  i = i + 1  -- skip '\n' or '\r'
+  local c = sub(z, i, i)
+  if (c == "\n" or c == "\r") and (c ~= old) then
+    i = i + 1  -- skip '\n\r' or '\r\n'
+    old = old..c
+  end
+  if is_tok then addtoken("TK_EOL", old) end
+  ln = ln + 1
+  I = i
+  return i
+end
+
+----------------------------------------------------------------------
+-- initialize lexer for given source _z and source name _sourceid
+----------------------------------------------------------------------
+
+function init(_z, _sourceid)
+  z = _z                        -- source
+  sourceid = _sourceid          -- name of source
+  I = 1                         -- lexer's position in source
+  ln = 1                        -- line number
+  tok = {}                      -- lexed token list*
+  seminfo = {}                  -- lexed semantic information list*
+  tokln = {}                    -- line numbers for messages*
+                                -- (*) externally visible thru' module
+  --------------------------------------------------------------------
+  -- initial processing (shbang handling)
+  --------------------------------------------------------------------
+  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
+  if p then                             -- skip first line
+    I = I + #q
+    addtoken("TK_COMMENT", q)
+    if #r > 0 then inclinenumber(I, true) end
+  end
+end
+
+----------------------------------------------------------------------
+-- returns a chunk name or id, no truncation for long names
+----------------------------------------------------------------------
+
+function chunkid()
+  if sourceid and match(sourceid, "^[=@]") then
+    return sub(sourceid, 2)  -- remove first char
+  end
+  return "[string]"
+end
+
+----------------------------------------------------------------------
+-- formats error message and throws error
+-- * a simplified version, does not report what token was responsible
+----------------------------------------------------------------------
+
+function errorline(s, line)
+  local e = error or base.error
+  e(string.format("%s:%d: %s", chunkid(), line or ln, s))
+end
+local errorline = errorline
+
+------------------------------------------------------------------------
+-- count separators ("=") in a long string delimiter
+------------------------------------------------------------------------
+
+local function skip_sep(i)
+  local sub = sub
+  local s = sub(z, i, i)
+  i = i + 1
+  local count = #match(z, "=*", i)  -- note, take the length
+  i = i + count
+  I = i
+  return (sub(z, i, i) == s) and count or (-count) - 1
+end
+
+----------------------------------------------------------------------
+-- reads a long string or long comment
+----------------------------------------------------------------------
+
+local function read_long_string(is_str, sep)
+  local i = I + 1  -- skip 2nd '['
+  local sub = sub
+  local c = sub(z, i, i)
+  if c == "\r" or c == "\n" then  -- string starts with a newline?
+    i = inclinenumber(i)  -- skip it
+  end
+  local j = i
+  while true do
+    local p, q, r = find(z, "([\r\n%]])", i) -- (long range)
+    if not p then
+      errorline(is_str and "unfinished long string" or
+                "unfinished long comment")
+    end
+    i = p
+    if r == "]" then                    -- delimiter test
+      if skip_sep(i) == sep then
+        buff = sub(z, buff, I)
+        I = I + 1  -- skip 2nd ']'
+        return buff
+      end
+      i = I
+    else                                -- newline
+      buff = buff.."\n"
+      i = inclinenumber(i)
+    end
+  end--while
+end
+
+----------------------------------------------------------------------
+-- reads a string
+----------------------------------------------------------------------
+
+local function read_string(del)
+  local i = I
+  local find = find
+  local sub = sub
+  while true do
+    local p, q, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
+    if p then
+      if r == "\n" or r == "\r" then
+        errorline("unfinished string")
+      end
+      i = p
+      if r == "\\" then                         -- handle escapes
+        i = i + 1
+        r = sub(z, i, i)
+        if r == "" then break end -- (EOZ error)
+        p = find("abfnrtv\n\r", r, 1, true)
+        ------------------------------------------------------
+        if p then                               -- special escapes
+          if p > 7 then
+            i = inclinenumber(i)
+          else
+            i = i + 1
+          end
+        ------------------------------------------------------
+        elseif find(r, "%D") then               -- other non-digits
+          i = i + 1
+        ------------------------------------------------------
+        else                                    -- \xxx sequence
+          local p, q, s = find(z, "^(%d%d?%d?)", i)
+          i = q + 1
+          if s + 1 > 256 then -- UCHAR_MAX
+            errorline("escape sequence too large")
+          end
+        ------------------------------------------------------
+        end--if p
+      else
+        i = i + 1
+        if r == del then                        -- ending delimiter
+          I = i
+          return sub(z, buff, i - 1)            -- return string
+        end
+      end--if r
+    else
+      break -- (error)
+    end--if p
+  end--while
+  errorline("unfinished string")
+end
+
+------------------------------------------------------------------------
+-- main lexer function
+------------------------------------------------------------------------
+
+function llex()
+  local find = find
+  local match = match
+  while true do--outer
+    local i = I
+    -- inner loop allows break to be used to nicely section tests
+    while true do--inner
+      ----------------------------------------------------------------
+      local p, _, r = find(z, "^([_%a][_%w]*)", i)
+      if p then
+        I = i + #r
+        if kw[r] then
+          addtoken("TK_KEYWORD", r)             -- reserved word (keyword)
+        else
+          addtoken("TK_NAME", r)                -- identifier
+        end
+        break -- (continue)
+      end
+      ----------------------------------------------------------------
+      local p, _, r = find(z, "^(%.?)%d", i)
+      if p then                                 -- numeral
+        if r == "." then i = i + 1 end
+        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
+        i = q + 1
+        if #r == 1 then                         -- optional exponent
+          if match(z, "^[%+%-]", i) then        -- optional sign
+            i = i + 1
+          end
+        end
+        local _, q = find(z, "^[_%w]*", i)
+        I = q + 1
+        local v = sub(z, p, q)                  -- string equivalent
+        if not base.tonumber(v) then            -- handles hex test also
+          errorline("malformed number")
+        end
+        addtoken("TK_NUMBER", v)
+        break -- (continue)
+      end
+      ----------------------------------------------------------------
+      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
+      if p then
+        if t == "\n" or t == "\r" then          -- newline
+          inclinenumber(i, true)
+        else
+          I = q + 1                             -- whitespace
+          addtoken("TK_SPACE", r)
+        end
+        break -- (continue)
+      end
+      ----------------------------------------------------------------
+      local r = match(z, "^%p", i)
+      if r then
+        buff = i
+        local p = find("-[\"\'.=<>~", r, 1, true)
+        if p then
+          -- two-level if block for punctuation/symbols
+          --------------------------------------------------------
+          if p <= 2 then
+            if p == 1 then                      -- minus
+              local c = match(z, "^%-%-(%[?)", i)
+              if c then
+                i = i + 2
+                local sep = -1
+                if c == "[" then
+                  sep = skip_sep(i)
+                end
+                if sep >= 0 then                -- long comment
+                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
+                else                            -- short comment
+                  I = find(z, "[\n\r]", i) or (#z + 1)
+                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
+                end
+                break -- (continue)
+              end
+              -- (fall through for "-")
+            else                                -- [ or long string
+              local sep = skip_sep(i)
+              if sep >= 0 then
+                addtoken("TK_LSTRING", read_long_string(true, sep))
+              elseif sep == -1 then
+                addtoken("TK_OP", "[")
+              else
+                errorline("invalid long string delimiter")
+              end
+              break -- (continue)
+            end
+          --------------------------------------------------------
+          elseif p <= 5 then
+            if p < 5 then                       -- strings
+              I = i + 1
+              addtoken("TK_STRING", read_string(r))
+              break -- (continue)
+            end
+            r = match(z, "^%.%.?%.?", i)        -- .|..|... dots
+            -- (fall through)
+          --------------------------------------------------------
+          else                                  -- relational
+            r = match(z, "^%p=?", i)
+            -- (fall through)
+          end
+        end
+        I = i + #r
+        addtoken("TK_OP", r)  -- for other symbols, fall through
+        break -- (continue)
+      end
+      ----------------------------------------------------------------
+      local r = sub(z, i, i)
+      if r ~= "" then
+        I = i + 1
+        addtoken("TK_OP", r)                    -- other single-char tokens
+        break
+      end
+      addtoken("TK_EOS", "")                    -- end of stream,
+      return                                    -- exit here
+      ----------------------------------------------------------------
+    end--while inner
+  end--while outer
+end
+
+return base.getfenv()
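
A note on using the lexer: init() publishes its results through three parallel module-level lists, tok (token types), seminfo (the exact source text of each token, delimiters included) and tokln (starting line numbers), so concatenating seminfo reproduces the input verbatim. Typical driving code, mirroring what squish.uglify.lua below does, looks roughly like this (illustrative sketch):

    local llex = require "llex"

    local code = "local x = 6 * 7 -- a comment\nreturn x"
    llex.init(code, "@example.lua")
    llex.llex()

    for i, t in ipairs(llex.tok) do
      -- one entry per token: starting line, type, and raw source text
      io.write(string.format("%3d  %-12s %q\n", llex.tokln[i], t, llex.seminfo[i]))
    end
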
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/uglify/squish.uglify.lua	Sat Jul 25 16:44:47 2009 +0100
@@ -0,0 +1,94 @@
+local llex = require "llex"
+
+local base_char = 128;
+local keywords = { "and", "break", "do", "else", "elseif",
+    "end", "false", "for", "function", "if",
+        "in", "local", "nil", "not", "or", "repeat",
+            "return", "then", "true", "until", "while" }
+
+function uglify_file(infile_fn, outfile_fn)
+	local infile, err = io.open(infile_fn);
+	if not infile then
+		print_err("Can't open input file for reading: "..tostring(err));
+		return;
+	end
+	
+	local outfile, err = io.open(outfile_fn..".uglified", "w+");
+	if not outfile then
+		print_err("Can't open output file for writing: "..tostring(err));
+		return;
+	end
+	
+	local data = infile:read("*a");
+	infile:close();
+	
+	local shebang, newdata = data:match("^(#.-\n)(.+)$");
+	local code = newdata or data;
+	if shebang then
+		outfile:write(shebang)
+	end
+
+	
+	while base_char + #keywords < 255 and code:find("["..string.char(base_char).."-"..string.char(base_char+#keywords-1).."]") do
+		base_char = base_char + 1;
+	end
+	if base_char == 255 then
+		-- Sorry, can't uglify this file :(
+		-- We /could/ use a multi-byte marker, but that would complicate
+		-- things and lower the compression ratio (there are quite a few 
+		-- 2-letter keywords)
+		outfile:write(code);
+		outfile:close();
+		os.rename(outfile_fn..".uglified", outfile_fn);
+		return;
+	end
+
+	local keyword_map_to_char = {}
+	for i, keyword in ipairs(keywords) do
+		keyword_map_to_char[keyword] = string.char(base_char + i);
+	end
+	
+	outfile:write("local base_char,keywords=", tostring(base_char), ",{");
+	for _, keyword in ipairs(keywords) do
+		outfile:write('"', keyword, '",');
+	end
+	outfile:write[[}; function prettify(code) return code:gsub("["..string.char(base_char).."-"..string.char(base_char+#keywords).."]", 
+	function (c) return keywords[c:byte()-base_char]; end) end ]]
+	
+	-- Write loadstring and open string
+	local maxequals = 0;
+	data:gsub("(=+)", function (equals_string) maxequals = math.max(maxequals, #equals_string); end);
+	
+	outfile:write [[assert(loadstring(prettify]]
+	outfile:write("[", string.rep("=", maxequals+1), "[");
+	
+	-- Write code, substituting tokens as we go
+	llex.init(code, "@"..infile_fn);
+	llex.llex()
+	local seminfo = llex.seminfo;
+	for k,v in ipairs(llex.tok) do
+		if v == "TK_KEYWORD" then
+			local keyword_char = keyword_map_to_char[seminfo[k]];
+			if keyword_char then
+				outfile:write(keyword_char);
+		else -- Those who think Lua shouldn't have 'continue', fix this please :)
+				outfile:write(seminfo[k]);
+			end
+		else
+			outfile:write(seminfo[k]);
+		end
+	end
+
+	-- Close string/functions	
+	outfile:write("]", string.rep("=", maxequals+1), "]");
+	outfile:write("))()");
+	outfile:close();
+	os.rename(outfile_fn..".uglified", outfile_fn);
+end
+
+if opts.uglify then
+	print_info("Uglifying "..out_fn.."...");
+	uglify_file(out_fn, out_fn);
+	print_info("OK!");
+end
+
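
The emitted file therefore always has the same shape: an optional shebang, a prelude defining base_char, the keyword table and prettify(), and then the byte-packed source wrapped in a long string (opened with one more "=" than any run found in the input, so the payload cannot close it early) that is expanded and compiled at run time. A hand-built miniature of that output, assuming base_char stays at 128 (illustrative only; uglify_file() above produces the real thing):

    -- the prelude that the filter writes ahead of the packed body
    local prelude = [=[
    local base_char,keywords=128,{"and","break","do","else","elseif","end",
    "false","for","function","if","in","local","nil","not","or","repeat",
    "return","then","true","until","while",};
    function prettify(code) return code:gsub("["..string.char(base_char).."-"..string.char(base_char+#keywords).."]",
    function (c) return keywords[c:byte()-base_char]; end) end
    ]=]

    -- byte 140 stands for "local" (the 12th keyword)
    local body = string.char(128 + 12).." x = 6 * 7 print(x)"
    local uglified = prelude.."assert(loadstring(prettify[==["..body.."]==]))()"

    loadstring(uglified)()   -- prints 42 once prettify() has restored the keyword
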
