New implementation of lxp.lom by Tomás Guisasola

Fri, 23 Apr 2021 21:15:05 +0100

author
Matthew Wild <mwild1@gmail.com>
date
Fri, 23 Apr 2021 21:15:05 +0100
changeset 36
4a61f00ee916
parent 35
d2d0bc06eac2
child 37
233463804681

New implementation of lxp.lom by Tomás Guisasola

src/lxp/lom.lua file | annotate | diff | comparison | revisions
tests/test-lom.lua file | annotate | diff | comparison | revisions
--- a/src/lxp/lom.lua	Fri Apr 23 21:03:34 2021 +0100
+++ b/src/lxp/lom.lua	Fri Apr 23 21:15:05 2021 +0100
@@ -1,12 +1,13 @@
 -- See Copyright Notice in license.html
--- $Id: lom.lua,v 1.6 2005/06/09 19:18:40 tuler Exp $
 
 local lxp = require "lxp"
 
+local table = require"table"
 local tinsert, tremove = table.insert, table.remove
-local assert, type, print = assert, type, print
+local assert, pairs, type = assert, pairs, type
 
 
+-- auxiliary functions -------------------------------------------------------
 local function starttag (p, tag, attr)
   local stack = p:getcallbacks().stack
   local newelement = {tag = tag, attr = attr}
@@ -32,28 +33,76 @@
   end
 end
 
+-- main function -------------------------------------------------------------
 local function parse (o)
-  local c = { StartElement = starttag,
-              EndElement = endtag,
-              CharacterData = text,
-              _nonstrict = true,
-              stack = {{}}
-            }
-  local p = lxp.new(c)
-  local status, err
-  if type(o) == "string" then
-    status, err = p:parse(o)
-    if not status then return nil, err end
-  else
-    for l in pairs(o) do
-      status, err = p:parse(l)
-      if not status then return nil, err end
-    end
-  end
-  status, err = p:parse()
-  if not status then return nil, err end
-  p:close()
-  return c.stack[1][1]
+	local c = { StartElement = starttag,
+		EndElement = endtag,
+		CharacterData = text,
+		_nonstrict = true,
+		stack = {{}}
+	}
+	local p = lxp.new(c)
+	local to = type(o)
+	if to == "string" then
+		local status, err, line, col, pos = p:parse(o)
+		if not status then return nil, err, line, col, pos end
+	else
+		local iter, state, init
+		if to == "table" then
+			iter, state, init = pairs(o)
+		elseif to == "function" then
+			iter = o
+		elseif to == "userdata" and o.read then
+			iter, state = o.read, o
+		else
+			error ("Bad argument #1 to parse: expected a string, a table, a function or a file, but got "..to, 2)
+		end
+		for l in iter, state, init do
+			local status, err, line, col, pos = p:parse(l)
+			if not status then return nil, err, line, col, pos end
+		end
+	end
+	local status, err, line, col, pos = p:parse() -- close document
+	if not status then return nil, err, line, col, pos end
+	p:close()
+	return c.stack[1][1]
 end
 
-return { parse = parse }
+-- utility functions ---------------------------------------------------------
+local function find_elem (self, tag)
+	if self.tag == tag then
+		return self
+	end
+	for i = 1, #self do
+		local v = self[i]
+		if type(v) == "table" then
+			local found = find_elem (v, tag)
+			if found then
+				return found
+			end
+		end
+	end
+	return nil
+end
+
+local function list_children (self, tag)
+	local i = 0
+	return function ()
+		i = i+1
+		local v = self[i]
+		while v do
+			if type (v) == "table" and (tag == nil or tag == v.tag) then
+				return v
+			end
+			i = i+1
+			v = self[i]
+		end
+		return nil
+	end
+end
+
+return {
+	find_elem = find_elem,
+	list_children = list_children,
+	parse = parse,
+}
--- a/tests/test-lom.lua	Fri Apr 23 21:03:34 2021 +0100
+++ b/tests/test-lom.lua	Fri Apr 23 21:15:05 2021 +0100
@@ -2,49 +2,158 @@
 
 local lom = require "lxp.lom"
 
+local u_acute_utf8 = string.char(195)..string.char(186) -- C3 BA
+local u_acute_latin1 = string.char(250) -- FA
+
 local tests = {
-	[[<abc a1="A1" a2="A2">inside tag `abc'</abc>]],
-	[[<qwerty q1="q1" q2="q2">
+	{
+		[[<abc a1="A1" a2="A2">inside tag `abc'</abc>]],
+		{
+			tag="abc",
+			attr = { "a1", "a2", a1 = "A1", a2 = "A2", },
+			"inside tag `abc'",
+		},
+	},
+	{
+		[[<qwerty q1="q1" q2="q2">
 	<asdf>some text</asdf>
 </qwerty>]],
+		{
+			tag = "qwerty",
+			attr = { "q1", "q2", q1 = "q1", q2 = "q2", },
+			"\n\t",
+			{
+				tag = "asdf",
+				attr = {},
+				"some text",
+			},
+			"\n",
+		},
+	},
+	{
+		[[<ul><li>conteudo 1</li><li>conte]]..u_acute_utf8..[[do 2</li></ul>]],
+		encoding = "UTF-8",
+		{
+			tag = "ul",
+			attr = {},
+			{
+				tag = "li",
+				attr = {},
+				"conteudo 1",
+			},
+			{
+				tag = "li",
+				attr = {},
+				"conteúdo 2",
+			},
+		},
+	},
+	{
+		[[<ul><li>Conteudo 1</li><li>Conte]]..u_acute_latin1..[[do 2</li><li>Conte&uacute;do 3</li></ul>]],
+		encoding = "ISO-8859-1",
+		doctype = [[<!DOCTYPE test [<!ENTITY uacute "&#250;">]>]], -- Ok!
+		{
+			tag = "ul",
+			attr = {},
+			{
+				tag = "li",
+				attr = {},
+				"Conteudo 1",
+			},
+			{
+				tag = "li",
+				attr = {},
+				"Conteúdo 2", -- Latin-1 becomes UTF-8
+			},
+			{
+				tag = "li",
+				attr = {},
+				"Conteúdo 3", -- entity becomes a UTF-8 character
+			},
+		},
+	},
+	{
+		[[<ul><li>Conte&uacute;do</li></ul>]],
+		--doctype = [[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">]], --> ignora as entidades
+		--doctype = [[<!DOCTYPE html SYSTEM "about:legacy-compat">]], --> ignora as entidades
+		--doctype = [[<!DOCTYPE html>]], --> undefined entity
+		--doctype = [[<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">]], --> sintax error
+		--doctype = [[<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" SYSTEM "http://www.w3.org/TR/html4/strict.dtd">]], --> syntax error
+		--doctype = [[<!DOCTYPE HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1//EN//HTML">]], --> syntax error
+		--doctype = [[<!DOCTYPE HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">]], --> ignora entidades
+		--doctype = [[<!DOCTYPE isolat1 PUBLIC "//W3C//ENTITIES Added Latin 1//EN//XML" "http://www.w3.org/2003/entities/2007/isolat1.ent">]], --> ignora entidades
+		doctype = [[<!DOCTYPE test [<!ENTITY uacute "&#250;">]>]], -- Ok!
+		encoding = "UTF-8",
+		{
+			tag = "ul",
+			attr = {},
+			{
+				tag = "li",
+				attr = {},
+				"Conteúdo", -- entity becomes a UTF-8 character
+			},
+		},
+	},
 }
 
-function table._tostring (tab, indent, spacing)
-	local s = {}
-	spacing = spacing or ""
-	indent = indent or "\t"
-    table.insert (s, "{\n")
-    for nome, val in pairs (tab) do
-        table.insert (s, spacing..indent)
-        local t = type(nome)
-		if t == "string" then
-            table.insert (s, string.format ("[%q] = ", tostring (nome)))
-		elseif t == "number" or t == "boolean" then
-            table.insert (s, string.format ("[%s] = ", tostring (nome)))
-        else
-            table.insert (s, t)
-        end
-        t = type(val)
-        if t == "string" or t == "number" then
-            table.insert (s, string.format ("%q", val))
-        elseif t == "table" then
-            table.insert (s, table._tostring (val, indent, spacing..indent))
-        else
-            table.insert (s, t)
-        end
-        table.insert (s, ",\n")
-    end
-    table.insert (s, spacing.."}")
-	return table.concat (s)
-end
-
-function table.print (tab, indent, spacing)
-	io.write (table._tostring (tab, indent, spacing))
+function table.equal (t1, t2)
+	for nome, val in pairs (t1) do
+		local tv = type(val)
+		if tv == "table" then
+			if type(t2[nome]) ~= "table" then
+				return false, "Different types at entry `"..nome.."': t1."..nome.." is "..tv.." while t2."..nome.." is "..type(t2[nome]).." ["..tostring(t2[nome]).."]"
+			else
+				local ok, msg = table.equal (val, t2[nome])
+				if not ok then
+					return false, "["..nome.."]\t"..tostring(val).." ~= "..tostring(t2[nome]).."; "..msg
+				end
+			end
+		else
+			if val ~= t2[nome] then
+				return false, "["..nome.."]\t["..tostring(val).."] ~= ["..tostring(t2[nome])..']'
+			end
+		end
+	end
+	return true
 end
 
 
 for i, s in ipairs(tests) do
-	--s = string.gsub (s, "[\n\r\t]", "")
-	local ds = assert (lom.parse ([[<?xml version="1.0" encoding="ISO-8859-1"?>]]..s))
-	print(table._tostring(ds))
+	io.write'.'
+	local encoding = s.encoding or "ISO-8859-1"
+	local header = [[<?xml version="1.0" encoding="]]..encoding..[["?>]]..(s.doctype or '')
+	local doc = header..s[1]
+
+	local o1 = assert (lom.parse (doc))
+	assert(table.equal (o1, s[2]))
+
+	local o2 = assert (lom.parse (string.gmatch(doc, ".-%>")))
+	assert(table.equal (o2, s[2]))
 end
+
+local o = assert (lom.parse ([[
+<?xml version="1.0"?>
+<a1>
+	<b1>
+		<c1>t111</c1>
+		<c2>t112</c2>
+	</b1>
+	<b2>
+		<c1>t121</c1>
+		<c2>t122</c2>
+	</b2>
+</a1>]]))
+assert (o.tag == "a1")
+assert (o[1] == "\n\t")
+assert (o[2].tag == "b1")
+assert (o[2][2].tag == "c1")
+local c1 = lom.find_elem (o, "c1")
+assert (type(c1) == "table")
+assert (c1.tag == "c1")
+assert (c1[1] == "t111")
+local next_child = lom.list_children (o)
+assert (next_child().tag == "b1")
+assert (next_child().tag == "b2")
+assert (next_child() == nil)
+
+print"OK"

mercurial