util.xmllex: Decode XML entities

Wed, 05 Jan 2011 17:09:08 +0000

author
daurnimator <quae@daurnimator.com>
date
Wed, 05 Jan 2011 17:09:08 +0000
changeset 4003
cb6ddda1cb5f
parent 4002
2b53b4b5d46e
child 4004
3aea0f8ac7e9

util.xmllex: Decode XML entities

util/xmllex.lua file | annotate | diff | comparison | revisions
--- a/util/xmllex.lua	Wed Jan 05 05:14:02 2011 +0000
+++ b/util/xmllex.lua	Wed Jan 05 17:09:08 2011 +0000
@@ -1,11 +1,24 @@
 local assert , ipairs , pairs , setmetatable , rawget , rawset , tostring =
 	assert , ipairs , pairs , setmetatable , rawget , rawset , tostring
-local strsub , strmatch = string.sub , string.match
+local strchar , strgmatch , strgsub , strsub , strmatch = string.char , string.gmatch , string.gsub , string.sub , string.match
 local tblconcat = table.concat
 local tblinsert = table.insert
 
 local stanza_methods = require "util.stanza".stanza_mt;
 
+local entities = setmetatable ( { -- maps an entity name (text between "&" and ";") to its replacement string
+        amp = "&" ;
+        lt = "<" ;
+        gt = ">" ;
+        apos = "'" ;
+        quot = '"' ;
+} , { __index = function ( t , entity ) -- Lua calls __index as ( table , key ); only reached for names not listed above
+        return strchar ( tonumber ( strmatch ( entity , "^#(%d+)$" ) ) or error ( "invalid entity " .. entity ) ) -- decimal reference such as "#38": capture the digits only; anything else raises
+end  } )
+local function xml_unescape ( str ) -- returns str with each "&name;" span looked up in the entities table
+	return ( strgsub ( str , "&([^;]*);" , entities ) ) -- outer parentheses discard gsub's second result (the substitution count)
+end
+
 local function getstring ( msgs , startpos , finishpos )
 	if #msgs == 1 then --All originated in same string
 		return strsub ( msgs[1] , startpos , finishpos )
@@ -23,6 +36,9 @@
 			return str
 		else
 			str = getstring ( v.msgs , v.start , v.finish )
+			if v.type == "text" then
+				str = xml_unescape ( str )
+			end
 			v.stringform = str
 			return str
 		end
@@ -125,8 +141,8 @@
 
 local function get_attr ( str  )
 	local attr = { }
-	for name , quote, attvalue in str:gmatch ( [=[([^%s=/<]+)%s*=%s*(["'])([^'"]*)%2]=] ) do
-		attr [ name ] = attvalue
+	for name , _ , attvalue in strgmatch ( str , [=[([^%s=/<]+)%s*=%s*(["'])(.-)%2]=] ) do -- captures: name, opening quote, value; lazy value match stops at the same quote (%2), so the other quote character may appear inside
+		attr [ name ] = xml_unescape ( attvalue ) -- store the attribute value with XML entities decoded
 	end
 	return attr
 end

mercurial