parsers/markdown.lua

changeset 12
4c759312950b
parent 0
b40ca010c49c
child 18
a96836139ff9
equal deleted inserted replaced
11:36dd235e09d2 12:4c759312950b
1 #!/usr/bin/env lua
2
3 --[[
4 # markdown.lua -- version 0.32
5
6 <http://www.frykholm.se/files/markdown.lua>
7
8 **Author:** Niklas Frykholm, <niklas@frykholm.se>
9 **Date:** 31 May 2008
10
11 This is an implementation of the popular text markup language Markdown in pure Lua.
12 Markdown can convert documents written in a simple and easy to read text format
13 to well-formatted HTML. For a more thorough description of Markdown and the Markdown
14 syntax, see <http://daringfireball.net/projects/markdown>.
15
16 The original Markdown source is written in Perl and makes heavy use of advanced
17 regular expression techniques (such as negative look-ahead, etc) which are not available
18 in Lua's simple regex engine. Therefore this Lua port has been rewritten from the ground
19 up. It is probably not completely bug free. If you notice any bugs, please report them to
20 me. A unit test that exposes the error is helpful.
21
22 ## Usage
23
24 require "markdown"
25 markdown(source)
26
27 ``markdown.lua`` exposes a single global function named ``markdown(s)`` which applies the
28 Markdown transformation to the specified string.
29
30 ``markdown.lua`` can also be used directly from the command line:
31
32 lua markdown.lua test.md
33
34 Creates a file ``test.html`` with the converted content of ``test.md``. Run:
35
36 lua markdown.lua -h
37
38 For a description of the command-line options.
39
40 ``markdown.lua`` uses the same license as Lua, the MIT license.
41
42 ## License
43
44 Copyright &copy; 2008 Niklas Frykholm.
45
46 Permission is hereby granted, free of charge, to any person obtaining a copy of this
47 software and associated documentation files (the "Software"), to deal in the Software
48 without restriction, including without limitation the rights to use, copy, modify, merge,
49 publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
50 to whom the Software is furnished to do so, subject to the following conditions:
51
52 The above copyright notice and this permission notice shall be included in all copies
53 or substantial portions of the Software.
54
55 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
56 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
60 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
61 THE SOFTWARE.
62
63 ## Version history
64
65 - **0.32** -- 31 May 2008
66 - Fix for links containing brackets
67 - **0.31** -- 1 Mar 2008
68 - Fix for link definitions followed by spaces
69 - **0.30** -- 25 Feb 2008
70 - Consistent behavior with Markdown when the same link reference is reused
71 - **0.29** -- 24 Feb 2008
72 - Fix for <pre> blocks with spaces in them
73 - **0.28** -- 18 Feb 2008
74 - Fix for link encoding
75 - **0.27** -- 14 Feb 2008
76 - Fix for link database links with ()
77 - **0.26** -- 06 Feb 2008
78 - Fix for nested italic and bold markers
79 - **0.25** -- 24 Jan 2008
80 - Fix for encoding of naked <
81 - **0.24** -- 21 Jan 2008
82 - Fix for link behavior.
83 - **0.23** -- 10 Jan 2008
84 - Fix for a regression bug in longer expressions in italic or bold.
85 - **0.22** -- 27 Dec 2007
86 - Fix for crash when processing blocks with a percent sign in them.
87 - **0.21** -- 27 Dec 2007
88 - Fix for combined strong and emphasis tags
89 - **0.20** -- 13 Oct 2007
90 - Fix for < as well in image titles, now matches Dingus behavior
91 - **0.19** -- 28 Sep 2007
92 - Fix for quotation marks " and ampersands & in link and image titles.
93 - **0.18** -- 28 Jul 2007
94 - Does not crash on unmatched tags (behaves like standard markdown)
95 - **0.17** -- 12 Apr 2007
96 - Fix for links with %20 in them.
97 - **0.16** -- 12 Apr 2007
98 - Do not require arg global to exist.
99 - **0.15** -- 28 Aug 2006
100 - Better handling of links with underscores in them.
101 - **0.14** -- 22 Aug 2006
102 - Bug for *`foo()`*
103 - **0.13** -- 12 Aug 2006
104 - Added -l option for including stylesheet inline in document.
105 - Fixed bug in -s flag.
106 - Fixed emphasis bug.
107 - **0.12** -- 15 May 2006
108 - Fixed several bugs to comply with MarkdownTest 1.0 <http://six.pairlist.net/pipermail/markdown-discuss/2004-December/000909.html>
109 - **0.11** -- 12 May 2006
110 - Fixed bug for escaping `*` and `_` inside code spans.
111 - Added license terms.
112 - Changed join() to table.concat().
113 - **0.10** -- 3 May 2006
114 - Initial public release.
115
116 // Niklas
117 ]]
118
119
-- Set up a table for holding local functions to avoid polluting the global namespace.
-- M is installed as the environment of this chunk (setfenv level 1), so every
-- non-local name assigned below is stored in M. Reads that miss M fall through
-- to the real globals via the __index entry of MT.
local M = {}
local MT = {__index = _G}
setmetatable(M, MT)
setfenv(1, M)
125
126 ----------------------------------------------------------------------
127 -- Utility functions
128 ----------------------------------------------------------------------
129
-- Locks table t from changes: any attempt to create a new key in t raises an
-- error. This is useful for detecting variables that have accidentally been
-- made global.
--
-- Existing keys can still be overwritten (only __newindex is intercepted), and
-- the __index of any metatable t already had is carried over so inherited
-- lookups keep working.
--
-- t: the table to lock; its metatable is replaced.
function lock(t)
    -- Kept local so that locking a table does not itself create a global.
    local function lock_new_index(_, k)
        -- tostring() keeps the message usable for non-string keys; level 2
        -- blames the code that performed the assignment.
        error("module has been locked -- " .. tostring(k) .. " must be declared local", 2)
    end

    local mt = {__newindex = lock_new_index}
    local previous = getmetatable(t)
    if previous then mt.__index = previous.__index end
    setmetatable(t, mt)
end
142
-- Builds a new table by passing every entry of t through f.
-- f is called as f(value, key); its result is stored under the same key.
-- t itself is not modified.
function map(t, f)
    local mapped = {}
    for key, value in pairs(t) do
        mapped[key] = f(value, key)
    end
    return mapped
end
149
-- The identity function: returns its argument unchanged. Useful as a
-- placeholder where a transformation function is expected.
function identity(text)
    return text
end
152
-- Functional style if statement: returns a when t is truthy, otherwise b.
-- NOTE: both a and b are evaluated by the caller before the call, so there is
-- no short circuit evaluation.
function iff(t, a, b)
    if not t then
        return b
    end
    return a
end
155
-- Splits the text into an array of pieces.
-- sep is a Lua pattern (not a plain string) and defaults to "\n", which yields
-- one entry per line. The separators themselves are dropped; text after the
-- last separator (possibly the empty string) is always the final entry.
function split(text, sep)
    local delimiter = sep or "\n"
    local pieces = {}
    local cursor = 1
    local first, last = text:find(delimiter, cursor)
    while first do
        table.insert(pieces, text:sub(cursor, first - 1))
        cursor = last + 1
        first, last = text:find(delimiter, cursor)
    end
    table.insert(pieces, text:sub(cursor))
    return pieces
end
169
-- Converts tabs to spaces using tab stops every 4 columns.
-- Each tab is replaced by 1 to 4 spaces so that the text following it starts
-- at the next multiple of 4, measured from the previous tab or line start.
function detab(text)
    local tab_width = 4
    local expanded = text:gsub("([^\n]-)\t", function(prefix)
        -- Number of spaces needed to reach the next tab stop (never zero).
        local padding = tab_width - prefix:len() % tab_width
        return prefix .. string.rep(" ", padding)
    end)
    return expanded
end
181
-- Runs string.find on s (starting at index) for every pattern in the list and
-- returns the results of the match that starts earliest: start, stop and any
-- captures. When two patterns match at the same position the one listed first
-- wins. Returns nothing when no pattern matches.
function find_first(s, patterns, index)
    local best = {}
    for _, pattern in ipairs(patterns) do
        local found = {s:find(pattern, index)}
        if #found > 0 then
            if #best == 0 or found[1] < best[1] then
                best = found
            end
        end
    end
    return unpack(best)
end
191
-- If a replacement array is specified, the range [start, stop] in the array is
-- replaced (in place) with the elements of the replacement array, in their
-- original order, and the modified array is returned. Without a replacement
-- array a new array holding the elements from start to stop is returned and
-- the input is left untouched.
function splice(array, start, stop, replacement)
    if replacement then
        -- Removing at `start` repeatedly shifts the remaining range down.
        for _ = start, stop do
            table.remove(array, start)
        end
        -- Insert each element at its own offset; inserting all of them at
        -- `start` would reverse the replacement.
        for offset, value in ipairs(replacement) do
            table.insert(array, start + offset - 1, value)
        end
        return array
    end

    local section = {}
    for i = start, stop do
        table.insert(section, array[i])
    end
    return section
end
214
-- Outdents the text one step: removes up to 4 leading spaces (one indent
-- step, matching the 4-column tab width used by detab and the 4 characters
-- classify strips from indented lines) from the start of every line.
function outdent(text)
    local indent_width = 4
    -- Built with string.rep so the number of optional spaces is explicit.
    local pattern = "\n" .. string.rep(" ?", indent_width)
    -- A newline is prepended so the first line is handled like all others,
    -- and dropped again afterwards.
    local stripped = ("\n" .. text):gsub(pattern, "\n")
    return stripped:sub(2)
end
222
-- Indents the text one step by inserting the indent after every newline.
-- The first line is left as it is, since it is not preceded by a newline.
function indent(text)
    local indented = text:gsub("\n", "\n ")
    return indented
end
228
-- Does a simple tokenization of html data. Returns the data as a list of tokens.
-- Each token is a table with a type field (which is either "tag" or "text") and
-- a text field (which contains the original token data).
--
-- A tag starts at "<!--", "<?" or "<" followed by a lowercase letter, "/", "!"
-- or "$". Comments run to "-->", processing instructions to "?>", and anything
-- else to the balancing ">". An opener with no terminator is emitted as a
-- one-character text token. A final text token is always appended, even when
-- it is empty.
function tokenize_html(html)
    local tokens = {}
    local openers = {"<!%-%-", "<[a-z/!$]", "<%?"}
    local cursor = 1
    while true do
        local tag_start = find_first(html, openers, cursor)
        if not tag_start then
            table.insert(tokens, {type="text", text=html:sub(cursor)})
            return tokens
        end
        if tag_start ~= cursor then
            table.insert(tokens, {type="text", text = html:sub(cursor, tag_start-1)})
        end

        -- Pick the terminator that belongs to the kind of tag found.
        local closer
        if html:match("^<!%-%-", tag_start) then
            closer = "%-%->"
        elseif html:match("^<%?", tag_start) then
            closer = "?>"
        else
            closer = "%b<>"
        end

        local _, tag_stop = html:find(closer, tag_start)
        if tag_stop then
            table.insert(tokens, {type="tag", text=html:sub(tag_start, tag_stop)})
            cursor = tag_stop + 1
        else
            -- Unterminated tag: keep the "<" as text and carry on after it.
            table.insert(tokens, {type="text", text=html:sub(tag_start, tag_start)})
            cursor = tag_start + 1
        end
    end
end
262
263 ----------------------------------------------------------------------
264 -- Hash
265 ----------------------------------------------------------------------
266
267 -- This is used to "hash" data into alphanumeric strings that are unique
268 -- in the document. (Note that this is not a cryptographic hash; the hash
269 -- function is not one-way.) The hash procedure is used to protect parts
270 -- of the document from further processing.
271
-- State shared by init_hash() and hash().
local HASH = {
    -- Has the hash been inited. hash() asserts on this flag.
    inited = false,

    -- The unique string prepended to all hash values. This is to ensure
    -- that hash values do not accidentally coincide with an actual existing
    -- string in the document.
    identifier = "",

    -- Counter that counts up for each new hash instance.
    counter = 0,

    -- Hash table: maps each hashed string to the id generated for it.
    table = {}
}
287
-- Inits hashing. Resets the counter and the hash table and chooses an
-- identifier of the form "HASH<n>" (smallest n >= 0) that doesn't occur
-- anywhere in the text.
function init_hash(text)
    HASH.inited = true
    HASH.identifier = ""
    HASH.counter = 0
    HASH.table = {}

    local suffix = 0
    local candidate = "HASH" .. suffix
    -- Plain (non-pattern) search; bump the suffix until there is no collision.
    while text:find(candidate, 1, true) do
        suffix = suffix + 1
        candidate = "HASH" .. suffix
    end
    HASH.identifier = candidate
end
306
-- Returns the hashed value for s. The same string always maps to the same id
-- (until the next init_hash); a new id is "<identifier><counter>X".
-- Requires init_hash() to have been called.
function hash(s)
    assert(HASH.inited)
    local id = HASH.table[s]
    if not id then
        HASH.counter = HASH.counter + 1
        id = HASH.identifier .. HASH.counter .. "X"
        HASH.table[s] = id
    end
    return id
end
317
318 ----------------------------------------------------------------------
319 -- Protection
320 ----------------------------------------------------------------------
321
322 -- The protection module is used to "protect" parts of a document
323 -- so that they are not modified by subsequent processing steps.
324 -- Protected parts are saved in a table for later unprotection
325
-- Protection data
local PD = {
    -- Saved blocks that have been converted, keyed by the hash that
    -- replaced them in the text.
    blocks = {},

    -- Block level tags that will be protected
    tags = {"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote",
        "pre", "table", "dl", "ol", "ul", "script", "noscript", "form", "fieldset",
        "iframe", "math", "ins", "del"}
}
336
-- Returns a Lua pattern for a block tag that begins and ends in the leftmost
-- column and may contain indented subtags, i.e.
--     <div>
--         A nested block.
--         <div>
--             Nested data.
--         </div>
--     </div>
-- The match includes the newline before the opening tag and the newline after
-- the closing tag.
function block_pattern(tag)
    return string.format("\n<%s.-\n</%s>[ \t]*\n", tag, tag)
end
348
-- Returns a Lua pattern for a block tag that begins right after a newline and
-- whose closing tag is followed only by optional blanks and a newline. Unlike
-- block_pattern, the closing tag does not have to start a line.
function line_pattern(tag)
    return string.format("\n<%s.-</%s>[ \t]*\n", tag, tag)
end
353
-- Protects the range of characters from start to stop in the text and
-- returns the protected string.
-- The range is saved in PD.blocks under its hash. In the returned text the
-- characters at positions start and stop themselves are kept on either side
-- of the hash, and everything between them is replaced by it.
function protect_range(text, start, stop)
    local block = text:sub(start, stop)
    local key = hash(block)
    PD.blocks[key] = block
    return text:sub(1, start) .. key .. text:sub(stop)
end
363
-- Protect every part of the text that matches any of the patterns. The
-- earliest match in the text is protected first, then the search restarts
-- from the beginning until nothing matches any more.
function protect_matches(text, patterns)
    local start, stop = find_first(text, patterns)
    while start do
        text = protect_range(text, start, stop)
        start, stop = find_first(text, patterns)
    end
    return text
end
374
-- Protects blocklevel tags in the specified text. The passes run in order:
--   1. potentially nested block tags (block_pattern for every tag in PD.tags)
--   2. block tags at the line level (line_pattern for every tag in PD.tags)
--   3. <hr> tags
--   4. comment tags
function protect(text)
    local passes = {
        map(PD.tags, block_pattern),
        map(PD.tags, line_pattern),
        {"\n<hr[^>]->[ \t]*\n"},
        {"\n<!%-%-.-%-%->[ \t]*\n"},
    }
    for _, patterns in ipairs(passes) do
        text = protect_matches(text, patterns)
    end
    return text
end
386
-- Tells whether the string s is a hash resulting from protection.
-- Returns the protected block stored for s (a truthy string) when it is, and
-- nil otherwise.
function is_protected(s)
    local saved = PD.blocks[s]
    return saved
end
391
-- Unprotects the specified text by expanding all the nonces: every hash found
-- in PD.blocks is replaced by the block it stands for.
function unprotect(text)
    for key, block in pairs(PD.blocks) do
        -- "%" is special in gsub replacement strings, so double it first.
        local replacement = block:gsub("%%", "%%%%")
        text = text:gsub(key, replacement)
    end
    return text
end
400
401
402 ----------------------------------------------------------------------
403 -- Block transform
404 ----------------------------------------------------------------------
405
406 -- The block transform functions transform the text on the block level.
407 -- They work with the text as an array of lines rather than as individual
408 -- characters.
409
-- Returns true if the line is a ruler of (char) characters.
-- The line must contain at least three char characters and contain only spaces and
-- char characters. char is a single character; it is %-escaped before being
-- used in the patterns.
function is_ruler_of(line, char)
    local c = "%" .. char
    local only_ruler_chars = line:match("^[ " .. c .. "]*$")
    local has_three = line:match(c .. ".*" .. c .. ".*" .. c)
    if only_ruler_chars and has_three then
        return true
    end
    return false
end
418
-- Identifies the block level formatting present in the line.
-- Returns an info table that always has `line` (the raw line), `text` and
-- `type`. The checks run in this order and the first hit wins:
--   "indented"   - line starts with 4 spaces; `outdented` is the rest
--   "ruler"      - ruler of *, -, _ or =; `ruler_char` is the character
--   "blank"      - empty line
--   "header"     - starts with #; `level` is the number of #, `text` the title
--   "list_item"  - numbered ("1. x": `list_type` "numeric", `number`) or
--                  bulleted ("* x": `list_type` "bullet", `bullet`); up to 3
--                  leading spaces are allowed; `text` is the item text
--   "blockquote" - starts with >; `text` is the quoted text
--   "raw"        - the line is a protection hash; `html` is the expanded block
--   "normal"     - anything else
function classify(line)
    local info = {line = line, text = line}

    -- An indent step is 4 spaces, which is exactly what sub(5) removes.
    -- Testing for fewer would also swallow list items with 1-3 leading
    -- spaces, which the patterns further down explicitly allow.
    if line:match("^" .. string.rep(" ", 4)) then
        info.type = "indented"
        info.outdented = line:sub(5)
        return info
    end

    for _,c in ipairs({'*', '-', '_', '='}) do
        if is_ruler_of(line, c) then
            info.type = "ruler"
            info.ruler_char = c
            return info
        end
    end

    if line == "" then
        info.type = "blank"
        return info
    end

    local hashes, title = line:match("^(#+)[ \t]*(.-)[ \t]*#*[ \t]*$")
    if hashes then
        info.type = "header"
        info.level = hashes:len()
        info.text = title
        return info
    end

    local number, numbered_text = line:match("^ ? ? ?(%d+)%.[ \t]+(.+)")
    if number then
        info.type = "list_item"
        info.list_type = "numeric"
        info.number = tonumber(number)
        info.text = numbered_text
        return info
    end

    local bullet, bullet_text = line:match("^ ? ? ?([%*%+%-])[ \t]+(.+)")
    if bullet then
        info.type = "list_item"
        info.list_type = "bullet"
        info.bullet = bullet
        info.text = bullet_text
        return info
    end

    local quoted = line:match("^>[ \t]?(.*)")
    if quoted then
        info.type = "blockquote"
        info.text = quoted
        return info
    end

    if is_protected(line) then
        info.type = "raw"
        info.html = unprotect(line)
        return info
    end

    info.type = "normal"
    return info
end
483
-- Finds headers consisting of a normal line followed by a ruler of "=" or "-"
-- and converts each pair into a single header entry (level 1 for "=", level 2
-- for "-"). The array is modified in place and returned.
function headers(array)
    local i = 1
    while i < #array do
        local current, following = array[i], array[i + 1]
        local underline = following.ruler_char
        if current.type == "normal" and following.type == "ruler"
                and (underline == "-" or underline == "=") then
            -- Drop the ruler line and replace the text line with the header.
            table.remove(array, i + 1)
            array[i] = {
                line = current.line,
                text = current.line,
                type = "header",
                level = iff(underline == "=", 1, 2),
            }
        end
        i = i + 1
    end
    return array
end
502
-- Find list blocks and convert them to protected data blocks.
-- array is a list of classified lines (see classify). Every run of lines that
-- forms a list is replaced by a single "raw" entry holding the generated
-- <ol>/<ul> html. sublist relaxes the rule for where a list may start (see
-- find_list). The modified array is returned.
function lists(array, sublist)
    -- Converts the lines of one list (arr[1] is its first list_item) to html.
    local function process_list(arr)
        -- True if any line of the list is blank.
        local function any_blanks(arr)
            for i = 1, #arr do
                if arr[i].type == "blank" then return true end
            end
            return false
        end

        -- Groups the lines into items: every list_item line starts a new
        -- group, all other lines are appended to the current group.
        local function split_list_items(arr)
            local acc = {arr[1]}
            local res = {}
            for i=2,#arr do
                if arr[i].type == "list_item" then
                    table.insert(res, acc)
                    acc = {arr[i]}
                else
                    table.insert(acc, arr[i])
                end
            end
            table.insert(res, acc)
            return res
        end

        -- Converts the lines of a single item to an <li> element.
        -- block: when true the item text gets the full block transform,
        -- otherwise only nested lists and span level markup are handled and
        -- no paragraphs are generated.
        local function process_list_item(lines, block)
            -- Trailing blank lines do not belong to the item.
            while lines[#lines].type == "blank" do
                table.remove(lines)
            end

            -- The first line contributes its text (marker already stripped by
            -- classify); continuation lines are outdented one step.
            local itemtext = lines[1].text
            for i=2,#lines do
                itemtext = itemtext .. "\n" .. outdent(lines[i].line)
            end
            if block then
                itemtext = block_transform(itemtext, true)
                -- Text containing a <pre> is not re-indented.
                if not itemtext:find("<pre>") then itemtext = indent(itemtext) end
                return " <li>" .. itemtext .. "</li>"
            else
                local lines = split(itemtext)
                lines = map(lines, classify)
                lines = lists(lines, true)
                lines = blocks_to_html(lines, true)
                itemtext = table.concat(lines, "\n")
                if not itemtext:find("<pre>") then itemtext = indent(itemtext) end
                return " <li>" .. itemtext .. "</li>"
            end
        end

        -- A single blank line anywhere makes every item of the list a block item.
        local block_list = any_blanks(arr)
        local items = split_list_items(arr)
        local out = ""
        for _, item in ipairs(items) do
            out = out .. process_list_item(item, block_list) .. "\n"
        end
        -- The type of the first item decides between <ol> and <ul>.
        if arr[1].list_type == "numeric" then
            return "<ol>\n" .. out .. "</ol>"
        else
            return "<ul>\n" .. out .. "</ul>"
        end
    end

    -- Finds the range of lines composing the first list in the array. A list
    -- starts with a list_item on the first line or a list_item preceded by a
    -- blank line. It ends before the first blank line that is followed by
    -- something other than a list_item, an indented line or another blank
    -- line, or at the end of the array; trailing blank lines are excluded.
    --
    -- A sublist can start with just (list_item) and does not need a blank.
    -- Returns start, stop or nil when there is no list.
    local function find_list(array, sublist)
        local function find_list_start(array, sublist)
            if array[1].type == "list_item" then return 1 end
            if sublist then
                for i = 1,#array do
                    if array[i].type == "list_item" then return i end
                end
            else
                for i = 1, #array-1 do
                    if array[i].type == "blank" and array[i+1].type == "list_item" then
                        return i+1
                    end
                end
            end
            return nil
        end
        local function find_list_end(array, start)
            local pos = #array
            for i = start, #array-1 do
                if array[i].type == "blank" and array[i+1].type ~= "list_item"
                    and array[i+1].type ~= "indented" and array[i+1].type ~= "blank" then
                    pos = i-1
                    break
                end
            end
            -- Back up over blank lines at the end of the list.
            while pos > start and array[pos].type == "blank" do
                pos = pos - 1
            end
            return pos
        end

        local start = find_list_start(array, sublist)
        if not start then return nil end
        return start, find_list_end(array, start)
    end

    -- Replace lists one at a time; the generated "raw" entry is no longer a
    -- list_item, so each pass finds the next list.
    while true do
        local start, stop = find_list(array, sublist)
        if not start then break end
        local text = process_list(splice(array, start, stop))
        local info = {
            line = text,
            type = "raw",
            html = text
        }
        array = splice(array, start, stop, {info})
    end

    -- Convert any remaining list items to normal
    for _,line in ipairs(array) do
        if line.type == "list_item" then line.type = "normal" end
    end

    return array
end
625
-- Find and convert blockquote markers.
-- lines is a list of classified lines; every blockquote found is replaced by
-- a single "raw" entry holding the generated <blockquote> html. The modified
-- list is returned.
function blockquotes(lines)
    -- Returns the first and last index of the first blockquote, or nil.
    -- The quote continues over blockquote and blank lines, and over normal
    -- lines that directly follow a non-blank line. Trailing blank lines are
    -- not part of it.
    local function find_blockquote(lines)
        local first
        for i, line in ipairs(lines) do
            if line.type == "blockquote" then
                first = i
                break
            end
        end
        if not first then return nil end

        local last = #lines
        for i = first + 1, #lines do
            local kind = lines[i].type
            if kind ~= "blank" and kind ~= "blockquote" then
                if kind ~= "normal" or lines[i-1].type == "blank" then
                    last = i - 1
                    break
                end
            end
        end
        while lines[last].type == "blank" do
            last = last - 1
        end
        return first, last
    end

    -- Runs the block transform on the quoted text and wraps the result.
    local function process_blockquote(lines)
        local texts = {}
        for i = 1, #lines do
            texts[i] = lines[i].text
        end
        local inner = block_transform(table.concat(texts, "\n"))
        -- Text containing a <pre> is not re-indented.
        if not inner:find("<pre>") then inner = indent(inner) end
        return "<blockquote>\n " .. inner .. "\n</blockquote>"
    end

    local start, stop = find_blockquote(lines)
    while start do
        local html = process_blockquote(splice(lines, start, stop))
        local info = {
            line = html,
            type = "raw",
            html = html
        }
        lines = splice(lines, start, stop, {info})
        start, stop = find_blockquote(lines)
    end
    return lines
end
675
-- Find and convert codeblocks.
-- lines is a list of classified lines; every run of indented lines (blank
-- lines in between included) is replaced by a single "raw" entry holding the
-- generated <pre><code> html. The modified list is returned.
function codeblocks(lines)
    -- Returns the first and last index of the first code block, or nil.
    -- Trailing blank lines are not part of the block.
    local function find_codeblock(lines)
        local first
        for i, line in ipairs(lines) do
            if line.type == "indented" then
                first = i
                break
            end
        end
        if not first then return nil end

        local last = #lines
        for i = first + 1, #lines do
            local kind = lines[i].type
            if not (kind == "indented" or kind == "blank") then
                last = i - 1
                break
            end
        end
        while lines[last].type == "blank" do
            last = last - 1
        end
        return first, last
    end

    -- Outdents, encodes and detabs every line and wraps the result.
    local function process_codeblock(lines)
        local encoded = {}
        for i = 1, #lines do
            encoded[i] = detab(encode_code(outdent(lines[i].line)))
        end
        return "<pre><code>" .. table.concat(encoded, "\n") .. "\n</code></pre>"
    end

    local start, stop = find_codeblock(lines)
    while start do
        local html = process_codeblock(splice(lines, start, stop))
        local info = {
            line = html,
            type = "raw",
            html = html
        }
        lines = splice(lines, start, stop, {info})
        start, stop = find_codeblock(lines)
    end
    return lines
end
717
-- Convert lines to html code.
-- lines is a list of classified lines; the result is a list of html strings.
--   normal  - consecutive normal lines are joined with newlines, span
--             transformed and wrapped in <p> unless no_paragraphs is set
--   header  - <hN> around the span transformed text
--   ruler   - <hr/>
--   raw     - the stored html, unchanged
--   other   - the original line, unchanged
function blocks_to_html(lines, no_paragraphs)
    local out = {}
    local i = 1
    while i <= #lines do
        local line = lines[i]
        local kind = line.type
        if kind == "normal" then
            -- Swallow the following normal lines into the same paragraph.
            local paragraph = line.line
            while i+1 <= #lines and lines[i+1].type == "normal" do
                i = i + 1
                paragraph = paragraph .. "\n" .. lines[i].line
            end
            local html = span_transform(paragraph)
            if not no_paragraphs then
                html = "<p>" .. html .. "</p>"
            end
            table.insert(out, html)
        elseif kind == "header" then
            local open = "<h" .. line.level .. ">"
            local close = "</h" .. line.level .. ">"
            table.insert(out, open .. span_transform(line.text) .. close)
        elseif kind == "ruler" then
            table.insert(out, "<hr/>")
        elseif kind == "raw" then
            table.insert(out, line.html)
        else
            table.insert(out, line.line)
        end
        i = i + 1
    end
    return out
end
751
-- Perform all the block level transforms on text and return the html.
-- Order: split into lines, classify, headers, lists, codeblocks, blockquotes,
-- conversion to html. sublist is passed on to lists().
function block_transform(text, sublist)
    local classified = map(split(text), classify)
    local with_headers = headers(classified)
    local with_lists = lists(with_headers, sublist)
    local with_code = codeblocks(with_lists)
    local with_quotes = blockquotes(with_code)
    local html_lines = blocks_to_html(with_quotes)
    return table.concat(html_lines, "\n")
end
764
-- Debug function for printing a line array to see the result
-- of partial transforms. Prints index, type and text (or, when there is no
-- text, the raw line) of every entry.
function print_lines(lines)
    for index, entry in ipairs(lines) do
        local shown = entry.text or entry.line
        print(index, entry.type, shown)
    end
end
772
773 ----------------------------------------------------------------------
774 -- Span transform
775 ----------------------------------------------------------------------
776
777 -- Functions for transforming the text at the span level.
778
-- These characters may need to be escaped because they have a special
-- meaning in markdown. Note that the string literal itself contains a single
-- quote at each end, so ' is part of the set as well.
escape_chars = "'\\`*_{}[]()>#+-.!'"
-- Maps an escaped text to its hash. Rebuilt by init_escape_table() and
-- extended by add_escape().
escape_table = {}
783
-- Resets escape_table so that it maps every single character of escape_chars
-- to its hash.
function init_escape_table()
    escape_table = {}
    for c in escape_chars:gmatch(".") do
        escape_table[c] = hash(c)
    end
end
791
-- Adds a new escape to the escape table (unless text already has one) and
-- returns the hash that stands for text.
function add_escape(text)
    local code = escape_table[text]
    if not code then
        code = hash(text)
        escape_table[text] = code
    end
    return code
end
799
-- Escape characters that should not be disturbed by markdown.
-- The text is tokenized into html tags and plain text. Inside tags, * and _
-- are replaced by their hashes so they don't conflict with their use in
-- markdown; in plain text, backslash escapes are encoded.
function escape_special_chars(text)
    local pieces = {}
    for _, token in ipairs(tokenize_html(text)) do
        local piece = token.text
        if token.type == "tag" then
            piece = piece:gsub("%*", escape_table["*"])
            piece = piece:gsub("%_", escape_table["_"])
        else
            piece = encode_backslash_escapes(piece)
        end
        pieces[#pieces + 1] = piece
    end
    return table.concat(pieces)
end
818
-- Encode backslash-escaped characters in the markdown source: every
-- backslash followed by one of the characters in escape_chars is replaced by
-- that character's hash.
function encode_backslash_escapes(t)
    for c in escape_chars:gmatch(".") do
        t = t:gsub("\\%" .. c, escape_table[c])
    end
    return t
end
827
-- Unescape characters that have been encoded: every hash in escape_table is
-- replaced by the text it stands for. The pass is repeated until the text no
-- longer changes, because an expanded escape may itself contain hashes.
function unescape_special_chars(t)
    local before
    repeat
        before = t
        for original, code in pairs(escape_table) do
            -- "%" is special in gsub replacement strings, so double it first.
            local replacement = original:gsub("%%", "%%%%")
            t = t:gsub(code, replacement)
        end
    until t == before
    return t
end
838
-- Encode/escape certain characters inside Markdown code runs.
-- The point is that in code, these characters are literals,
-- and lose their special Markdown meanings.
-- &, < and > become html entities (in that order, so the entities are not
-- re-encoded); after that every key of escape_table is replaced by its hash.
function encode_code(s)
    local entities = {
        {"%&", "&amp;"},
        {"<", "&lt;"},
        {">", "&gt;"},
    }
    for _, entity in ipairs(entities) do
        s = s:gsub(entity[1], entity[2])
    end
    for original, code in pairs(escape_table) do
        s = s:gsub("%" .. original, code)
    end
    return s
end
851
-- Handle backtick blocks: replaces every `code` span in s by the hash of the
-- generated <code> element and returns the result.
function code_spans(s)
    -- Hide escaped backslashes and escaped backticks behind their hashes so
    -- that an escaped backtick is not seen as a span delimiter below.
    s = s:gsub("\\\\", escape_table["\\"])
    s = s:gsub("\\`", escape_table["`"])

    local pos = 1
    while true do
        -- Opening delimiter: a run of one or more backticks.
        local start, stop = s:find("`+", pos)
        if not start then return s end
        local count = stop - start + 1
        -- Find a matching number of backticks
        local estart, estop = s:find(string.rep("`", count), stop+1)
        local brstart = s:find("\n", stop+1)
        -- Only accept the span if it closes before the next newline.
        if estart and (not brstart or estart < brstart) then
            local code = s:sub(stop+1, estart-1)
            -- Trim blanks at both ends of the code.
            code = code:gsub("^[ \t]+", "")
            code = code:gsub("[ \t]+$", "")
            -- Inside the span the hashes inserted above are expanded to
            -- (backslash hash)(backslash hash) and (backslash hash)(backtick
            -- hash). NOTE(review): presumably so the backslash shows up
            -- literally in the code after unescaping -- confirm.
            code = code:gsub(escape_table["\\"], escape_table["\\"] .. escape_table["\\"])
            code = code:gsub(escape_table["`"], escape_table["\\"] .. escape_table["`"])
            code = "<code>" .. encode_code(code) .. "</code>"
            -- Replace the whole element by a hash to shield it from the
            -- remaining span transforms.
            code = add_escape(code)
            s = s:sub(1, start-1) .. code .. s:sub(estop+1)
            -- Continue right after the inserted hash.
            pos = start + code:len()
        else
            -- No usable closing delimiter: skip this run of backticks.
            pos = stop + 1
        end
    end
    -- Not reached: the loop above only exits through its return statement.
    return s
end
881
-- Encode alt text: encodes &, " and < as html entities (& first, so the
-- entities themselves are not re-encoded). A nil or false argument is
-- returned as it is.
function encode_alt(s)
    if not s then return s end
    local entities = {
        {'&', '&amp;'},
        {'"', '&quot;'},
        {'<', '&lt;'},
    }
    for _, entity in ipairs(entities) do
        s = s:gsub(entity[1], entity[2])
    end
    return s
end
890
-- Handle image references. Replaces reference style images (![alt][id]) and
-- inline images (![alt](url "title")) in text by the hash of the generated
-- <img> element and returns the result.
function images(text)
    -- Callback for ![alt][id]; both arguments include their brackets.
    -- Returning nil leaves the matched source text as it is.
    local function reference_link(alt, id)
        local raw_alt = alt:match("%b[]"):sub(2,-2)
        alt = encode_alt(raw_alt)
        id = id:match("%[(.*)%]"):lower()
        -- An empty id (![alt][]) means the alt text is the id. This must be
        -- the alt text of this image, not the text of the whole span.
        if id == "" then id = raw_alt:lower() end
        link_database[id] = link_database[id] or {}
        local entry = link_database[id]
        -- Unknown reference: not an image after all.
        if not entry.url then return nil end
        local url = encode_alt(entry.url)
        local title = encode_alt(entry.title)
        if title then title = " title=\"" .. title .. "\"" else title = "" end
        return add_escape ('<img src="' .. url .. '" alt="' .. alt .. '"' .. title .. "/>")
    end

    -- Callback for ![alt](url "title"); alt includes its brackets, link its
    -- parentheses. The url may be wrapped in <>, the title is optional.
    local function inline_link(alt, link)
        alt = encode_alt(alt:match("%b[]"):sub(2,-2))
        local url, title = link:match("%(<?(.-)>?[ \t]*['\"](.+)['\"]")
        -- No title present: take everything between the parentheses as url.
        url = url or link:match("%(<?(.-)>?%)") or ""
        url = encode_alt(url)
        title = encode_alt(title)
        if title then
            return add_escape('<img src="' .. url .. '" alt="' .. alt .. '" title="' .. title .. '"/>')
        else
            return add_escape('<img src="' .. url .. '" alt="' .. alt .. '"/>')
        end
    end

    text = text:gsub("!(%b[])[ \t]*\n?[ \t]*(%b[])", reference_link)
    text = text:gsub("!(%b[])(%b())", inline_link)
    return text
end
923
-- Handle anchor references. Replaces reference style links ([text][id]) and
-- inline links ([text](url "title")) in text by <a> elements whose opening
-- and closing tags are hashed, and returns the result. The link text itself
-- is left in place so later span transforms still apply to it.
function anchors(text)
    -- Callback for [text][id]; both arguments include their brackets.
    -- Returning nil leaves the matched source text as it is.
    local function reference_link(text, id)
        text = text:match("%b[]"):sub(2,-2)
        id = id:match("%b[]"):sub(2,-2):lower()
        -- An empty id ([text][]) means the link text is the id.
        if id == "" then id = text:lower() end
        link_database[id] = link_database[id] or {}
        local entry = link_database[id]
        -- Unknown reference: not a link after all.
        if not entry.url then return nil end
        local url = encode_alt(entry.url)
        local title = encode_alt(entry.title)
        if title then title = " title=\"" .. title .. "\"" else title = "" end
        return add_escape("<a href=\"" .. url .. "\"" .. title .. ">") .. text .. add_escape("</a>")
    end

    -- Callback for [text](url "title"); text includes its brackets, link its
    -- parentheses. The url may be wrapped in <>, the title is optional.
    local function inline_link(text, link)
        text = text:match("%b[]"):sub(2,-2)
        local url, title = link:match("%(<?(.-)>?[ \t]*['\"](.+)['\"]")
        title = encode_alt(title)
        -- No title present: take everything between the parentheses as url.
        url = url or link:match("%(<?(.-)>?%)") or ""
        url = encode_alt(url)
        local open
        if title then
            open = "<a href=\"" .. url .. "\" title=\"" .. title .. "\">"
        else
            open = "<a href=\"" .. url .. "\">"
        end
        -- The closing tag is hashed in both cases, like in reference_link; a
        -- raw "</a>" would stop the emphasis patterns (which refuse to cross
        -- "<" and ">") from spanning a titled link.
        return add_escape(open) .. text .. add_escape("</a>")
    end

    text = text:gsub("(%b[])[ \t]*\n?[ \t]*(%b[])", reference_link)
    text = text:gsub("(%b[])(%b())", inline_link)
    return text
end
956
-- Handle auto links, i.e. <http://www.google.com/>.
-- http, https and ftp urls in angle brackets become plain links; mailto:
-- urls and bare email addresses in angle brackets become obfuscated mail
-- links. Only the opening <a ...> tag is hashed.
function auto_links(text)
    -- Callback for url auto links; s is the url without the angle brackets.
    local function link(s)
        return add_escape("<a href=\"" .. s .. "\">") .. s .. "</a>"
    end
    -- Encode chars as a mix of dec and hex entities to (perhaps) fool
    -- spambots.
    local function encode_email_address(s)
        -- Use a deterministic encoding to make unit testing possible.
        -- Code 45% hex, 45% dec, 10% plain.
        -- Each encoder has a running count that grows by its rate for every
        -- character; the encoder with the highest count is used and then has
        -- its count reduced by one.
        local hex = {code = function(c) return "&#x" .. string.format("%x", c:byte()) .. ";" end, count = 1, rate = 0.45}
        local dec = {code = function(c) return "&#" .. c:byte() .. ";" end, count = 0, rate = 0.45}
        local plain = {code = function(c) return c end, count = 0, rate = 0.1}
        local codes = {hex, dec, plain}
        local function swap(t,k1,k2) local temp = t[k2] t[k2] = t[k1] t[k1] = temp end

        local out = ""
        for i = 1,s:len() do
            for _,code in ipairs(codes) do code.count = code.count + code.rate end
            -- Three compare-and-swap steps order the three encoders by
            -- descending count.
            if codes[1].count < codes[2].count then swap(codes,1,2) end
            if codes[2].count < codes[3].count then swap(codes,2,3) end
            if codes[1].count < codes[2].count then swap(codes,1,2) end

            local code = codes[1]
            local c = s:sub(i,i)
            -- Force encoding of "@" to make email address more invisible.
            if c == "@" and code == plain then code = codes[2] end
            out = out .. code.code(c)
            code.count = code.count - 1
        end
        return out
    end
    -- Callback for mail auto links; s is the address without "mailto:".
    local function mail(s)
        -- Restore characters that were hashed earlier before encoding them.
        s = unescape_special_chars(s)
        local address = encode_email_address("mailto:" .. s)
        local text = encode_email_address(s)
        return add_escape("<a href=\"" .. address .. "\">") .. text .. "</a>"
    end
    -- links
    text = text:gsub("<(https?:[^'\">%s]+)>", link)
    text = text:gsub("<(ftp:[^'\">%s]+)>", link)

    -- mail
    text = text:gsub("<mailto:([^'\">%s]+)>", mail)
    text = text:gsub("<([-.%w]+%@[-.%w]+)>", mail)
    return text
end
1004
-- Encode free standing amps (&) and angles (<)... note that this does not
-- encode free >.
-- An & is left alone only when it looks like an entity: a ";" follows within
-- 15 characters and no blank, newline or other & comes before that ";".
-- A < is encoded when it ends the text or is followed by anything other than
-- a letter, "/", "?", "$" or "!".
function amps_and_angles(s)
    local cursor = 1
    local amp = s:find("&", cursor)
    while amp do
        local semi = s:find(";", amp + 1)
        local breaker = s:find("[ \t\n&]", amp + 1)
        local is_entity = semi
            and not (breaker and breaker < semi)
            and (semi - amp) <= 15
        if not is_entity then
            s = s:sub(1, amp - 1) .. "&amp;" .. s:sub(amp + 1)
        end
        -- Continue after this &, whether or not it was rewritten.
        cursor = amp + 1
        amp = s:find("&", cursor)
    end

    -- encode naked <'s
    s = s:gsub("<([^a-zA-Z/?$!])", "&lt;%1")
    s = s:gsub("<$", "&lt;")

    -- what about >, nothing done in the original markdown source to handle them
    return s
end
1030
-- Handles emphasis markers (* and _) in the text.
--
-- Double markers are converted to <strong> first, then single markers to
-- <em>. For each marker the inner patterns are tried in the listed order.
function emphasis(text)
    local strong_bodies = {
        "([^%s][%*%_]?)",
        "([^%s][^<>]-[^%s][%*%_]?)",
    }
    local em_bodies = {
        "([^%s_])",
        "(<strong>[^%s_]</strong>)",
        "([^%s_][^<>_]-[^%s_])",
        "([^<>_]-<strong>[^<>_]-</strong>[^<>_]-)",
    }

    for _, marker in ipairs({"%*%*", "%_%_"}) do
        for _, body in ipairs(strong_bodies) do
            text = text:gsub(marker .. body .. marker, "<strong>%1</strong>")
        end
    end
    for _, marker in ipairs({"%*", "%_"}) do
        for _, body in ipairs(em_bodies) do
            text = text:gsub(marker .. body .. marker, "<em>%1</em>")
        end
    end
    return text
end
1045
-- Handles line break markers in the text.
--
-- A line that ends in two or more spaces gets a hard break: the run of
-- trailing spaces is replaced by " <br/>". A single trailing space is not a
-- break marker and is left untouched.
-- Returns the converted text followed by the number of breaks inserted (the
-- two results of gsub).
function line_breaks(text)
    return text:gsub("  +\n", " <br/>\n")
end
1050
-- Perform all span level transforms.
--
-- The stages run in a fixed order, each one consuming the (first) result of
-- the stage before it; the final line-break stage produces the return value.
function span_transform(text)
    local with_code = code_spans(text)
    local escaped = escape_special_chars(with_code)
    local with_images = images(escaped)
    local with_anchors = anchors(with_images)
    local with_auto_links = auto_links(with_anchors)
    local entity_safe = amps_and_angles(with_auto_links)
    local emphasized = emphasis(entity_safe)
    -- Parentheses keep only the first result of the last stage.
    return (line_breaks(emphasized))
end
1063
1064 ----------------------------------------------------------------------
1065 -- Markdown
1066 ----------------------------------------------------------------------
1067
-- Cleanup the text by normalizing some possible variations to make further
-- processing easier.
--
-- Returns the normalized text wrapped in one leading and one trailing
-- newline.
function cleanup(text)
    -- Standardize line endings: DOS (\r\n) first, then any remaining bare \r
    local normalized = text:gsub("\r\n", "\n")
    normalized = normalized:gsub("\r", "\n")

    -- Convert all tabs to spaces
    normalized = detab(normalized)

    -- Strip lines with only spaces and tabs. Each match consumes both of its
    -- newlines, so consecutive whitespace-only lines cannot all be matched in
    -- one pass; repeat until a pass changes nothing.
    local replaced
    repeat
        normalized, replaced = normalized:gsub("\n[ \t]+\n", "\n\n")
    until replaced == 0

    return "\n" .. normalized .. "\n"
end
1087
-- Strips link definitions from the text and stores the data in a lookup table.
--
-- Returns two values: the text with every recognized definition removed, and
-- a table mapping each lower-cased link id to {url = ..., title = ...}
-- (title is nil when the definition has none).
function strip_link_definitions(text)
    local linkdb = {}

    -- gsub callback. id arrives with its surrounding brackets (from %b[]).
    local function link_def(id, url, title)
        local key = id:match("%[(.+)%]")
        -- An empty id ("[]") has nothing between the brackets. Returning nil
        -- makes gsub keep the matched text instead of raising on nil:lower().
        if not key then return nil end
        key = key:lower()

        local entry = linkdb[key] or {}
        entry.url = url or entry.url
        entry.title = title or entry.title
        linkdb[key] = entry
        return ""
    end

    local def_no_title = "\n ? ? ?(%b[]):[ \t]*\n?[ \t]*<?([^%s>]+)>?[ \t]*"
    local def_title1 = def_no_title .. "[ \t]+\n?[ \t]*[\"'(]([^\n]+)[\"')][ \t]*"
    local def_title2 = def_no_title .. "[ \t]*\n[ \t]*[\"'(]([^\n]+)[\"')][ \t]*"
    local def_title3 = def_no_title .. "[ \t]*\n?[ \t]+[\"'(]([^\n]+)[\"')][ \t]*"

    -- The titled forms go first so that a title is captured rather than left
    -- behind in the text by the shorter, title-less pattern.
    text = text:gsub(def_title1, link_def)
    text = text:gsub(def_title2, link_def)
    text = text:gsub(def_title3, link_def)
    text = text:gsub(def_no_title, link_def)
    return text, linkdb
end
1111
-- Link definitions collected from the document most recently processed by
-- markdown(); replaced on every call.
link_database = {}

-- Main markdown processing function
--
-- text: the Markdown source. Returns the result of running the source
-- through cleanup, protection, link-definition stripping, the block
-- transform and the final unescaping step, in that order.
function markdown(text)
    init_hash(text)
    init_escape_table()

    local cleaned = cleanup(text)
    local protected = protect(cleaned)
    local stripped, definitions = strip_link_definitions(protected)
    link_database = definitions
    local html = block_transform(stripped)
    -- Parentheses keep only the first result of the unescaping step.
    return (unescape_special_chars(html))
end
1126
1127 ----------------------------------------------------------------------
1128 -- End of module
1129 ----------------------------------------------------------------------
1130
-- Make the global table the environment of the running chunk again, so the
-- code below reads and writes real globals.
setfenv(1, _G)
-- NOTE(review): M and M.lock are defined outside this section; presumably
-- this freezes the module table against further changes -- confirm.
M.lock(M)

-- Expose markdown function to the world
markdown = M.markdown
1136
-- Class for parsing command-line options
--
-- Register handlers with flag(), param() and arg(), then call run() with the
-- argument list.
local OptionParser = {}
OptionParser.__index = OptionParser

-- Creates a new option parser
function OptionParser:new()
    local o = {short = {}, long = {}}
    setmetatable(o, self)
    return o
end

-- Calls f() whenever a flag with specified short and long name is encountered
function OptionParser:flag(short, long, f)
    local info = {type = "flag", f = f}
    if short then self.short[short] = info end
    if long then self.long[long] = info end
end

-- Calls f(param) whenever a parameter flag with specified short and long name is encountered
function OptionParser:param(short, long, f)
    local info = {type = "param", f = f}
    if short then self.short[short] = info end
    if long then self.long[long] = info end
end

-- Calls f(v) for each non-flag argument
function OptionParser:arg(f)
    self.arg = f
end

-- Runs the option parser for the specified set of arguments. Returns true if all arguments
-- were successfully parsed and false otherwise (after printing the reason).
function OptionParser:run(args)
    -- Forwards a positional argument to the registered handler, if any.
    -- The handler is stored in the instance field "arg", which has the same
    -- name as the arg() method above. rawget skips the metatable so that the
    -- method itself is never mistaken for a handler when none was registered.
    local function emit(value)
        local handler = rawget(self, "arg")
        if handler then handler(value) end
    end

    local pos = 1
    while pos <= #args do
        local arg = args[pos]
        if arg == "--" then
            -- Everything after "--" is positional, even if it starts with "-".
            for i = pos + 1, #args do
                emit(args[i])
            end
            return true
        end
        if arg:match("^%-%-") then
            local info = self.long[arg:sub(3)]
            if not info then print("Unknown flag: " .. arg) return false end
            if info.type == "flag" then
                info.f()
                pos = pos + 1
            else
                local param = args[pos + 1]
                if not param then print("No parameter for flag: " .. arg) return false end
                info.f(param)
                pos = pos + 2
            end
        elseif arg:match("^%-") then
            -- A cluster of short flags, e.g. "-nl" or "-sfile.css".
            for i = 2, arg:len() do
                local c = arg:sub(i, i)
                local info = self.short[c]
                if not info then print("Unknown flag: -" .. c) return false end
                if info.type == "flag" then
                    info.f()
                else
                    if i == arg:len() then
                        -- Parameter is the next argument; skip over it.
                        local param = args[pos + 1]
                        if not param then print("No parameter for flag: -" .. c) return false end
                        info.f(param)
                        pos = pos + 1
                    else
                        -- Parameter is the rest of this argument.
                        info.f(arg:sub(i + 1))
                    end
                    break
                end
            end
            pos = pos + 1
        else
            emit(arg)
            pos = pos + 1
        end
    end
    return true
end
1219
-- Handles the case when markdown is run from the command line
--
-- arg: the command-line argument table (arg[0] is the script path, used only
-- by the --test flag to locate markdown-tests.lua).
-- Converts every file named on the command line, writing the result next to
-- it; when no file, --help or --test was given, converts standard input to
-- standard output. Raises an error when a named file cannot be opened.
local function run_command_line(arg)
    -- Replaces each occurrence of placeholder in template with value, taken
    -- literally. A function replacement is used because gsub would interpret
    -- "%" inside a replacement *string*, which damages titles or inlined CSS
    -- such as "width: 100%;".
    local function fill(template, placeholder, value)
        local filled = template:gsub(placeholder, function() return value end)
        return filled
    end

    -- Generate output for input s given options
    local function run(s, options)
        s = markdown(s)
        if not options.wrap_header then return s end
        local header
        if options.header then
            local f = io.open(options.header) or error("Could not open file: " .. options.header)
            header = f:read("*a")
            f:close()
        else
            header = [[
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=CHARSET" />
<title>TITLE</title>
<link rel="stylesheet" type="text/css" href="STYLESHEET" />
</head>
<body>
]]
            -- Explicit title, else the first heading found, else a fixed default.
            local title = options.title or s:match("<h1>(.-)</h1>") or s:match("<h2>(.-)</h2>") or
                s:match("<h3>(.-)</h3>") or "Untitled"
            header = fill(header, "TITLE", title)
            if options.inline_style then
                local f = io.open(options.stylesheet)
                if not f then
                    error("Could not include style sheet " .. options.stylesheet .. ": File not found")
                end
                local style = f:read("*a")
                f:close()
                header = fill(header, '<link rel="stylesheet" type="text/css" href="STYLESHEET" />',
                    "<style type=\"text/css\"><!--\n" .. style .. "\n--></style>")
            else
                header = fill(header, "STYLESHEET", options.stylesheet)
            end
            header = fill(header, "CHARSET", options.charset)
        end
        local footer = "</body></html>"
        if options.footer then
            local f = io.open(options.footer) or error("Could not open file: " .. options.footer)
            footer = f:read("*a")
            f:close()
        end
        return header .. s .. footer
    end

    -- Generate output path name from input path name given options.
    local function outpath(path, options)
        if options.append then return path .. ".html" end
        -- "x.html.something" -> "x.html"
        local m = path:match("^(.+%.html)[^/\\]+$") if m then return m end
        -- "x.ext" -> "x.html", unless that would be the input file itself
        m = path:match("^(.+%.)[^/\\]*$") if m and path ~= m .. "html" then return m .. "html" end
        return path .. ".html"
    end

    -- Default commandline options
    local options = {
        wrap_header = true,
        header = nil,
        footer = nil,
        charset = "utf-8",
        title = nil,
        stylesheet = "default.css",
        inline_style = false,
        append = false
    }
    local help = [[
Usage: markdown.lua [OPTION] [FILE]
Runs the markdown text markup to HTML converter on each file specified on the
command line. If no files are specified, runs on standard input.

No header:
-n, --no-wrap Don't wrap the output in <html>... tags.
Custom header:
-e, --header FILE Use content of FILE for header.
-f, --footer FILE Use content of FILE for footer.
Generated header:
-c, --charset SET Specifies charset (default utf-8).
-i, --title TITLE Specifies title (default from first <h1> tag).
-s, --style STYLE Specifies style sheet file (default default.css).
-l, --inline-style Include the style sheet file inline in the header.
Generated files:
-a, --append Append .html extension (instead of replacing).
Other options:
-h, --help Print this help text.
-t, --test Run the unit tests.
]]

    -- Cleared as soon as a file is converted or --help/--test is handled.
    local run_stdin = true
    local op = OptionParser:new()
    op:flag("n", "no-wrap", function() options.wrap_header = false end)
    op:param("e", "header", function(x) options.header = x end)
    op:param("f", "footer", function(x) options.footer = x end)
    op:param("c", "charset", function(x) options.charset = x end)
    op:param("i", "title", function(x) options.title = x end)
    op:param("s", "style", function(x) options.stylesheet = x end)
    op:flag("l", "inline-style", function() options.inline_style = true end)
    op:flag("a", "append", function() options.append = true end)
    op:flag("t", "test", function()
        -- The test script is expected beside this one. Only the trailing file
        -- name is rewritten, and the dot is matched literally.
        local n = arg[0]:gsub("markdown%.lua$", "markdown-tests.lua")
        local f = io.open(n)
        if f then
            f:close() dofile(n)
        else
            error("Cannot find markdown-tests.lua")
        end
        run_stdin = false
    end)
    op:flag("h", "help", function() print(help) run_stdin = false end)
    -- Files are converted as they are encountered, so only the options that
    -- precede a file name on the command line apply to it.
    op:arg(function(path)
        local file = io.open(path) or error("Could not open file: " .. path)
        local s = file:read("*a")
        file:close()
        s = run(s, options)
        file = io.open(outpath(path, options), "w") or error("Could not open output file: " .. outpath(path, options))
        file:write(s)
        file:close()
        run_stdin = false
    end)

    if not op:run(arg) then
        print(help)
        run_stdin = false
    end

    if run_stdin then
        local s = io.read("*a")
        s = run(s, options)
        io.write(s)
    end
end
1353
-- If we are being run from the command-line, act accordingly; otherwise
-- (e.g. loaded through require) hand the markdown function back to the caller.
-- The type checks cover hosts that define no `arg`, or an `arg` without a
-- script name at index 0, where calling arg[0]:find would raise.
if type(arg) == "table" and type(arg[0]) == "string" and arg[0]:find("markdown%.lua$") then
    run_command_line(arg)
else
    return markdown
end

mercurial