Skip to content

Commit 4fcd76f

Browse files
committed
Markdown: Complete Unicode support where it is needed.
1 parent 8d333c9 commit 4fcd76f

File tree

2 files changed

+32
-32
lines changed

2 files changed

+32
-32
lines changed

examples/testsite/scripts/tests/runTests.lua

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -194,14 +194,12 @@ return function()
194194
timerStart("markdown")
195195

196196
for _, test in ipairs(tests) do
197-
if not (false
198-
or is(test,363) -- @Incomplete: Complete Unicode support.
199-
200-
-- or test.n < 118 -- Jump to HTML block parsing tests.
201-
-- or test.n < 307 -- Jump to inline parsing tests.
202-
-- or test.n < 360 -- Jump to emphasis parsing tests.
203-
-- or test.n < 493 -- Jump to link parsing tests.
204-
) then
197+
if true
198+
-- and test.n >= 118 -- Jump to HTML block parsing tests.
199+
-- and test.n >= 307 -- Jump to inline parsing tests.
200+
-- and test.n >= 360 -- Jump to emphasis parsing tests.
201+
-- and test.n >= 493 -- Jump to link parsing tests.
202+
then
205203
-- print("Test#"..test.n)
206204

207205
local html = (markdown(test.input)

src/markdown.lua2p

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,15 @@
1717
local markdown = {}
1818

1919
!(
20-
local PATTERN_CHAR = require"src.utf8".CHARACTER_PATTERN
21-
local PATTERN_LINE = "[^\n]*"
22-
local PATTERN_BLANK_LINE = "^[ \t]*$"
23-
local PATTERN_WHITESPACE_CHAR = "[ \t\n\v\f]" -- Excluding \r.
24-
local PATTERN_WHITESPACE_SEQUENCE = PATTERN_WHITESPACE_CHAR.."+"
25-
local PATTERN_UNICODE_WHITESPACE_CHAR = PATTERN_WHITESPACE_CHAR -- @Incomplete: Include Unicode general category Zs.
26-
local PATTERN_NON_WHITESPACE_CHAR = "[^ \t\n\v\f]"
27-
local PATTERN_NON_UNICODE_WHITESPACE_CHAR = "[^ \t\n\v\f]" -- @Incomplete: Include Unicode general category Zs.
28-
local PATTERN_ASCII_PUNCTUATION_CHAR = "[\33-\47\58-\64\91-\96\123-\126]"
29-
local PATTERN_PUNCTUATION_CHAR = PATTERN_ASCII_PUNCTUATION_CHAR -- @Incomplete: Include Unicode general category P (Pc+Pd+Pe+Pf+Pi+Po+Ps).
20+
local WHITESPACE_CHARS = " \t\n\v\f" -- Excluding \r.
21+
22+
local PATTERN_CHAR = require"src.utf8".CHARACTER_PATTERN
23+
local PATTERN_LINE = "[^\n]*"
24+
local PATTERN_BLANK_LINE = "^[ \t]*$"
25+
local PATTERN_WHITESPACE_CHAR = "["..WHITESPACE_CHARS.."]"
26+
local PATTERN_WHITESPACE_SEQUENCE = PATTERN_WHITESPACE_CHAR.."+"
27+
local PATTERN_NON_WHITESPACE_CHAR = "[^"..WHITESPACE_CHARS.."]"
28+
local PATTERN_ASCII_PUNCTUATION_CHAR = "[\33-\47\58-\64\91-\96\123-\126]" -- !"#$%&'()*+,-./ :;<=>?@ [\]^_` {|}~
3029

3130
local REPLACEMENT_CHARACTER = "\239\191\189" -- U+FFFD
3231

@@ -197,7 +196,7 @@ local function isHtmlBlockStartByRule7(line, col, pos)
197196
pos = (
198197
line:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR..'*"[^"]*"()'), pos) or
199198
line:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR.."*'[^']*'()"), pos) or
200-
line:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR.."*[^ \t\n\v\f\"'=<>`]+()"), pos) or
199+
line:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR.."*[^"..WHITESPACE_CHARS.."\"'=<>`]+()"), pos) or
201200
pos
202201
)
203202
end
@@ -239,8 +238,8 @@ local function normalizeLinkLabel(label)
239238
local getCodepoint = utf8.getCodepointAndLength
240239
local caseFolding = unicode.caseFolding
241240

242-
label = label:gsub(!(PATTERN_CHAR), function(c)
243-
return caseFolding[getCodepoint(c)] -- May be nil.
241+
label = label:gsub(!("()"..PATTERN_CHAR), function(pos)
242+
return caseFolding[getCodepoint(label, pos)] -- May be nil.
244243
end)
245244

246245
return label
@@ -612,7 +611,7 @@ local function getEndOfHtmlTag(s, pos)
612611
pos = (
613612
s:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR..'*"[^"]*"()'), pos) or
614613
s:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR.."*'[^']*'()"), pos) or
615-
s:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR.."*[^ \t\n\v\f\"'=<>`]+()"), pos) or
614+
s:match(!("^"..PATTERN_WHITESPACE_CHAR.."*="..PATTERN_WHITESPACE_CHAR.."*[^"..WHITESPACE_CHARS.."\"'=<>`]+()"), pos) or
616615
pos
617616
)
618617
end
@@ -829,7 +828,7 @@ local function parseExtendedEmailAutolink(s, pos)
829828
local uri, text, posNext = parseEmailAutolinkContent(s, pos)
830829
if not uri then return nil end
831830

832-
if s:find("^[^ \t\n\v\f.]", posNext) then return nil end
831+
if s:find(!("^[^"..WHITESPACE_CHARS..".]"), posNext) then return nil end
833832

834833
return uri, text, posNext
835834
end
@@ -1265,7 +1264,7 @@ local function parseInline(parentEl, s, linkReferenceDefinitions)
12651264

12661265
-- Autolink. ['autolink' extension]
12671266
if b >= !(BYTE"a") and b <= !(BYTE"z") then
1268-
if pos == 1 or @@IS_CHAR(s, pos-1, " \t\n\v\f*_~(") then
1267+
if pos == 1 or @@IS_CHAR(s, pos-1, " \t\n\v\f*_~(") then -- WHITESPACE_CHARS
12691268
local uri, text, posNext = parseExtendedWwwAutolink (s, pos)
12701269
if not uri then uri, text, posNext = parseExtendedUrlAutolink (s, pos) end
12711270
if not uri then uri, text, posNext = parseExtendedEmailAutolink(s, pos) end
@@ -1341,13 +1340,16 @@ local function parseInline(parentEl, s, linkReferenceDefinitions)
13411340
end
13421341

13431342
local runType = s:sub(pos, pos)
1344-
local posPrev = pos - 1 --utf8.getStartOfCharacter(s, pos-1)
1343+
local posPrev = (pos >= 2) and utf8.getStartOfCharacter(s, pos-1) or 0
13451344
local posNext = pos + #delimRunChars
13461345

1347-
local precededByText = s:find(!("^"..PATTERN_NON_UNICODE_WHITESPACE_CHAR), posPrev) ~= nil
1348-
local followedByText = s:find(!("^"..PATTERN_NON_UNICODE_WHITESPACE_CHAR), posNext) ~= nil
1349-
local precededByPunct = s:find(!("^"..PATTERN_PUNCTUATION_CHAR), posPrev) ~= nil
1350-
local followedByPunct = s:find(!("^"..PATTERN_PUNCTUATION_CHAR), posNext) ~= nil
1346+
local cpPrev = (posPrev >= 1 ) and utf8.getCodepointAndLength(s, posPrev) or 0
1347+
local cpNext = (posNext <= #s) and utf8.getCodepointAndLength(s, posNext) or 0
1348+
1349+
local precededByText = (cpPrev > 0) and not (unicode.generalCategoryCodepointSet.Zs[cpPrev] or @@CONSTSET{ getStringBytes"\t\n\f" }[cpPrev])
1350+
local followedByText = (cpNext > 0) and not (unicode.generalCategoryCodepointSet.Zs[cpNext] or @@CONSTSET{ getStringBytes"\t\n\f" }[cpNext])
1351+
local precededByPunct = unicode.generalCategoryCodepointSet.P[cpPrev] ~= nil
1352+
local followedByPunct = unicode.generalCategoryCodepointSet.P[cpNext] ~= nil
13511353

13521354
local tok = {
13531355
position1 = pos,
@@ -2072,9 +2074,9 @@ function markdown.parse(s)
20722074

20732075
-- [Leaf] HTML block.
20742076
elseif -- §1
2075-
getBlockStart(line, "^<[Ss][Cc][Rr][Ii][Pp][Tt]", col, pos) and not getBlockStart(line, !("^<......".."[^ \t\n\v\f>]"), col, pos) or
2076-
getBlockStart(line, "^<[Ss][Tt][Yy][Ll][Ee]", col, pos) and not getBlockStart(line, !("^<....." .."[^ \t\n\v\f>]"), col, pos) or
2077-
getBlockStart(line, "^<[Pp][Rr][Ee]", col, pos) and not getBlockStart(line, !("^<..." .."[^ \t\n\v\f>]"), col, pos)
2077+
getBlockStart(line, "^<[Ss][Cc][Rr][Ii][Pp][Tt]", col, pos) and not getBlockStart(line, !("^<......".."[^"..WHITESPACE_CHARS..">]"), col, pos) or
2078+
getBlockStart(line, "^<[Ss][Tt][Yy][Ll][Ee]", col, pos) and not getBlockStart(line, !("^<....." .."[^"..WHITESPACE_CHARS..">]"), col, pos) or
2079+
getBlockStart(line, "^<[Pp][Rr][Ee]", col, pos) and not getBlockStart(line, !("^<..." .."[^"..WHITESPACE_CHARS..">]"), col, pos)
20782080
then
20792081
!local HTML_ELEMENT_PRE = PRINT`"html"`..`
20802082
local htmlEl = xmlLib.element("\0", line:sub(pos)) -- Note: The name of HTML blocks is NULL so we don't collide with user data.

0 commit comments

Comments
 (0)