Skip to content

Commit 8d333c9

Browse files
committed
Markdown: Doing Unicode case folding when normalizing reference link labels.
Markdown: Prepared for more Unicode support.
1 parent 9d27e9c commit 8d333c9

File tree

8 files changed

+35506
-34
lines changed

8 files changed

+35506
-34
lines changed

build/preprocess.lua

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,7 @@
118118

119119

120120

121-
local PP_VERSION = "1.13.1"
121+
local PP_VERSION = "1.13.2"
122122

123123
local MAX_DUPLICATE_FILE_INSERTS = 1000 -- @Incomplete: Make this a parameter for processFile()/processString().
124124

@@ -282,7 +282,7 @@ function printErrorTraceback(message, level)
282282
tableInsertFormat(buffer, "%d:", info.currentline)
283283
end
284284

285-
if info.name then
285+
if (info.name or "") ~= "" then
286286
tableInsertFormat(buffer, " in '%s'", info.name)
287287
elseif info.what == "main" then
288288
tableInsert(buffer, " in main chunk")
@@ -981,10 +981,8 @@ function serialize(buffer, v)
981981
elseif c == quote then tableInsert(buffer, [[\]]) ; tableInsert(buffer, quote) ; pos = pos+1
982982

983983
-- UTF-8 character.
984-
elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
985-
elseif len == 2 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+1)) ; pos = pos+2
986-
elseif len == 3 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+2)) ; pos = pos+3
987-
elseif len == 4 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+3)) ; pos = pos+4
984+
elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
985+
elseif len and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+len-1)) ; pos = pos+len
988986

989987
-- Anything else.
990988
else
@@ -998,7 +996,7 @@ function serialize(buffer, v)
998996
-- Minimize \nnn sequences that aren't followed by digits.
999997
for _, i in ipairs(toMinimize) do
1000998
if not (buffer[i+1] and buffer[i+1]:find"^%d") then
1001-
buffer[i] = buffer[i]:gsub("0+(%d+)", "%1")
999+
buffer[i] = buffer[i]:gsub("0+(%d)", "%1")
10021000
end
10031001
end
10041002

build/unicode/CaseFolding.txt

Lines changed: 1584 additions & 0 deletions
Large diffs are not rendered by default.

build/unicode/UnicodeData.txt

Lines changed: 33797 additions & 0 deletions
Large diffs are not rendered by default.

examples/testsite/scripts/tests/runTests.lua

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ return function()
195195

196196
for _, test in ipairs(tests) do
197197
if not (false
198-
or is(test,175,363,548) -- @Incomplete: Full Unicode support. (Hah...)
198+
or is(test,363) -- @Incomplete: Complete Unicode support.
199199

200200
-- or test.n < 118 -- Jump to HTML block parsing tests.
201201
-- or test.n < 307 -- Jump to inline parsing tests.

src/globals.lua2p

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ _G.jsonLib = require"json"
5050
_G.markdownLib = require"markdown"
5151
_G.markdownOldLib = require"markdownOld" -- @Deprecated
5252
_G.tomlLib = require"toml"
53+
_G.unicode = require"unicode"
5354
_G.urlLib = require"url"
5455
_G.utf8 = require"utf8"
5556
_G.xmlLib = require"xml"

src/markdown.lua2p

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
local markdown = {}
1818

1919
!(
20-
local PATTERN_CHAR = "[%z\1-\127\194-\244][\128-\191]*"
20+
local PATTERN_CHAR = require"src.utf8".CHARACTER_PATTERN
2121
local PATTERN_LINE = "[^\n]*"
2222
local PATTERN_BLANK_LINE = "^[ \t]*$"
2323
local PATTERN_WHITESPACE_CHAR = "[ \t\n\v\f]" -- Excluding \r.
@@ -235,7 +235,14 @@ end
235235
local function normalizeLinkLabel(label)
236236
label = trimWhitespace(label)
237237
label = label:gsub(!(PATTERN_WHITESPACE_SEQUENCE), " ")
238-
label = label:lower() -- @Incomplete: Perform Unicode case fold instead of this (and possibly before trimming whitespace).
238+
239+
local getCodepoint = utf8.getCodepointAndLength
240+
local caseFolding = unicode.caseFolding
241+
242+
label = label:gsub(!(PATTERN_CHAR), function(c)
243+
return caseFolding[getCodepoint(c)] -- May be nil.
244+
end)
245+
239246
return label
240247
end
241248

src/unicode.lua2p

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
--[[============================================================
2+
--=
3+
--= Unicode data
4+
--=
5+
--=-------------------------------------------------------------
6+
--=
7+
--= LuaWebGen - static website generator in Lua!
8+
--= - Written by Marcus 'ReFreezed' Thunström
9+
--= - MIT License (See LICENSE.txt)
10+
--=
11+
--============================================================]]
12+
13+
!(
14+
-- Gather info about codepoints in general categories Zs and P for use in Markdown.
15+
----------------------------------------------------------------
16+
17+
local gcCodepointSet = {}
18+
19+
for line in io.lines"build/unicode/UnicodeData.txt" do
20+
--[[
21+
Fields:
22+
1. Codepoint
23+
2. Name
24+
3. General_Category
25+
4. Canonical_Combining_Class
26+
5. Bidi_Class
27+
6. Decomposition_Type
28+
7. Decomposition_Mapping
29+
8. Numeric_Type
30+
9. Numeric_Value
31+
10. Bidi_Mirrored
32+
11. Unicode_1_Name (Obsolete as of 6.2.0)
33+
12. ISO_Comment (Obsolete as of 5.2.0; Deprecated and Stabilized as of 6.0.0)
34+
13. Simple_Uppercase_Mapping
35+
14. Simple_Lowercase_Mapping
36+
15. Simple_Titlecase_Mapping
37+
]]
38+
local cp, gc = line:match"^(%x+);[^;]*;([^;]*)"
39+
gc = (gc == "Zs" and gc) or (gc:match"P")
40+
41+
if gc then
42+
cp = tonumber(cp, 16)
43+
44+
gcCodepointSet[gc] = gcCodepointSet[gc] or {}
45+
gcCodepointSet[gc][cp] = 1
46+
end
47+
end
48+
-- print(toLua(gcCodepointSet))
49+
50+
-- Gather info about case folding for use in Markdown.
51+
----------------------------------------------------------------
52+
53+
local utf8 = require"src.utf8"
54+
local cpToString = utf8.codepointToString
55+
local caseFolding = {}
56+
57+
for line in io.lines"build/unicode/CaseFolding.txt" do
58+
local cpFromStr, status, cpsToStr = line:match"^(%x+); ([CFST]); ([%x ]+)"
59+
60+
if status == "C" or status == "F" then
61+
local cpFrom = tonumber(cpFromStr, 16)
62+
local charsTo = {}
63+
64+
for cpToStr in cpsToStr:gmatch"%x+" do
65+
table.insert(charsTo, cpToString(tonumber(cpToStr, 16)))
66+
end
67+
68+
caseFolding[cpFrom] = table.concat(charsTo)
69+
end
70+
end
71+
-- print(toLua(caseFolding))
72+
73+
----------------------------------------------------------------
74+
75+
-- os.exit(2) -- DEBUG
76+
)
77+
78+
return {
79+
generalCategoryCodepointSet = !(gcCodepointSet), -- { P|Zs={[cp]=1,...}, ... }
80+
caseFolding = !(caseFolding ), -- { [fromCp]=toString, ... }
81+
}

src/utf8.lua2p renamed to src/utf8.lua

Lines changed: 28 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -10,18 +10,22 @@
1010
--=
1111
--==============================================================
1212
13+
CHARACTER_PATTERN
14+
1315
codepointToString
1416
getCharacterLength, getCodepointAndLength
1517
getLength
1618
getStartOfCharacter
1719
1820
--============================================================]]
1921

20-
local utf8 = {}
22+
local utf8 = {
23+
CHARACTER_PATTERN = "[%z\1-\127\194-\244][\128-\191]*", -- @Doc
24+
}
2125

22-
local byteToString = string.char
23-
local getByte = string.byte
24-
local tableInsert = table.insert
26+
local stringByte = string.byte
27+
local stringChar = string.char
28+
local tableInsert = table.insert
2529

2630

2731

@@ -35,10 +39,10 @@ function utf8.codepointToString(cp, buffer)
3539
if cp >= 128 then
3640
-- void
3741
elseif buffer then
38-
tableInsert(buffer, byteToString(cp))
42+
tableInsert(buffer, stringChar(cp))
3943
return
4044
else
41-
return byteToString(cp)
45+
return stringChar(cp)
4246
end
4347

4448
local suffix = cp % 64
@@ -48,11 +52,11 @@ function utf8.codepointToString(cp, buffer)
4852
if cp >= 32 then
4953
-- void
5054
elseif buffer then
51-
tableInsert(buffer, byteToString(192+cp))
52-
tableInsert(buffer, byteToString(c4))
55+
tableInsert(buffer, stringChar(192+cp))
56+
tableInsert(buffer, stringChar(c4))
5357
return
5458
else
55-
return byteToString(192+cp, c4) -- @Speed @Memory
59+
return stringChar(192+cp, c4) -- @Speed @Memory
5660
end
5761

5862
suffix = cp % 64
@@ -62,25 +66,25 @@ function utf8.codepointToString(cp, buffer)
6266
if cp >= 16 then
6367
-- void
6468
elseif buffer then
65-
tableInsert(buffer, byteToString(224+cp))
66-
tableInsert(buffer, byteToString(c3))
67-
tableInsert(buffer, byteToString(c4))
69+
tableInsert(buffer, stringChar(224+cp))
70+
tableInsert(buffer, stringChar(c3))
71+
tableInsert(buffer, stringChar(c4))
6872
return
6973
else
70-
return byteToString(224+cp, c3, c4) -- @Speed @Memory
74+
return stringChar(224+cp, c3, c4) -- @Speed @Memory
7175
end
7276

7377
suffix = cp % 64
7478
cp = (cp - suffix) / 64
7579

7680
if buffer then
77-
tableInsert(buffer, byteToString(240+cp))
78-
tableInsert(buffer, byteToString(128+suffix))
79-
tableInsert(buffer, byteToString(c3))
80-
tableInsert(buffer, byteToString(c4))
81+
tableInsert(buffer, stringChar(240+cp))
82+
tableInsert(buffer, stringChar(128+suffix))
83+
tableInsert(buffer, stringChar(c3))
84+
tableInsert(buffer, stringChar(c4))
8185
return
8286
else
83-
return byteToString(240+cp, 128+suffix, c3, c4) -- @Speed @Memory
87+
return stringChar(240+cp, 128+suffix, c3, c4) -- @Speed @Memory
8488
end
8589
end
8690

@@ -90,7 +94,7 @@ end
9094
-- Returns nil if the string is invalid at the position.
9195
function utf8.getCharacterLength(s, pos)
9296
pos = pos or 1
93-
local b1, b2, b3, b4 = getByte(s, pos, pos+3)
97+
local b1, b2, b3, b4 = stringByte(s, pos, pos+3)
9498

9599
if b1 <= 127 then
96100
return 1
@@ -129,10 +133,10 @@ function utf8.getCodepointAndLength(s, pos)
129133
if not len then return nil end
130134

131135
-- 2^6=64, 2^12=4096, 2^18=262144
132-
if len == 1 then return getByte(s, pos), len end
133-
if len == 2 then local b1, b2 = getByte(s, pos, pos+1) ; return (b1-192)*64 + (b2-128), len end
134-
if len == 3 then local b1, b2, b3 = getByte(s, pos, pos+2) ; return (b1-224)*4096 + (b2-128)*64 + (b3-128), len end
135-
do local b1, b2, b3, b4 = getByte(s, pos, pos+3) ; return (b1-240)*262144 + (b2-128)*4096 + (b3-128)*64 + (b4-128), len end
136+
if len == 1 then return stringByte(s, pos), len end
137+
if len == 2 then local b1, b2 = stringByte(s, pos, pos+1) ; return (b1-192)*64 + (b2-128), len end
138+
if len == 3 then local b1, b2, b3 = stringByte(s, pos, pos+2) ; return (b1-224)*4096 + (b2-128)*64 + (b3-128), len end
139+
do local b1, b2, b3, b4 = stringByte(s, pos, pos+3) ; return (b1-240)*262144 + (b2-128)*4096 + (b3-128)*64 + (b4-128), len end
136140
end
137141

138142

@@ -160,7 +164,7 @@ end
160164
-- Returns nil if the string is invalid at the position.
161165
function utf8.getStartOfCharacter(s, pos)
162166
for pos = pos, math.max(pos-3, 1), -1 do
163-
local b = getByte(s, pos)
167+
local b = stringByte(s, pos)
164168

165169
if b <= 127 or (b >= 194 and b <= 244) then
166170
-- @Robustness: Verify that the following bytes are valid.

0 commit comments

Comments
 (0)