Markdown: Doing Unicode case folding when normalizing reference link labels.

ReFreezed · ReFreezed · commit 8d333c95583a · 2021-07-06T23:11:07.000+02:00
Markdown: Prepared for more Unicode support.
diff --git a/build/preprocess.lua b/build/preprocess.lua
@@ -118,7 +118,7 @@
 
 
 
-local PP_VERSION = "1.13.1"
+local PP_VERSION = "1.13.2"
 
 local MAX_DUPLICATE_FILE_INSERTS = 1000 -- @Incomplete: Make this a parameter for processFile()/processString().
 
@@ -282,7 +282,7 @@ function printErrorTraceback(message, level)
 			tableInsertFormat(buffer, "%d:", info.currentline)
 		end
 
-		if info.name then
+		if (info.name or "") ~= "" then
 			tableInsertFormat(buffer, " in '%s'", info.name)
 		elseif info.what == "main" then
 			tableInsert(buffer, " in main chunk")
@@ -981,10 +981,8 @@ function serialize(buffer, v)
 				elseif c == quote                    then  tableInsert(buffer, [[\]]) ; tableInsert(buffer, quote) ; pos = pos+1
 
 				-- UTF-8 character.
-				elseif len == 1 and not shouldCodepointBeEscaped(cp) then  tableInsert(buffer, v:sub(pos, pos  )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
-				elseif len == 2 and not shouldCodepointBeEscaped(cp) then  tableInsert(buffer, v:sub(pos, pos+1)) ; pos = pos+2
-				elseif len == 3 and not shouldCodepointBeEscaped(cp) then  tableInsert(buffer, v:sub(pos, pos+2)) ; pos = pos+3
-				elseif len == 4 and not shouldCodepointBeEscaped(cp) then  tableInsert(buffer, v:sub(pos, pos+3)) ; pos = pos+4
+				elseif len == 1 and not shouldCodepointBeEscaped(cp) then  tableInsert(buffer, v:sub(pos, pos      )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
+				elseif len      and not shouldCodepointBeEscaped(cp) then  tableInsert(buffer, v:sub(pos, pos+len-1)) ; pos = pos+len
 
 				-- Anything else.
 				else
@@ -998,7 +996,7 @@ function serialize(buffer, v)
 			-- Minimize \nnn sequences that aren't followed by digits.
 			for _, i in ipairs(toMinimize) do
 				if not (buffer[i+1] and buffer[i+1]:find"^%d") then
-					buffer[i] = buffer[i]:gsub("0+(%d+)", "%1")
+					buffer[i] = buffer[i]:gsub("0+(%d)", "%1")
 				end
 			end
 
diff --git a/build/unicode/CaseFolding.txt b/build/unicode/CaseFolding.txt
diff --git a/build/unicode/UnicodeData.txt b/build/unicode/UnicodeData.txt
diff --git a/examples/testsite/scripts/tests/runTests.lua b/examples/testsite/scripts/tests/runTests.lua
@@ -195,7 +195,7 @@ return function()
 
 		for _, test in ipairs(tests) do
 			if not (false
-				or is(test,175,363,548) -- @Incomplete: Full Unicode support. (Hah...)
+				or is(test,363) -- @Incomplete: Complete Unicode support.
 
 				-- or test.n < 118 -- Jump to HTML block parsing tests.
 				-- or test.n < 307 -- Jump to inline parsing tests.
diff --git a/src/globals.lua2p b/src/globals.lua2p
@@ -50,6 +50,7 @@ _G.jsonLib        = require"json"
 _G.markdownLib    = require"markdown"
 _G.markdownOldLib = require"markdownOld" -- @Deprecated
 _G.tomlLib        = require"toml"
+_G.unicode        = require"unicode"
 _G.urlLib         = require"url"
 _G.utf8           = require"utf8"
 _G.xmlLib         = require"xml"
diff --git a/src/markdown.lua2p b/src/markdown.lua2p
@@ -17,7 +17,7 @@
 local markdown = {}
 
 !(
-local PATTERN_CHAR                        = "[%z\1-\127\194-\244][\128-\191]*"
+local PATTERN_CHAR                        = require"src.utf8".CHARACTER_PATTERN
 local PATTERN_LINE                        = "[^\n]*"
 local PATTERN_BLANK_LINE                  = "^[ \t]*$"
 local PATTERN_WHITESPACE_CHAR             = "[ \t\n\v\f]" -- Excluding \r.
@@ -235,7 +235,19 @@ end
 local function normalizeLinkLabel(label)
 	label = trimWhitespace(label)
 	label = label:gsub(!(PATTERN_WHITESPACE_SEQUENCE), " ")
-	label = label:lower() -- @Incomplete: Perform Unicode case fold instead of this (and possibly before trimming whitespace).
+
+	local getCodepoint = utf8.getCodepointAndLength
+	local caseFolding  = unicode.caseFolding
+
+	-- Unicode case fold, one character at a time. Characters without a mapping are left as-is.
+	label = label:gsub(!(PATTERN_CHAR), function(c)
+		local cp, len = getCodepoint(c) -- Both are nil if c isn't valid UTF-8.
+		local folded  = caseFolding[cp] -- May be nil, in which case gsub() keeps c.
+
+		-- PATTERN_CHAR also grabs surplus continuation bytes in malformed input. Keep them instead of silently dropping them.
+		return folded and folded..c:sub(len+1)
+	end)
+
 	return label
 end
 
diff --git a/src/unicode.lua2p b/src/unicode.lua2p
@@ -0,0 +1,86 @@
+--[[============================================================
+--=
+--=  Unicode data
+--=
+--=-------------------------------------------------------------
+--=
+--=  LuaWebGen - static website generator in Lua!
+--=  - Written by Marcus 'ReFreezed' Thunström
+--=  - MIT License (See LICENSE.txt)
+--=
+--============================================================]]
+
+!(
+-- Gather info about codepoints in general categories Zs and P for use in Markdown.
+----------------------------------------------------------------
+
+local gcCodepointSet = {}
+
+for line in io.lines"build/unicode/UnicodeData.txt" do
+	--[[
+		Fields:
+		 1.  Codepoint
+		 2.  Name
+		 3.  General_Category
+		 4.  Canonical_Combining_Class
+		 5.  Bidi_Class
+		 6.  Decomposition_Type
+		 7.  Decomposition_Mapping
+		 8.  Numeric_Type
+		 9.  Numeric_Value
+		 10. Bidi_Mirrored
+		 11. Unicode_1_Name (Obsolete as of 6.2.0)
+		 12. ISO_Comment (Obsolete as of 5.2.0; Deprecated and Stabilized as of 6.0.0)
+		 13. Simple_Uppercase_Mapping
+		 14. Simple_Lowercase_Mapping
+		 15. Simple_Titlecase_Mapping
+	]]
+	local cp, gc = line:match"^(%x+);[^;]*;([^;]*)"
+
+	-- Lines that aren't data records give gc=nil and are skipped (instead of raising an error).
+	-- Every category starting with P (Pc, Pd, Ps, Pe, Pi, Pf, Po) is grouped under the single key "P".
+	gc = gc and ((gc == "Zs" and gc) or gc:match"^P")
+
+	if gc then
+		cp = tonumber(cp, 16)
+
+		gcCodepointSet[gc]     = gcCodepointSet[gc] or {}
+		gcCodepointSet[gc][cp] = 1
+	end
+end
+-- print(toLua(gcCodepointSet))
+
+-- Gather info about case folding for use in Markdown.
+----------------------------------------------------------------
+
+local utf8        = require"src.utf8"
+local cpToString  = utf8.codepointToString
+local caseFolding = {}
+
+for line in io.lines"build/unicode/CaseFolding.txt" do
+	local cpFromStr, status, cpsToStr = line:match"^(%x+); ([CFST]); ([%x ]+)"
+
+	-- Only C (common) and F (full) mappings are used. S (simple) and T (Turkic) mappings are ignored.
+	-- Lines that don't match the pattern (comments, blank lines) leave status as nil and are skipped.
+	if status == "C" or status == "F" then
+		local cpFrom  = tonumber(cpFromStr, 16)
+		local charsTo = {}
+
+		for cpToStr in cpsToStr:gmatch"%x+" do
+			table.insert(charsTo, cpToString(tonumber(cpToStr, 16)))
+		end
+
+		caseFolding[cpFrom] = table.concat(charsTo)
+	end
+end
+-- print(toLua(caseFolding))
+
+----------------------------------------------------------------
+
+-- os.exit(2) -- DEBUG
+)
+
+return {
+	generalCategoryCodepointSet = !(gcCodepointSet), -- { P|Zs={[cp]=1,...}, ... }
+	caseFolding                 = !(caseFolding   ), -- { [fromCp]=toString, ... }
+}
diff --git a/src/utf8.lua b/src/utf8.lua
@@ -10,18 +10,22 @@
 --=
 --==============================================================
 
+	CHARACTER_PATTERN
+
 	codepointToString
 	getCharacterLength, getCodepointAndLength
 	getLength
 	getStartOfCharacter
 
 --============================================================]]
 
-local utf8 = {}
+local utf8 = {
+	CHARACTER_PATTERN = "[%z\1-\127\194-\244][\128-\191]*", -- @Doc
+}
 
-local byteToString = string.char
-local getByte      = string.byte
-local tableInsert  = table.insert
+local stringByte  = string.byte
+local stringChar  = string.char
+local tableInsert = table.insert
 
 
 
@@ -35,10 +39,10 @@ function utf8.codepointToString(cp, buffer)
 	if cp >= 128 then
 		-- void
 	elseif buffer then
-		tableInsert(buffer, byteToString(cp))
+		tableInsert(buffer, stringChar(cp))
 		return
 	else
-		return byteToString(cp)
+		return stringChar(cp)
 	end
 
 	local suffix = cp % 64
@@ -48,11 +52,11 @@ function utf8.codepointToString(cp, buffer)
 	if cp >= 32 then
 		-- void
 	elseif buffer then
-		tableInsert(buffer, byteToString(192+cp))
-		tableInsert(buffer, byteToString(c4))
+		tableInsert(buffer, stringChar(192+cp))
+		tableInsert(buffer, stringChar(c4))
 		return
 	else
-		return byteToString(192+cp, c4) -- @Speed @Memory
+		return stringChar(192+cp, c4) -- @Speed @Memory
 	end
 
 	suffix   = cp % 64
@@ -62,25 +66,25 @@ function utf8.codepointToString(cp, buffer)
 	if cp >= 16 then
 		-- void
 	elseif buffer then
-		tableInsert(buffer, byteToString(224+cp))
-		tableInsert(buffer, byteToString(c3))
-		tableInsert(buffer, byteToString(c4))
+		tableInsert(buffer, stringChar(224+cp))
+		tableInsert(buffer, stringChar(c3))
+		tableInsert(buffer, stringChar(c4))
 		return
 	else
-		return byteToString(224+cp, c3, c4) -- @Speed @Memory
+		return stringChar(224+cp, c3, c4) -- @Speed @Memory
 	end
 
 	suffix = cp % 64
 	cp     = (cp - suffix) / 64
 
 	if buffer then
-		tableInsert(buffer, byteToString(240+cp))
-		tableInsert(buffer, byteToString(128+suffix))
-		tableInsert(buffer, byteToString(c3))
-		tableInsert(buffer, byteToString(c4))
+		tableInsert(buffer, stringChar(240+cp))
+		tableInsert(buffer, stringChar(128+suffix))
+		tableInsert(buffer, stringChar(c3))
+		tableInsert(buffer, stringChar(c4))
 		return
 	else
-		return byteToString(240+cp, 128+suffix, c3, c4) -- @Speed @Memory
+		return stringChar(240+cp, 128+suffix, c3, c4) -- @Speed @Memory
 	end
 end
 
@@ -90,7 +94,7 @@ end
 -- Returns nil if the string is invalid at the position.
 function utf8.getCharacterLength(s, pos)
 	pos                  = pos or 1
-	local b1, b2, b3, b4 = getByte(s, pos, pos+3)
+	local b1, b2, b3, b4 = stringByte(s, pos, pos+3)
 
 	if b1 <= 127 then
 		return 1
@@ -129,10 +133,10 @@ function utf8.getCodepointAndLength(s, pos)
 	if not len then  return nil  end
 
 	-- 2^6=64, 2^12=4096, 2^18=262144
-	if len == 1 then                                                  return                                          getByte(s, pos), len  end
-	if len == 2 then  local b1, b2         = getByte(s, pos, pos+1) ; return                                   (b1-192)*64 + (b2-128), len  end
-	if len == 3 then  local b1, b2, b3     = getByte(s, pos, pos+2) ; return                   (b1-224)*4096 + (b2-128)*64 + (b3-128), len  end
-	do                local b1, b2, b3, b4 = getByte(s, pos, pos+3) ; return (b1-240)*262144 + (b2-128)*4096 + (b3-128)*64 + (b4-128), len  end
+	if len == 1 then                                                     return                                       stringByte(s, pos), len  end
+	if len == 2 then  local b1, b2         = stringByte(s, pos, pos+1) ; return                                   (b1-192)*64 + (b2-128), len  end
+	if len == 3 then  local b1, b2, b3     = stringByte(s, pos, pos+2) ; return                   (b1-224)*4096 + (b2-128)*64 + (b3-128), len  end
+	do                local b1, b2, b3, b4 = stringByte(s, pos, pos+3) ; return (b1-240)*262144 + (b2-128)*4096 + (b3-128)*64 + (b4-128), len  end
 end
 
 
@@ -160,7 +164,7 @@ end
 -- Returns nil if the string is invalid at the position.
 function utf8.getStartOfCharacter(s, pos)
 	for pos = pos, math.max(pos-3, 1), -1 do
-		local b = getByte(s, pos)
+		local b = stringByte(s, pos)
 
 		if b <= 127 or (b >= 194 and b <= 244) then
 			-- @Robustness: Verify that the following bytes are valid.