Skip to content

Commit 237b087

Browse files
committed
Updated Unicode data.
1 parent d489c33 commit 237b087

File tree

5 files changed

+284
-24
lines changed

5 files changed

+284
-24
lines changed

Changelog.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ LuaPreprocess
33

44
v1.13.1 (2021-05-16)
55
Library:
6-
- Dual code now supports multiple identifiers: !!x, y = ...
6+
- Dual code now supports multiple assignment targets: !!x, y = ...
77
- Some non-ASCII characters in serialized strings look nicer.
88
- Added params.fastStrings.
99
- Fixed backtick strings not working in macros.

misc/generateStringEscapeSequenceInfo.lua

Lines changed: 266 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
1+
--
12
-- Unicode characters not to encode with escape sequences in strings.
2-
-- https://en.wikipedia.org/wiki/List_of_Unicode_characters
3+
-- Updated: 2021-05-17
4+
--
35

6+
-- U+1234 = include
7+
-- !U+1234 = exclude
8+
-- U+1x3x  = each 'x' is a wildcard that stands for every hex digit from 0 to F
49
local codepointsStr = [[
10+
11+
Source: https://en.wikipedia.org/wiki/List_of_Unicode_characters
12+
----------------------------------------------------------------
13+
514
Basic Latin
615
U+0020 (space)
716
U+0021 !
@@ -974,17 +983,268 @@ U+203C ‼
974983
U+203E ‾
975984
U+2044 ⁄
976985
U+204A ⁊
986+
987+
Source: https://en.wikipedia.org/wiki/Unicode_block
988+
----------------------------------------------------------------
989+
990+
General Punctuation
991+
U+201x !U+2011
992+
U+2020 U+2021 U+2022 U+2023 U+2024 U+2025 U+2026 U+2027
993+
U+203x
994+
U+204x
995+
U+205x !U+205F
996+
997+
Superscripts and Subscripts
998+
U+207x !U+2072 !U+2073
999+
U+208x !U+208F
1000+
U+209x !U+209D !U+209E !U+209F
1001+
1002+
Currency Symbols
1003+
U+20Ax
1004+
U+20Bx
1005+
1006+
Letterlike Symbols
1007+
U+210x
1008+
U+211x
1009+
U+212x
1010+
U+213x
1011+
U+214x
1012+
1013+
Number Forms
1014+
U+215x
1015+
U+216x
1016+
U+217x
1017+
U+218x !U+218C !U+218D !U+218E !U+218F
1018+
1019+
Arrows
1020+
U+219x
1021+
U+21Ax
1022+
U+21Bx
1023+
U+21Cx
1024+
U+21Dx
1025+
U+21Ex
1026+
U+21Fx
1027+
1028+
Mathematical Operators
1029+
U+220x
1030+
U+221x
1031+
U+222x
1032+
U+223x
1033+
U+224x
1034+
U+225x
1035+
U+226x
1036+
U+227x
1037+
U+228x
1038+
U+229x
1039+
U+22Ax
1040+
U+22Bx
1041+
U+22Cx
1042+
U+22Dx
1043+
U+22Ex
1044+
U+22Fx
1045+
1046+
Miscellaneous Technical
1047+
U+230x
1048+
U+231x
1049+
U+232x
1050+
U+233x
1051+
U+234x
1052+
U+235x
1053+
U+236x
1054+
U+237x
1055+
U+238x
1056+
U+239x
1057+
U+23Ax
1058+
U+23Bx
1059+
U+23Cx
1060+
U+23Dx
1061+
U+23Ex
1062+
U+23Fx
1063+
1064+
Control Pictures
1065+
U+240x
1066+
U+241x
1067+
U+2420 U+2421 U+2422 U+2423 U+2424 U+2425 U+2426
1068+
1069+
Enclosed Alphanumerics
1070+
U+246x
1071+
U+247x
1072+
U+248x
1073+
U+249x
1074+
U+24Ax
1075+
U+24Bx
1076+
U+24Cx
1077+
U+24Dx
1078+
U+24Ex
1079+
U+24Fx
1080+
1081+
Box Drawing
1082+
U+250x
1083+
U+251x
1084+
U+252x
1085+
U+253x
1086+
U+254x
1087+
U+255x
1088+
U+256x
1089+
U+257x
1090+
1091+
Block Elements
1092+
U+258x
1093+
U+259x
1094+
1095+
Geometric Shapes
1096+
U+25Ax
1097+
U+25Bx
1098+
U+25Cx
1099+
U+25Dx
1100+
U+25Ex
1101+
U+25Fx
1102+
1103+
Miscellaneous Symbols
1104+
U+260x
1105+
U+261x
1106+
U+262x
1107+
U+263x
1108+
U+264x
1109+
U+265x
1110+
U+266x
1111+
U+267x
1112+
U+268x
1113+
U+269x
1114+
U+26Ax
1115+
U+26Bx
1116+
U+26Cx
1117+
U+26Dx
1118+
U+26Ex
1119+
U+26Fx
1120+
1121+
Dingbats
1122+
U+270x
1123+
U+271x
1124+
U+272x
1125+
U+273x
1126+
U+274x
1127+
U+275x
1128+
U+276x
1129+
U+277x
1130+
U+278x
1131+
U+279x
1132+
U+27Ax
1133+
U+27Bx
1134+
1135+
Miscellaneous Mathematical Symbols-A
1136+
U+27Cx
1137+
U+27Dx
1138+
U+27Ex
1139+
1140+
Supplemental Arrows-A
1141+
U+27Fx
1142+
1143+
Supplemental Arrows-B
1144+
U+290x
1145+
U+291x
1146+
U+292x
1147+
U+293x
1148+
U+294x
1149+
U+295x
1150+
U+296x
1151+
U+297x
1152+
1153+
Miscellaneous Mathematical Symbols-B
1154+
U+298x
1155+
U+299x
1156+
U+29Ax
1157+
U+29Bx
1158+
U+29Cx
1159+
U+29Dx
1160+
U+29Ex
1161+
U+29Fx
1162+
1163+
Supplemental Mathematical Operators
1164+
U+2A0x
1165+
U+2A1x
1166+
U+2A2x
1167+
U+2A3x
1168+
U+2A4x
1169+
U+2A5x
1170+
U+2A6x
1171+
U+2A7x
1172+
U+2A8x
1173+
U+2A9x
1174+
U+2AAx
1175+
U+2ABx
1176+
U+2ACx
1177+
U+2ADx
1178+
U+2AEx
1179+
U+2AFx
1180+
1181+
Alphabetic Presentation Forms
1182+
U+FB00 U+FB01 U+FB02 U+FB03 U+FB04 U+FB05 U+FB06
1183+
1184+
Mathematical Alphanumeric Symbols
1185+
(some of these seem problematic)
9771186
]]
9781187

9791188
local lowest = 1/0
9801189
local highest = 0
9811190
local cpSet = {}
9821191

983-
for cpHex in codepointsStr:gmatch"U%+0*(%x+)" do
984-
local cp = tonumber(cpHex, 16)
985-
lowest = math.min(lowest, cp)
986-
highest = math.max(highest, cp)
987-
cpSet[cp] = true
1192+
-- Iterate over every codepoint described by a hexadecimal pattern.
--
-- cpHexPattern is a string of hex digits in which any 'x' or 'X' acts as a
-- wildcard for all sixteen hex digits. A pattern without wildcards yields
-- exactly one number. A pattern with N wildcards yields 16^N numbers in
-- ascending order of the wildcard digits (leftmost wildcard is most
-- significant).
--
-- Returns an iterator function suitable for a generic 'for' loop.
local function eachCodepoint(cpHexPattern)
	-- Count the wildcards by substituting them away and keeping the count.
	local _, wildcardCount = cpHexPattern:gsub("[Xx]", "")

	-- Plain hex number: hand out the single value once, then nothing.
	if wildcardCount == 0 then
		local consumed = false

		return function()
			if consumed then return end
			consumed = true
			return tonumber(cpHexPattern, 16)
		end
	end

	local HEX_DIGITS = "0123456789ABCDEF"

	-- One base-16 digit per wildcard. The last digit starts at -1 so that the
	-- first increment below produces the all-zero combination.
	local digits = {}
	for slot = 1, wildcardCount do
		digits[slot] = 0
	end
	digits[wildcardCount] = -1

	return function()
		-- Add one to the multi-digit counter, carrying leftwards.
		local slot = wildcardCount
		while true do
			local bumped = digits[slot] + 1
			if bumped < 16 then
				digits[slot] = bumped
				break
			end
			digits[slot] = 0
			if slot == 1 then return end -- Carried out of the first digit: all combinations visited.
			slot = slot - 1
		end

		-- Fill the wildcards, left to right, with the current digits.
		local filled = 0
		local hex = cpHexPattern:gsub("[Xx]", function()
			filled = filled + 1
			local digit = digits[filled]
			return HEX_DIGITS:sub(digit + 1, digit + 1)
		end)

		return tonumber(hex, 16)
	end
end
1233+
1234+
-- Walk every "U+XXXX" token in the codepoint list. A leading '!' marks the
-- token as an exclusion; 'x' digits are expanded by eachCodepoint(). Later
-- tokens override earlier ones for the same codepoint.
for bang, hexPattern in codepointsStr:gmatch("(!?)U%+0*([%xXx]+)") do
	local excluded = (bang == "!")

	for cp in eachCodepoint(hexPattern) do
		-- Report exclusions, and inclusions of codepoints that are already in the set.
		local notice = excluded and "Ignoring U+%04X" or (cpSet[cp] and "Duplicate U+%04X")
		if notice then
			print(notice:format(cp))
		end

		-- Excluded codepoints also widen the bounds, so lowest/highest may be
		-- looser than the set actually included.
		if cp < lowest  then  lowest  = cp  end
		if cp > highest then  highest = cp  end

		cpSet[cp] = not excluded
	end
end
9891249

9901250
local ranges = {}

preprocess.lua

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,7 @@ end
844844

845845

846846

847+
-- (Table generated by misc/generateStringEscapeSequenceInfo.lua)
847848
local UNICODE_RANGES_NOT_TO_ESCAPE = {
848849
{from=32, to=126},
849850
{from=161, to=591},
@@ -864,17 +865,18 @@ local UNICODE_RANGES_NOT_TO_ESCAPE = {
864865
{from=7808, to=7813},
865866
{from=7835, to=7835},
866867
{from=7922, to=7923},
867-
{from=8211, to=8213},
868-
{from=8215, to=8222},
869-
{from=8224, to=8226},
870-
{from=8230, to=8230},
871-
{from=8240, to=8240},
872-
{from=8242, to=8243},
873-
{from=8249, to=8250},
874-
{from=8252, to=8252},
875-
{from=8254, to=8254},
876-
{from=8260, to=8260},
877-
{from=8266, to=8266},
868+
{from=8208, to=8208},
869+
{from=8210, to=8231},
870+
{from=8240, to=8286},
871+
{from=8304, to=8305},
872+
{from=8308, to=8334},
873+
{from=8336, to=8348},
874+
{from=8352, to=8383},
875+
{from=8448, to=8587},
876+
{from=8592, to=9254},
877+
{from=9312, to=10239},
878+
{from=10496, to=11007},
879+
{from=64256, to=64262},
878880
}
879881

880882
local function shouldCodepointBeEscaped(cp)
@@ -979,10 +981,8 @@ function serialize(buffer, v)
979981
elseif c == quote then tableInsert(buffer, [[\]]) ; tableInsert(buffer, quote) ; pos = pos+1
980982

981983
-- UTF-8 character.
982-
elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
983-
elseif len == 2 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+1)) ; pos = pos+2
984-
elseif len == 3 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+2)) ; pos = pos+3
985-
elseif len == 4 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+3)) ; pos = pos+4
984+
elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
985+
elseif len and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+len-1)) ; pos = pos+len
986986

987987
-- Anything else.
988988
else

tests/quickTest.lua2p

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ comment here...]] true
2828
!wrapped("dogs")
2929
!wrapped("clouds")
3030

31-
local data = !("a\n1Ü2\"\10\0003")
31+
local data = !("a\n1Ü2\"\10\255\255\0003")
3232

3333

3434

tests/quickTest.output.lua

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ print"Get wrapped! Also, dogs..."
2323

2424
print"Get wrapped! Also, clouds..."
2525

26-
local data = 'a\n1Ü2"\n\0003'
26+
local data = 'a\n1Ü2"\n\255\255\0003'
2727

2828

2929

0 commit comments

Comments
 (0)