|
| 1 | +-- |
1 | 2 | -- Unicode characters not to encode with escape sequences in strings. |
2 | | --- https://en.wikipedia.org/wiki/List_of_Unicode_characters |
| 3 | +-- Updated: 2021-05-17 |
| 4 | +-- |
3 | 5 |
|
| 6 | +-- U+1234 = include |
| 7 | +-- !U+1234 = exclude |
| 8 | +-- U+1x3x = each 'x' is a wildcard for every hex digit 0 through F (so U+1x3x expands to 256 codepoints) |
4 | 9 | local codepointsStr = [[ |
| 10 | +
|
| 11 | +Source: https://en.wikipedia.org/wiki/List_of_Unicode_characters |
| 12 | +---------------------------------------------------------------- |
| 13 | +
|
5 | 14 | Basic Latin |
6 | 15 | U+0020 (space) |
7 | 16 | U+0021 ! |
@@ -974,17 +983,268 @@ U+203C ‼ |
974 | 983 | U+203E ‾ |
975 | 984 | U+2044 ⁄ |
976 | 985 | U+204A ⁊ |
| 986 | +
|
| 987 | +Source: https://en.wikipedia.org/wiki/Unicode_block |
| 988 | +---------------------------------------------------------------- |
| 989 | +
|
| 990 | +General Punctuation |
| 991 | +U+201x !U+2011 |
| 992 | +U+2020 U+2021 U+2022 U+2023 U+2024 U+2025 U+2026 U+2027 |
| 993 | +U+203x |
| 994 | +U+204x |
| 995 | +U+205x !U+205F |
| 996 | +
|
| 997 | +Superscripts and Subscripts |
| 998 | +U+207x !U+2072 !U+2073 |
| 999 | +U+208x !U+208F |
| 1000 | +U+209x !U+209D !U+209E !U+209F |
| 1001 | +
|
| 1002 | +Currency Symbols |
| 1003 | +U+20Ax |
| 1004 | +U+20Bx |
| 1005 | +
|
| 1006 | +Letterlike Symbols |
| 1007 | +U+210x |
| 1008 | +U+211x |
| 1009 | +U+212x |
| 1010 | +U+213x |
| 1011 | +U+214x |
| 1012 | +
|
| 1013 | +Number Forms |
| 1014 | +U+215x |
| 1015 | +U+216x |
| 1016 | +U+217x |
| 1017 | +U+218x !U+218C !U+218D !U+218E !U+218F |
| 1018 | +
|
| 1019 | +Arrows |
| 1020 | +U+219x |
| 1021 | +U+21Ax |
| 1022 | +U+21Bx |
| 1023 | +U+21Cx |
| 1024 | +U+21Dx |
| 1025 | +U+21Ex |
| 1026 | +U+21Fx |
| 1027 | +
|
| 1028 | +Mathematical Operators |
| 1029 | +U+220x |
| 1030 | +U+221x |
| 1031 | +U+222x |
| 1032 | +U+223x |
| 1033 | +U+224x |
| 1034 | +U+225x |
| 1035 | +U+226x |
| 1036 | +U+227x |
| 1037 | +U+228x |
| 1038 | +U+229x |
| 1039 | +U+22Ax |
| 1040 | +U+22Bx |
| 1041 | +U+22Cx |
| 1042 | +U+22Dx |
| 1043 | +U+22Ex |
| 1044 | +U+22Fx |
| 1045 | +
|
| 1046 | +Miscellaneous Technical |
| 1047 | +U+230x |
| 1048 | +U+231x |
| 1049 | +U+232x |
| 1050 | +U+233x |
| 1051 | +U+234x |
| 1052 | +U+235x |
| 1053 | +U+236x |
| 1054 | +U+237x |
| 1055 | +U+238x |
| 1056 | +U+239x |
| 1057 | +U+23Ax |
| 1058 | +U+23Bx |
| 1059 | +U+23Cx |
| 1060 | +U+23Dx |
| 1061 | +U+23Ex |
| 1062 | +U+23Fx |
| 1063 | +
|
| 1064 | +Control Pictures |
| 1065 | +U+240x |
| 1066 | +U+241x |
| 1067 | +U+2420 U+2421 U+2422 U+2423 U+2424 U+2425 U+2426 |
| 1068 | +
|
| 1069 | +Enclosed Alphanumerics |
| 1070 | +U+246x |
| 1071 | +U+247x |
| 1072 | +U+248x |
| 1073 | +U+249x |
| 1074 | +U+24Ax |
| 1075 | +U+24Bx |
| 1076 | +U+24Cx |
| 1077 | +U+24Dx |
| 1078 | +U+24Ex |
| 1079 | +U+24Fx |
| 1080 | +
|
| 1081 | +Box Drawing |
| 1082 | +U+250x |
| 1083 | +U+251x |
| 1084 | +U+252x |
| 1085 | +U+253x |
| 1086 | +U+254x |
| 1087 | +U+255x |
| 1088 | +U+256x |
| 1089 | +U+257x |
| 1090 | +
|
| 1091 | +Block Elements |
| 1092 | +U+258x |
| 1093 | +U+259x |
| 1094 | +
|
| 1095 | +Geometric Shapes |
| 1096 | +U+25Ax |
| 1097 | +U+25Bx |
| 1098 | +U+25Cx |
| 1099 | +U+25Dx |
| 1100 | +U+25Ex |
| 1101 | +U+25Fx |
| 1102 | +
|
| 1103 | +Miscellaneous Symbols |
| 1104 | +U+260x |
| 1105 | +U+261x |
| 1106 | +U+262x |
| 1107 | +U+263x |
| 1108 | +U+264x |
| 1109 | +U+265x |
| 1110 | +U+266x |
| 1111 | +U+267x |
| 1112 | +U+268x |
| 1113 | +U+269x |
| 1114 | +U+26Ax |
| 1115 | +U+26Bx |
| 1116 | +U+26Cx |
| 1117 | +U+26Dx |
| 1118 | +U+26Ex |
| 1119 | +U+26Fx |
| 1120 | +
|
| 1121 | +Dingbats |
| 1122 | +U+270x |
| 1123 | +U+271x |
| 1124 | +U+272x |
| 1125 | +U+273x |
| 1126 | +U+274x |
| 1127 | +U+275x |
| 1128 | +U+276x |
| 1129 | +U+277x |
| 1130 | +U+278x |
| 1131 | +U+279x |
| 1132 | +U+27Ax |
| 1133 | +U+27Bx |
| 1134 | +
|
| 1135 | +Miscellaneous Mathematical Symbols-A |
| 1136 | +U+27Cx |
| 1137 | +U+27Dx |
| 1138 | +U+27Ex |
| 1139 | +
|
| 1140 | +Supplemental Arrows-A |
| 1141 | +U+27Fx |
| 1142 | +
|
| 1143 | +Supplemental Arrows-B |
| 1144 | +U+290x |
| 1145 | +U+291x |
| 1146 | +U+292x |
| 1147 | +U+293x |
| 1148 | +U+294x |
| 1149 | +U+295x |
| 1150 | +U+296x |
| 1151 | +U+297x |
| 1152 | +
|
| 1153 | +Miscellaneous Mathematical Symbols-B |
| 1154 | +U+298x |
| 1155 | +U+299x |
| 1156 | +U+29Ax |
| 1157 | +U+29Bx |
| 1158 | +U+29Cx |
| 1159 | +U+29Dx |
| 1160 | +U+29Ex |
| 1161 | +U+29Fx |
| 1162 | +
|
| 1163 | +Supplemental Mathematical Operators |
| 1164 | +U+2A0x |
| 1165 | +U+2A1x |
| 1166 | +U+2A2x |
| 1167 | +U+2A3x |
| 1168 | +U+2A4x |
| 1169 | +U+2A5x |
| 1170 | +U+2A6x |
| 1171 | +U+2A7x |
| 1172 | +U+2A8x |
| 1173 | +U+2A9x |
| 1174 | +U+2AAx |
| 1175 | +U+2ABx |
| 1176 | +U+2ACx |
| 1177 | +U+2ADx |
| 1178 | +U+2AEx |
| 1179 | +U+2AFx |
| 1180 | +
|
| 1181 | +Alphabetic Presentation Forms |
| 1182 | +U+FB00 U+FB01 U+FB02 U+FB03 U+FB04 U+FB05 U+FB06 |
| 1183 | +
|
| 1184 | +Mathematical Alphanumeric Symbols |
| 1185 | +(some of these seem problematic) |
977 | 1186 | ]] |
978 | 1187 |
|
979 | 1188 | local lowest = 1/0 |
980 | 1189 | local highest = 0 |
981 | 1190 | local cpSet = {} |
982 | 1191 |
|
983 | | -for cpHex in codepointsStr:gmatch"U%+0*(%x+)" do |
984 | | - local cp = tonumber(cpHex, 16) |
985 | | - lowest = math.min(lowest, cp) |
986 | | - highest = math.max(highest, cp) |
987 | | - cpSet[cp] = true |
--
-- Return an iterator over the codepoints described by a hex pattern.
--
-- cpHexPattern is a string of hex digits in which every 'x' (or 'X') is a
-- wildcard standing for each digit 0-F. "2011" yields only 0x2011, while
-- "201x" yields 0x2010 through 0x201F in ascending order. With several
-- wildcards the leftmost one is the most significant digit.
--
-- The iterator returns one codepoint (a number) per call and nothing once all
-- values have been produced. It stays exhausted on any further call.
--
local function eachCodepoint(cpHexPattern)
	-- gsub's second result is the number of substitutions, i.e. the wildcard count.
	local _, wildcardCount = cpHexPattern:gsub("[Xx]", "")

	if wildcardCount == 0 then
		-- Plain hex number: yield it exactly once.
		local done = false

		return function()
			if not done then
				done = true
				return tonumber(cpHexPattern, 16)
			end
		end
	end

	-- Together the wildcards form one base-16 counter with wildcardCount digits.
	-- Multiply instead of using '^' so the value stays an integer in Lua 5.3+.
	local total = 1
	for _ = 1, wildcardCount do
		total = total * 16
	end

	local digitsFormat = "%0" .. wildcardCount .. "X" -- Zero-padded, one hex digit per wildcard.
	local counter      = 0

	return function()
		if counter >= total then return end -- Done! (And we stay done.)

		local digits = digitsFormat:format(counter)
		counter = counter + 1

		-- Substitute the i:th wildcard with the i:th digit of the counter.
		local i = 0

		local cpHex = cpHexPattern:gsub("[Xx]", function()
			i = i + 1
			return digits:sub(i, i)
		end)

		return tonumber(cpHex, 16)
	end
end
| 1233 | + |
-- Scan the list for every "U+..." token, optionally prefixed with "!" to mark
-- an exclusion, and record each expanded codepoint in cpSet (true = include,
-- false = excluded). Later tokens overwrite earlier ones for the same codepoint.
for bang, hexPattern in codepointsStr:gmatch"(!?)U%+0*([%xXx]+)" do
	local excluded = (bang == "!")

	for cp in eachCodepoint(hexPattern) do
		-- Diagnostics only; a duplicate is reported when an included codepoint was already set.
		if excluded then
			print(string.format("Ignoring U+%04X", cp))
		elseif cpSet[cp] then
			print(string.format("Duplicate U+%04X", cp))
		end

		-- Track the overall bounds. Excluded codepoints are counted too, so the
		-- bounds may be wider than the included set; the original author notes
		-- that this is acceptable.
		if cp < lowest  then lowest  = cp end
		if cp > highest then highest = cp end

		cpSet[cp] = not excluded
	end
end
989 | 1249 |
|
990 | 1250 | local ranges = {} |
|
0 commit comments