Skip to content

Commit c8845e7

Browse files
committed
Serialized strings look nicer in many cases by default.
Added params.fastStrings and --faststrings.
1 parent 7293e16 commit c8845e7

File tree

9 files changed

+1223
-39
lines changed

9 files changed

+1223
-39
lines changed

misc/generateStringEscapeSequenceInfo.lua

Lines changed: 1007 additions & 0 deletions
Large diffs are not rendered by default.

preprocess-cl.lua

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ exec lua "$0" "$@"
3838
processed files (and any message handler). Otherwise,
3939
'dataFromCommandLine' is nil.
4040
41+
--faststrings
42+
Force fast serialization of string values. (Non-ASCII characters
43+
will look ugly.)
44+
4145
--handler|-h=pathToMessageHandler
4246
Path to a Lua file that's expected to return a function or a
4347
table of functions. If it returns a function then it will be
@@ -165,6 +169,7 @@ local allowBacktickStrings = false
165169
local allowJitSyntax = false
166170
local canOutputNil = true
167171
local customData = nil
172+
local fastStrings = false
168173
local hasOutputExtension = false
169174
local hasOutputPaths = false
170175
local isDebug = false
@@ -309,6 +314,9 @@ for _, arg in ipairs(args) do
309314
elseif arg == "--silent" then
310315
silent = true
311316

317+
elseif arg == "--faststrings" then -- @Doc
318+
fastStrings = true
319+
312320
else
313321
errorLine("Unknown option '"..arg:gsub("=.*", "").."'.")
314322
end
@@ -460,8 +468,9 @@ for i, pathIn in ipairs(pathsIn) do
460468
addLineNumbers = addLineNumbers,
461469

462470
backtickStrings = allowBacktickStrings,
463-
canOutputNil = canOutputNil,
464471
jitSyntax = allowJitSyntax,
472+
canOutputNil = canOutputNil,
473+
fastStrings = fastStrings,
465474
validate = validate,
466475

467476
onInsert = (hasMessageHandler("insert") or nil) and function(name)

preprocess.lua

Lines changed: 172 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ local PUNCTUATION = {
144144
"//", "&", "|", "~", ">>", "<<",
145145
} for i, v in ipairs(PUNCTUATION) do PUNCTUATION[v], PUNCTUATION[i] = true, nil end
146146

147-
local ESCAPE_SEQUENCES = {
147+
local ESCAPE_SEQUENCES_EXCEPT_QUOTES = {
148148
["\a"] = [[\a]],
149149
["\b"] = [[\b]],
150150
["\f"] = [[\f]],
@@ -153,9 +153,11 @@ local ESCAPE_SEQUENCES = {
153153
["\t"] = [[\t]],
154154
["\v"] = [[\v]],
155155
["\\"] = [[\\]],
156+
}
157+
local ESCAPE_SEQUENCES = {
156158
["\""] = [[\"]],
157159
["\'"] = [[\']],
158-
}
160+
} for k, v in pairs(ESCAPE_SEQUENCES_EXCEPT_QUOTES) do ESCAPE_SEQUENCES[k] = v end
159161

160162
local USELESS_TOKENS = {whitespace=true, comment=true}
161163

@@ -185,6 +187,7 @@ local currentPathOut = ""
185187
local metaPathForErrorMessages = ""
186188
local outputFromMeta = nil
187189
local canOutputNil = true
190+
local fastStrings = false
188191

189192

190193

@@ -215,6 +218,7 @@ local pack, unpack
215218
local printf, printTokens, printError, printfError, printErrorTraceback
216219
local serialize, toLua
217220
local tableInsert, tableInsertFormat
221+
local utf8GetCharLength, utf8GetCodepointAndLength
218222

219223

220224

@@ -438,14 +442,14 @@ end
438442

439443

440444

441-
local NUM_HEX_FRAC_EXP = ("^( 0[Xx] ([%dA-Fa-f]*) %.([%dA-Fa-f]+) [Pp]([-+]?[%dA-Fa-f]+) )"):gsub(" +", "")
442-
local NUM_HEX_FRAC = ("^( 0[Xx] ([%dA-Fa-f]*) %.([%dA-Fa-f]+) )"):gsub(" +", "")
443-
local NUM_HEX_EXP = ("^( 0[Xx] ([%dA-Fa-f]+) %.? [Pp]([-+]?[%dA-Fa-f]+) )"):gsub(" +", "")
444-
local NUM_HEX = ("^( 0[Xx] [%dA-Fa-f]+ %.? )"):gsub(" +", "")
445-
local NUM_DEC_FRAC_EXP = ("^( %d* %.%d+ [Ee][-+]?%d+ )"):gsub(" +", "")
446-
local NUM_DEC_FRAC = ("^( %d* %.%d+ )"):gsub(" +", "")
447-
local NUM_DEC_EXP = ("^( %d+ %.? [Ee][-+]?%d+ )"):gsub(" +", "")
448-
local NUM_DEC = ("^( %d+ %.? )"):gsub(" +", "")
445+
local NUM_HEX_FRAC_EXP = ("^( 0[Xx] (%x*) %.(%x+) [Pp]([-+]?%x+) )"):gsub(" +", "")
446+
local NUM_HEX_FRAC = ("^( 0[Xx] (%x*) %.(%x+) )"):gsub(" +", "")
447+
local NUM_HEX_EXP = ("^( 0[Xx] (%x+) %.? [Pp]([-+]?%x+) )"):gsub(" +", "")
448+
local NUM_HEX = ("^( 0[Xx] %x+ %.? )"):gsub(" +", "")
449+
local NUM_DEC_FRAC_EXP = ("^( %d* %.%d+ [Ee][-+]?%d+ )"):gsub(" +", "")
450+
local NUM_DEC_FRAC = ("^( %d* %.%d+ )"):gsub(" +", "")
451+
local NUM_DEC_EXP = ("^( %d+ %.? [Ee][-+]?%d+ )"):gsub(" +", "")
452+
local NUM_DEC = ("^( %d+ %.? )"):gsub(" +", "")
449453

450454
-- tokens = _tokenize( luaString, path, allowPreprocessorTokens, allowBacktickStrings, allowJitSyntax )
451455
function _tokenize(s, path, allowPpTokens, allowBacktickStrings, allowJitSyntax)
@@ -836,6 +840,46 @@ end
836840

837841

838842

843+
-- Codepoint ranges for characters that don't need to be escaped in serialized strings.
-- Both 'from' and 'to' are inclusive (see the comparison in shouldCodepointBeEscaped()).
-- The first range, 32-126, is printable ASCII (space through '~').
-- NOTE(review): These values look generated - presumably by misc/generateStringEscapeSequenceInfo.lua,
-- which was added in the same commit - so confirm that before editing the list by hand.
local UNICODE_RANGES_NOT_TO_ESCAPE = {
844+
{from=32, to=126},
845+
{from=161, to=591},
846+
{from=880, to=887},
847+
{from=890, to=895},
848+
{from=900, to=906},
849+
{from=908, to=908},
850+
{from=910, to=929},
851+
{from=931, to=1154},
852+
{from=1162, to=1279},
853+
{from=7682, to=7683},
854+
{from=7690, to=7691},
855+
{from=7710, to=7711},
856+
{from=7744, to=7745},
857+
{from=7766, to=7767},
858+
{from=7776, to=7777},
859+
{from=7786, to=7787},
860+
{from=7808, to=7813},
861+
{from=7835, to=7835},
862+
{from=7922, to=7923},
863+
{from=8211, to=8213},
864+
{from=8215, to=8222},
865+
{from=8224, to=8226},
866+
{from=8230, to=8230},
867+
{from=8240, to=8240},
868+
{from=8242, to=8243},
869+
{from=8249, to=8250},
870+
{from=8252, to=8252},
871+
{from=8254, to=8254},
872+
{from=8260, to=8260},
873+
{from=8266, to=8266},
874+
}
875+
876+
-- bool = shouldCodepointBeEscaped( codepoint )
-- Returns false if the codepoint lies inside any of the (inclusive) ranges
-- listed in UNICODE_RANGES_NOT_TO_ESCAPE, and true otherwise.
local function shouldCodepointBeEscaped(cp)
	for rangeIndex = 1, #UNICODE_RANGES_NOT_TO_ESCAPE do -- @Speed: Don't use a loop?
		local range = UNICODE_RANGES_NOT_TO_ESCAPE[rangeIndex]

		if cp >= range.from and cp <= range.to then
			return false -- The character can be written as-is.
		end
	end

	-- No range matched.
	return true
end
882+
839883
-- success, error = serialize( buffer, value )
840884
function serialize(buffer, v)
841885
local vType = type(v)
@@ -896,16 +940,64 @@ function serialize(buffer, v)
896940
tableInsert(buffer, "}")
897941

898942
elseif vType == "string" then
899-
-- @Incomplete: Add an option specifically for nice string serialization?
900-
local s = v:gsub("[%c\128-\255\"\\]", function(c)
901-
local str = ESCAPE_SEQUENCES[c] or F("\\%03d", c:byte())
902-
ESCAPE_SEQUENCES[c] = str
903-
return str
904-
end)
943+
if v == "" then
944+
tableInsert(buffer, '""')
945+
946+
elseif fastStrings or not v:find"[^\32-\126\t\n]" then
947+
-- print(">> FAST", #v) -- DEBUG
948+
949+
local s = v:gsub("[%c\128-\255\"\\]", function(c)
950+
local s = ESCAPE_SEQUENCES[c] or F("\\%03d", c:byte())
951+
ESCAPE_SEQUENCES[c] = s -- Cache the result.
952+
return s
953+
end)
954+
955+
tableInsert(buffer, '"')
956+
tableInsert(buffer, s)
957+
tableInsert(buffer, '"')
958+
959+
else
960+
-- print(">> SLOW", #v) -- DEBUG
961+
962+
local quote = (v:find('"', 1, true) and not v:find("'", 1, true)) and "'" or '"'
963+
local pos = 1
964+
local toMinimize = {}
965+
966+
tableInsert(buffer, quote)
967+
968+
-- @Speed: There are optimizations to be made here!
969+
while pos <= #v do
970+
local c = v:sub(pos, pos)
971+
local cp, len = utf8GetCodepointAndLength(v, pos)
972+
973+
-- Named escape sequences.
974+
if ESCAPE_SEQUENCES_EXCEPT_QUOTES[c] then tableInsert(buffer, ESCAPE_SEQUENCES_EXCEPT_QUOTES[c]) ; pos = pos+1
975+
elseif c == quote then tableInsert(buffer, [[\]]) ; tableInsert(buffer, quote) ; pos = pos+1
976+
977+
-- UTF-8 character.
978+
elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes!
979+
elseif len == 2 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+1)) ; pos = pos+2
980+
elseif len == 3 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+2)) ; pos = pos+3
981+
elseif len == 4 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+3)) ; pos = pos+4
982+
983+
-- Anything else.
984+
else
985+
local b = v:byte(pos)
986+
tableInsert(buffer, F("\\%03d", b))
987+
if b <= 99 then tableInsert(toMinimize, #buffer) end
988+
pos = pos+1
989+
end
990+
end
905991

906-
tableInsert(buffer, '"')
907-
tableInsert(buffer, s)
908-
tableInsert(buffer, '"')
992+
-- Minimize \nnn sequences that aren't followed by digits.
993+
for _, i in ipairs(toMinimize) do
994+
if not (buffer[i+1] and buffer[i+1]:find"^%d") then
995+
buffer[i] = buffer[i]:gsub("0+(%d+)", "%1")
996+
end
997+
end
998+
999+
tableInsert(buffer, quote)
1000+
end
9091001

9101002
elseif v == 1/0 then
9111003
tableInsert(buffer, "(1/0)")
@@ -1138,6 +1230,56 @@ end
11381230

11391231

11401232

1233+
-- length|nil = utf8GetCharLength( string [, position=1 ] )
-- Returns the number of bytes (1-4) used by the UTF-8 character that starts at
-- the given byte position, or nil if there is no valid character there: the
-- lead byte is invalid, a continuation byte is missing or out of range, or the
-- position is past the end of the string. A NUL byte is also reported as nil.
function utf8GetCharLength(s, pos)
	pos = pos or 1
	local b1, b2, b3, b4 = s:byte(pos, pos+3)

	-- s:byte() returns nothing when the position is past the end of the string
	-- (which is always the case for an empty string). Report that as "no
	-- character" instead of raising an error in the comparisons below.
	if not b1 then  return nil  end

	if b1 > 0 and b1 <= 127 then
		return 1

	-- Lead bytes 192 and 193 are never accepted here.
	elseif b1 >= 194 and b1 <= 223 then
		if not b2               then  return nil  end -- UTF-8 string terminated early.
		if b2 < 128 or b2 > 191 then  return nil  end -- Invalid UTF-8 character.
		return 2

	elseif b1 >= 224 and b1 <= 239 then
		if not b3                                 then  return nil  end -- UTF-8 string terminated early.
		-- Lead bytes 224 and 237 only allow a restricted range for the second byte.
		if b1 == 224 and (b2 < 160 or b2 > 191)   then  return nil  end -- Invalid UTF-8 character.
		if b1 == 237 and (b2 < 128 or b2 > 159)   then  return nil  end -- Invalid UTF-8 character.
		if               (b2 < 128 or b2 > 191)   then  return nil  end -- Invalid UTF-8 character.
		if               (b3 < 128 or b3 > 191)   then  return nil  end -- Invalid UTF-8 character.
		return 3

	elseif b1 >= 240 and b1 <= 244 then
		if not b4                                 then  return nil  end -- UTF-8 string terminated early.
		-- Lead bytes 240 and 244 only allow a restricted range for the second byte.
		if b1 == 240 and (b2 < 144 or b2 > 191)   then  return nil  end -- Invalid UTF-8 character.
		if b1 == 244 and (b2 < 128 or b2 > 143)   then  return nil  end -- Invalid UTF-8 character.
		if               (b2 < 128 or b2 > 191)   then  return nil  end -- Invalid UTF-8 character.
		if               (b3 < 128 or b3 > 191)   then  return nil  end -- Invalid UTF-8 character.
		if               (b4 < 128 or b4 > 191)   then  return nil  end -- Invalid UTF-8 character.
		return 4
	end

	return nil -- Invalid UTF-8 character.
end
1266+
1267+
-- codepoint, length = utf8GetCodepointAndLength( string [, position=1 ] )
-- Decodes the UTF-8 character that starts at the given byte position and
-- returns its codepoint together with the number of bytes it occupies.
-- Returns nil if utf8GetCharLength() reports no valid character at the position.
function utf8GetCodepointAndLength(s, pos)
	pos = pos or 1

	local byteCount = utf8GetCharLength(s, pos)
	if not byteCount then  return nil  end

	local lead, cont2, cont3, cont4 = s:byte(pos, pos+byteCount-1)
	local codepoint

	-- Each continuation byte carries 6 bits of payload: 2^6=64, 2^12=4096, 2^18=262144
	if byteCount == 1 then
		codepoint = lead
	elseif byteCount == 2 then
		codepoint = (lead-192)*64 + (cont2-128)
	elseif byteCount == 3 then
		codepoint = (lead-224)*4096 + (cont2-128)*64 + (cont3-128)
	else
		codepoint = (lead-240)*262144 + (cont2-128)*4096 + (cont3-128)*64 + (cont4-128)
	end

	return codepoint, byteCount
end
1280+
1281+
1282+
11411283
--==============================================================
11421284
--= Preprocessor Functions =====================================
11431285
--==============================================================
@@ -2362,6 +2504,7 @@ local function _processFileOrString(params, isFile)
23622504
metaPathForErrorMessages = params.pathMeta or "<meta>"
23632505
outputFromMeta = {}
23642506
canOutputNil = params.canOutputNil ~= false
2507+
fastStrings = params.fastStrings
23652508

23662509
if params.pathMeta then
23672510
local file = assert(io.open(params.pathMeta, "wb"))
@@ -2446,6 +2589,7 @@ local function _processFileOrString(params, isFile)
24462589

24472590
currentPathIn = ""
24482591
currentPathOut = ""
2592+
fastStrings = false
24492593

24502594
if isFile then
24512595
return info
@@ -2495,6 +2639,7 @@ local function processFileOrString(params, isFile)
24952639
metaPathForErrorMessages = ""
24962640
outputFromMeta = nil
24972641
canOutputNil = true
2642+
fastStrings = false
24982643

24992644
if xpcallOk then
25002645
return unpack(returnValues, 1, returnValues.n)
@@ -2532,14 +2677,15 @@ local pp = {
25322677
-- params: Table with these fields:
25332678
-- pathIn = pathToInputFile -- [Required]
25342679
-- pathOut = pathToOutputFile -- [Required]
2535-
-- pathMeta = pathForMetaprogram -- [Optional] You can inspect this temporary output file if an error ocurrs in the metaprogram.
2680+
-- pathMeta = pathForMetaprogram -- [Optional] You can inspect this temporary output file if an error occurs in the metaprogram.
25362681
--
2537-
-- addLineNumbers = boolean -- [Optional] Add comments with line numbers to the output.
25382682
-- debug = boolean -- [Optional] Debug mode. The metaprogram file is formatted more nicely and does not get deleted automatically.
2683+
-- addLineNumbers = boolean -- [Optional] Add comments with line numbers to the output.
25392684
--
25402685
-- backtickStrings = boolean -- [Optional] Enable the backtick (`) to be used as string literal delimiters. Backtick strings don't interpret any escape sequences and can't contain other backticks. (Default: false)
2541-
-- canOutputNil = boolean -- [Optional] Allow !() and outputValue() to output nil. (Default: true)
25422686
-- jitSyntax = boolean -- [Optional] Allow LuaJIT-specific syntax. (Default: false)
2687+
-- canOutputNil = boolean -- [Optional] Allow !() and outputValue() to output nil. (Default: true)
2688+
-- fastStrings = boolean -- [Optional] Force fast serialization of string values. (Non-ASCII characters will look ugly.) (Default: false) @Doc
25432689
-- validate = boolean -- [Optional] Validate output. (Default: true)
25442690
--
25452691
-- onInsert = function( name ) -- [Optional] Called for each @insert"name" statement. It's expected to return a Lua code string. By default 'name' is a path to a file to be inserted.
@@ -2557,14 +2703,15 @@ local pp = {
25572703
--
25582704
-- params: Table with these fields:
25592705
-- code = luaString -- [Required]
2560-
-- pathMeta = pathForMetaprogram -- [Optional] You can inspect this temporary output file if an error ocurrs in the metaprogram.
2706+
-- pathMeta = pathForMetaprogram -- [Optional] You can inspect this temporary output file if an error occurs in the metaprogram.
25612707
--
2562-
-- addLineNumbers = boolean -- [Optional] Add comments with line numbers to the output.
25632708
-- debug = boolean -- [Optional] Debug mode. The metaprogram file is formatted more nicely and does not get deleted automatically.
2709+
-- addLineNumbers = boolean -- [Optional] Add comments with line numbers to the output.
25642710
--
25652711
-- backtickStrings = boolean -- [Optional] Enable the backtick (`) to be used as string literal delimiters. Backtick strings don't interpret any escape sequences and can't contain other backticks. (Default: false)
2566-
-- canOutputNil = boolean -- [Optional] Allow !() and outputValue() to output nil. (Default: true)
25672712
-- jitSyntax = boolean -- [Optional] Allow LuaJIT-specific syntax. (Default: false)
2713+
-- canOutputNil = boolean -- [Optional] Allow !() and outputValue() to output nil. (Default: true)
2714+
-- fastStrings = boolean -- [Optional] Force fast serialization of string values. (Non-ASCII characters will look ugly.) (Default: false) @Doc
25682715
-- validate = boolean -- [Optional] Validate output. (Default: true)
25692716
--
25702717
-- onInsert = function( name ) -- [Optional] Called for each @insert"name" statement. It's expected to return a Lua code string. By default 'name' is a path to a file to be inserted.

tests/quickTest.lua2p

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
!print("The beginning.")
66

77
local a = "a" -- Comment A.
8-
print(a)
8+
print("a", a)
99

1010
-- More preprocessor lines.
1111
!local b = "b" -- Comment B.
12-
!print(b)
12+
!print("b", b)
1313

1414
a = a..!(b)..a -- Comment, string concat.
1515

@@ -28,6 +28,8 @@ comment here...]] true
2828
!wrapped("dogs")
2929
!wrapped("clouds")
3030

31+
local data = !("a\n1Ü2\"\10\0003")
32+
3133

3234

3335
!(

0 commit comments

Comments
 (0)