Skip to content

Commit f683212

Browse files
committed
Tokenize: Embedded whitespace trivia in strings
* Emit escaped newlines as whitespace trivia.
* Split triple-quoted strings at newlines so the parser can detect indentation and turn it into trivia.
1 parent b1cb0f2 commit f683212

File tree

2 files changed

+76
-40
lines changed

2 files changed

+76
-40
lines changed

Tokenize/src/lexer.jl

Lines changed: 55 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -478,6 +478,19 @@ function lex_string_chunk(l)
478478
# Start interpolation
479479
readchar(l)
480480
return emit(l, Tokens.EX_OR)
481+
elseif !state.raw && pc == '\\' && (pc2 = dpeekchar(l)[2];
482+
pc2 == '\r' || pc2 == '\n')
483+
# Process escaped newline as whitespace
484+
readchar(l)
485+
readon(l)
486+
readchar(l)
487+
if pc2 == '\r' && peekchar(l) == '\n'
488+
readchar(l)
489+
end
490+
while (pc = peekchar(l); pc == ' ' || pc == '\t')
491+
readchar(l)
492+
end
493+
return emit(l, Tokens.WHITESPACE)
481494
elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
482495
# Terminate string
483496
pop!(l.string_states)
@@ -493,14 +506,55 @@ function lex_string_chunk(l)
493506
readon(l)
494507
# Read a chunk of string characters
495508
if state.raw
496-
read_raw_string(l, state.delim, state.triplestr)
509+
# Raw strings treat all characters as literals with the exception that
510+
# the closing quotes can be escaped with an odd number of \ characters.
511+
while true
512+
pc = peekchar(l)
513+
if string_terminates(l, state.delim, state.triplestr) || eof(pc)
514+
break
515+
elseif state.triplestr && (pc == '\n' || pc == '\r')
516+
# triple quoted newline splitting
517+
readchar(l)
518+
if pc == '\r' && peekchar(l) == '\n'
519+
readchar(l)
520+
end
521+
break
522+
end
523+
c = readchar(l)
524+
if c == '\\'
525+
n = 1
526+
while true
527+
readchar(l)
528+
n += 1
529+
if peekchar(l) != '\\'
530+
break
531+
end
532+
end
533+
if peekchar(l) == state.delim && !iseven(n)
534+
readchar(l)
535+
end
536+
end
537+
end
497538
else
498539
while true
499540
pc = peekchar(l)
500541
if pc == '$' || eof(pc)
501542
break
543+
elseif state.triplestr && (pc == '\n' || pc == '\r')
544+
# triple quoted newline splitting
545+
readchar(l)
546+
if pc == '\r' && peekchar(l) == '\n'
547+
readchar(l)
548+
end
549+
break
502550
elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
503551
break
552+
elseif pc == '\\'
553+
# Escaped newline
554+
pc2 = dpeekchar(l)[2]
555+
if pc2 == '\r' || pc2 == '\n'
556+
break
557+
end
504558
end
505559
c = readchar(l)
506560
if c == '\\'
@@ -928,44 +982,6 @@ function string_terminates(l, delim::Char, triplestr::Bool)
928982
end
929983
end
930984

931-
function terminate_string(l, delim::Char, triplestr::Bool)
932-
# @assert string_terminates(l, delim, triplestr)
933-
readchar(l)
934-
if triplestr
935-
readchar(l)
936-
readchar(l)
937-
return delim == '"' ? Tokens.TRIPLE_DQUOTE : Tokens.TRIPLE_BACKTICK
938-
else
939-
return delim == '"' ? Tokens.DQUOTE : Tokens.BACKTICK
940-
end
941-
end
942-
943-
# Read a raw string for use with custom string macros
944-
#
945-
# Raw strings treat all characters as literals with the exception that the
946-
# closing quotes can be escaped with an odd number of \ characters.
947-
function read_raw_string(l::Lexer, delim::Char, triplestr::Bool)
948-
while true
949-
if string_terminates(l, delim, triplestr) || eof(peekchar(l))
950-
return
951-
end
952-
c = readchar(l)
953-
if c == '\\'
954-
n = 1
955-
while true
956-
readchar(l)
957-
n += 1
958-
if peekchar(l) != '\\'
959-
break
960-
end
961-
end
962-
if peekchar(l) == delim && !iseven(n)
963-
readchar(l)
964-
end
965-
end
966-
end
967-
end
968-
969985
# Parse a token starting with a forward slash.
970986
# A '/' has been consumed
971987
function lex_forwardslash(l::Lexer)

Tokenize/test/lexer.jl

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ end
202202
"""))
203203

204204
kinds = [T.COMMENT, T.NEWLINE_WS,
205-
T.TRIPLE_DQUOTE, T.STRING, T.TRIPLE_DQUOTE, T.NEWLINE_WS,
205+
T.TRIPLE_DQUOTE, T.STRING, T.STRING, T.TRIPLE_DQUOTE, T.NEWLINE_WS,
206206
T.INTEGER, T.NEWLINE_WS,
207207
T.ENDMARKER]
208208
@test T.kind.(toks) == kinds
@@ -347,6 +347,26 @@ end
347347
@test ts[4] ~ (T.STRING , "x \$ \\ y")
348348
end
349349

350+
@testset "string escaped newline whitespace" begin
351+
ts = collect(tokenize("\"x\\\n \ty\""))
352+
@test ts[1] ~ (T.DQUOTE , "\"")
353+
@test ts[2] ~ (T.STRING, "x")
354+
@test ts[3] ~ (T.WHITESPACE, "\\\n \t")
355+
@test ts[4] ~ (T.STRING, "y")
356+
@test ts[5] ~ (T.DQUOTE , "\"")
357+
end
358+
359+
@testset "triple quoted string line splitting" begin
360+
ts = collect(tokenize("\"\"\"\nx\r\ny\rz\n\r\"\"\""))
361+
@test ts[1] ~ (T.TRIPLE_DQUOTE , "\"\"\"")
362+
@test ts[2] ~ (T.STRING, "\n")
363+
@test ts[3] ~ (T.STRING, "x\r\n")
364+
@test ts[4] ~ (T.STRING, "y\r")
365+
@test ts[5] ~ (T.STRING, "z\n")
366+
@test ts[6] ~ (T.STRING, "\r")
367+
@test ts[7] ~ (T.TRIPLE_DQUOTE, "\"\"\"")
368+
end
369+
350370
@testset "interpolation" begin
351371
@testset "basic" begin
352372
ts = collect(tokenize("\"\$x \$y\""))

0 commit comments

Comments (0)