Skip to content

Commit b08eee2

Browse files
authored
Always encapsulate strings in a string node (#94)
This change ensures that strings are always encapsulated within a `K"string"` internal node of the parse tree, giving a place to include the delimiters and unifying interpolated strings / strings with internal whitespace with plain string literals. Also wrap backtick delimited strings with a `K"cmdstring"` head for the same reasons.
1 parent 47c3e2b commit b08eee2

File tree

6 files changed

+176
-134
lines changed

6 files changed

+176
-134
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,17 @@ The syntax `a ? b : c` is the same as `if a b else c` in `Expr` so macros can't
450450
distinguish these cases. Instead, we use a distinct expression head `K"?"` and
451451
lower to `Expr(:if)` during `Expr` conversion.
452452

453+
### String nodes always wrapped in `K"string"` or `K"cmdstring"`
454+
455+
All strings are surrounded by a node of kind `K"string"`, even non-interpolated
456+
literals, so `"x"` parses as `(string "x")`. This makes string handling simpler
457+
and more systematic because interpolations and triple strings with embedded
458+
trivia don't need to be treated differently. It also gives a container in which
459+
to attach the delimiting quotes.
460+
461+
The same goes for command strings which are always wrapped in `K"cmdstring"`
462+
regardless of whether they have multiple pieces (due to triple-quoted
463+
dedenting) or not.
453464

454465
## More about syntax kinds
455466

src/expr.jl

Lines changed: 53 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ function is_eventually_call(ex)
66
is_eventually_call(ex.args[1]))
77
end
88

9+
function is_stringchunk(node)
10+
k = kind(node)
11+
return k == K"String" || k == K"CmdString"
12+
end
13+
914
function _to_expr(node::SyntaxNode, iteration_spec=false, need_linenodes=true)
1015
if !haschildren(node)
1116
val = node.val
@@ -32,6 +37,54 @@ function _to_expr(node::SyntaxNode, iteration_spec=false, need_linenodes=true)
3237
error("Can't untokenize head of kind $(kind(node))")
3338
end
3439
node_args = children(node)
40+
if headsym == :string || headsym == :cmdstring
41+
# Julia string literals may be interspersed with trivia in two situations:
42+
# 1. Triple quoted string indentation is trivia
43+
# 2. An \ before newline removes the newline and any following indentation
44+
#
45+
# Such trivia is eagerly removed by the reference parser, so here we
46+
# concatenate adjacent string chunks together for compatibility.
47+
args = Vector{Any}()
48+
i = 1
49+
while i <= length(node_args)
50+
if is_stringchunk(node_args[i])
51+
if i < length(node_args) && is_stringchunk(node_args[i+1])
52+
buf = IOBuffer()
53+
while i <= length(node_args) && is_stringchunk(node_args[i])
54+
write(buf, node_args[i].val)
55+
i += 1
56+
end
57+
push!(args, String(take!(buf)))
58+
else
59+
push!(args, node_args[i].val)
60+
i += 1
61+
end
62+
else
63+
e = _to_expr(node_args[i])
64+
if e isa String && headsym == :string
65+
# Wrap interpolated literal strings in (string) so we can
66+
# distinguish them from the surrounding text (issue #38501)
67+
# I.e., "$("str")" vs "str"
68+
# https://github.com/JuliaLang/julia/pull/38692
69+
e = Expr(:string, e)
70+
end
71+
push!(args, e)
72+
i += 1
73+
end
74+
end
75+
if length(args) == 1 && args[1] isa String
76+
# If there's a single string remaining after joining, we unwrap
77+
# to give a string literal.
78+
# """\n a\n b""" ==> "a\nb"
79+
# headsym === :cmdstring follows this branch
80+
return only(args)
81+
else
82+
@check headsym === :string
83+
return Expr(headsym, args...)
84+
end
85+
end
86+
87+
# Convert children
3588
insert_linenums = (headsym == :block || headsym == :toplevel) && need_linenodes
3689
args = Vector{Any}(undef, length(node_args)*(insert_linenums ? 2 : 1))
3790
if headsym == :for && length(node_args) == 2
@@ -125,38 +178,6 @@ function _to_expr(node::SyntaxNode, iteration_spec=false, need_linenodes=true)
125178
pushfirst!(args, numeric_flags(flags(node)))
126179
elseif headsym == :typed_ncat
127180
insert!(args, 2, numeric_flags(flags(node)))
128-
elseif headsym == :string && length(args) > 1
129-
# Julia string literals may be interspersed with trivia in two situations:
130-
# 1. Triple quoted string indentation is trivia
131-
# 2. An \ before newline removes the newline and any following indentation
132-
#
133-
# Such trivia is eagerly removed by the reference parser, so here we
134-
# concatenate adjacent string chunks together for compatibility.
135-
#
136-
# TODO: Manage the non-interpolation cases with String and CmdString
137-
# kinds instead?
138-
args2 = Vector{Any}()
139-
i = 1
140-
while i <= length(args)
141-
if args[i] isa String && i < length(args) && args[i+1] isa String
142-
buf = IOBuffer()
143-
while i <= length(args) && args[i] isa String
144-
write(buf, args[i])
145-
i += 1
146-
end
147-
push!(args2, String(take!(buf)))
148-
else
149-
push!(args2, args[i])
150-
i += 1
151-
end
152-
end
153-
args = args2
154-
if length(args2) == 1 && args2[1] isa String
155-
# If there's a single string remaining after joining we unwrap to
156-
# give a string literal.
157-
# """\n a\n b""" ==> "a\nb"
158-
return args2[1]
159-
end
160181
# elseif headsym == :string && length(args) == 1 && version <= (1,5)
161182
# Strip string from interpolations in 1.5 and lower to preserve
162183
# "hi$("ho")" ==> (string "hi" "ho")

src/kinds.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,7 @@ const _kind_names =
871871
"curly"
872872
"inert" # QuoteNode; not quasiquote
873873
"string" # A string interior node (possibly containing interpolations)
874+
"cmdstring" # A cmd string node (containing delimiters plus string)
874875
"macrocall"
875876
"kw" # the = in f(a=1)
876877
"parameters" # the list after ; in f(; a=1)

src/parser.jl

Lines changed: 59 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -468,28 +468,28 @@ function parse_docstring(ps::ParseState, down=parse_eq)
468468
mark = position(ps)
469469
atdoc_mark = bump_invisible(ps, K"TOMBSTONE")
470470
down(ps)
471-
if peek_behind(ps).kind in KSet"String string"
471+
if peek_behind(ps).kind == K"string"
472472
is_doc = true
473473
k = peek(ps)
474474
if is_closing_token(ps, k)
475-
# "notdoc" ] ==> "notdoc"
475+
# "notdoc" ] ==> (string "notdoc")
476476
is_doc = false
477477
elseif k == K"NewlineWs"
478478
k2 = peek(ps, 2)
479479
if is_closing_token(ps, k2) || k2 == K"NewlineWs"
480-
# "notdoc" \n] ==> "notdoc"
481-
# "notdoc" \n\n foo ==> "notdoc"
480+
# "notdoc" \n] ==> (string "notdoc")
481+
# "notdoc" \n\n foo ==> (string "notdoc")
482482
is_doc = false
483483
else
484484
# Allow a single newline
485-
# "doc" \n foo ==> (macrocall core_@doc "doc" foo)
485+
# "doc" \n foo ==> (macrocall core_@doc (string "doc") foo)
486486
bump(ps, TRIVIA_FLAG) # NewlineWs
487487
end
488488
else
489-
# "doc" foo ==> (macrocall core_@doc "doc" foo)
489+
# "doc" foo ==> (macrocall core_@doc (string "doc") foo)
490490
# "doc $x" foo ==> (macrocall core_@doc (string "doc " x) foo)
491491
# Allow docstrings with embedded trailing whitespace trivia
492-
# """\n doc\n """ foo ==> (macrocall core_@doc "doc\n" foo)
492+
# """\n doc\n """ foo ==> (macrocall core_@doc (string-s "doc\n") foo)
493493
end
494494
if is_doc
495495
reset_node!(ps, atdoc_mark, kind=K"core_@doc")
@@ -1048,11 +1048,12 @@ function parse_juxtapose(ps::ParseState)
10481048
if n_terms == 1
10491049
bump_invisible(ps, K"*")
10501050
end
1051-
if prev_kind == K"String" || is_string_delim(t)
1051+
if prev_kind == K"string" || is_string_delim(t)
10521052
# issue #20575
10531053
#
1054-
# "a""b" ==> (call-i "a" * (error) "b")
1055-
# "a"x ==> (call-i "a" * (error) x)
1054+
# "a""b" ==> (call-i (string "a") * (error-t) (string "b"))
1055+
# "a"x ==> (call-i (string "a") * (error-t) x)
1056+
# "$y"x ==> (call-i (string (string y)) * (error-t) x)
10561057
bump_invisible(ps, K"error", TRIVIA_FLAG,
10571058
error="cannot juxtapose string literal")
10581059
end
@@ -1389,7 +1390,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
13891390
# @foo (x) ==> (macrocall @foo x)
13901391
# @foo (x,y) ==> (macrocall @foo (tuple x y))
13911392
# a().@x y ==> (macrocall (error (. (call a) (quote x))) y)
1392-
# [@foo "x"] ==> (vect (macrocall @foo "x"))
1393+
# [@foo x] ==> (vect (macrocall @foo x))
13931394
finish_macroname(ps, mark, valid_macroname, macro_name_position)
13941395
let ps = with_space_sensitive(ps)
13951396
# Space separated macro arguments
@@ -1420,7 +1421,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
14201421
elseif (ps.space_sensitive && preceding_whitespace(t) &&
14211422
k in KSet"( [ { \ Char \" \"\"\" ` ```")
14221423
# [f (x)] ==> (hcat f x)
1423-
# [f "x"] ==> (hcat f "x")
1424+
# [f x] ==> (hcat f x)
14241425
break
14251426
elseif k == K"("
14261427
if is_macrocall
@@ -1597,12 +1598,12 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
15971598
elseif k in KSet" \" \"\"\" ` ``` " &&
15981599
!preceding_whitespace(t) && valid_macroname
15991600
# Custom string and command literals
1600-
# x"str" ==> (macrocall @x_str "str")
1601-
# x`str` ==> (macrocall @x_cmd "str")
1602-
# x"" ==> (macrocall @x_str "")
1603-
# x`` ==> (macrocall @x_cmd "")
1601+
# x"str" ==> (macrocall @x_str (string-r "str"))
1602+
# x`str` ==> (macrocall @x_cmd (cmdstring-r "str"))
1603+
# x"" ==> (macrocall @x_str (string-r ""))
1604+
# x`` ==> (macrocall @x_cmd (cmdstring-r ""))
16041605
# Triple quoted processing for custom strings
1605-
# r"""\nx""" ==> (macrocall @r_str "x")
1606+
# r"""\nx""" ==> (macrocall @r_str (string-sr "x"))
16061607
# r"""\n x\n y""" ==> (macrocall @r_str (string-sr "x\n" "y"))
16071608
# r"""\n x\\n y""" ==> (macrocall @r_str (string-sr "x\\\n" "y"))
16081609
#
@@ -1615,11 +1616,11 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
16151616
k = kind(t)
16161617
if !preceding_whitespace(t) && (k == K"Identifier" || is_keyword(k) || is_word_operator(k) || is_number(k))
16171618
# Macro suffixes can include keywords and numbers
1618-
# x"s"y ==> (macrocall @x_str "s" "y")
1619-
# x"s"end ==> (macrocall @x_str "s" "end")
1620-
# x"s"in ==> (macrocall @x_str "s" "in")
1621-
# x"s"2 ==> (macrocall @x_str "s" 2)
1622-
# x"s"10.0 ==> (macrocall @x_str "s" 10.0)
1619+
# x"s"y ==> (macrocall @x_str (string-r "s") "y")
1620+
# x"s"end ==> (macrocall @x_str (string-r "s") "end")
1621+
# x"s"in ==> (macrocall @x_str (string-r "s") "in")
1622+
# x"s"2 ==> (macrocall @x_str (string-r "s") 2)
1623+
# x"s"10.0 ==> (macrocall @x_str (string-r "s") 10.0)
16231624
suffix_kind = (k == K"Identifier" || is_keyword(k) ||
16241625
is_word_operator(k)) ? K"String" : k
16251626
bump(ps, remap_kind=suffix_kind)
@@ -1813,7 +1814,7 @@ function parse_resword(ps::ParseState)
18131814
parse_unary_prefix(ps)
18141815
end
18151816
# module A \n a \n b \n end ==> (module true A (block a b))
1816-
# module A \n "x"\na \n end ==> (module true A (block (core_@doc "x" a)))
1817+
# module A \n "x"\na \n end ==> (module true A (block (core_@doc (string "x") a)))
18171818
parse_block(ps, parse_docstring)
18181819
bump_closing_token(ps, K"end")
18191820
emit(ps, mark, K"module")
@@ -3032,8 +3033,7 @@ function parse_string(ps::ParseState, raw::Bool)
30323033
indent_ref_len = typemax(Int)
30333034
indent_chunks = acquire_positions(ps.stream)
30343035
buf = textbuf(ps)
3035-
str_flags = (triplestr ? TRIPLE_STRING_FLAG : EMPTY_FLAGS) |
3036-
(raw ? RAW_STRING_FLAG : EMPTY_FLAGS)
3036+
chunk_flags = raw ? RAW_STRING_FLAG : EMPTY_FLAGS
30373037
bump(ps, TRIVIA_FLAG)
30383038
first_chunk = true
30393039
n_valid_chunks = 0
@@ -3048,18 +3048,9 @@ function parse_string(ps::ParseState, raw::Bool)
30483048
bump(ps, TRIVIA_FLAG)
30493049
k = peek(ps)
30503050
if k == K"("
3051-
# "a $(x + y) b" ==> (string "a " (call-i x + y) " b")
3052-
m = position(ps)
3051+
# "a $(x + y) b" ==> (string "a " (call-i x + y) " b")
3052+
# "hi$("ho")" ==> (string "hi" (string "ho"))
30533053
parse_atom(ps)
3054-
# https://github.com/JuliaLang/julia/pull/38692
3055-
prev = peek_behind(ps)
3056-
if prev.kind == string_chunk_kind
3057-
# Wrap interpolated literal strings in (string) so we can
3058-
# distinguish them from the surrounding text (issue #38501)
3059-
# "hi$("ho")" ==> (string "hi" (string "ho"))
3060-
# "hi$("""ho""")" ==> (string "hi" (string-s "ho"))
3061-
emit(ps, m, K"string", prev.flags)
3062-
end
30633054
elseif k == K"var"
30643055
# var identifiers disabled in strings
30653056
# "$var" ==> (string var)
@@ -3087,7 +3078,7 @@ function parse_string(ps::ParseState, raw::Bool)
30873078
(s == 2 && (buf[first_byte(t)] == UInt8('\r') && b == UInt8('\n')))
30883079
end
30893080
# First line of triple string is a newline only: mark as trivia.
3090-
# """\nx""" ==> "x"
3081+
# """\nx""" ==> (string-s "x")
30913082
# """\n\nx""" ==> (string-s "\n" "x")
30923083
bump(ps, TRIVIA_FLAG)
30933084
first_chunk = false
@@ -3097,6 +3088,7 @@ function parse_string(ps::ParseState, raw::Bool)
30973088
# Triple-quoted dedenting:
30983089
# Various newlines (\n \r \r\n) and whitespace (' ' \t)
30993090
# """\n x\n y""" ==> (string-s "x\n" "y")
3091+
# ```\n x\n y``` ==> (macrocall :(Core.var"@cmd") (cmdstring-sr "x\n" "y"))
31003092
# """\r x\r y""" ==> (string-s "x\n" "y")
31013093
# """\r\n x\r\n y""" ==> (string-s "x\n" "y")
31023094
# Spaces or tabs or mixtures acceptable
@@ -3158,7 +3150,7 @@ function parse_string(ps::ParseState, raw::Bool)
31583150
b = buf[last_byte(t)]
31593151
prev_chunk_newline = b == UInt8('\n') || b == UInt8('\r')
31603152
end
3161-
bump(ps, str_flags)
3153+
bump(ps, chunk_flags)
31623154
first_chunk = false
31633155
n_valid_chunks += 1
31643156
end
@@ -3187,36 +3179,37 @@ function parse_string(ps::ParseState, raw::Bool)
31873179
if had_end_delim
31883180
if n_valid_chunks == 0
31893181
# Empty strings, or empty after triple quoted processing
3190-
# "" ==> ""
3191-
# """\n """ ==> ""
3192-
bump_invisible(ps, string_chunk_kind, str_flags)
3182+
# "" ==> (string "")
3183+
# """\n """ ==> (string-s "")
3184+
bump_invisible(ps, string_chunk_kind, chunk_flags)
31933185
end
31943186
bump(ps, TRIVIA_FLAG)
31953187
else
31963188
# Missing delimiter recovery
3197-
# "str ==> "str" (error)
3189+
# "str ==> (string "str" (error-t))
31983190
bump_invisible(ps, K"error", TRIVIA_FLAG, error="Unterminated string literal")
31993191
end
3200-
if n_valid_chunks > 1 || had_interpolation
3201-
# String interpolations
3202-
# "$x$y$z" ==> (string x y z)
3203-
# "$(x)" ==> (string x)
3204-
# "$x" ==> (string x)
3205-
# """$x""" ==> (string-s x)
3206-
#
3207-
# Strings with embedded whitespace trivia
3208-
# "a\\\nb" ==> (string "a" "b")
3209-
# "a\\\rb" ==> (string "a" "b")
3210-
# "a\\\r\nb" ==> (string "a" "b")
3211-
# "a\\\n \tb" ==> (string "a" "b")
3212-
emit(ps, mark, K"string", str_flags)
3213-
else
3214-
# Strings with only a single valid string chunk
3215-
# "str" ==> "str"
3216-
# "a\\\n" ==> "a"
3217-
# "a\\\r" ==> "a"
3218-
# "a\\\r\n" ==> "a"
3219-
end
3192+
# String interpolations
3193+
# "$x$y$z" ==> (string x y z)
3194+
# "$(x)" ==> (string x)
3195+
# "$x" ==> (string x)
3196+
# """$x""" ==> (string-s x)
3197+
#
3198+
# Strings with embedded whitespace trivia
3199+
# "a\\\nb" ==> (string "a" "b")
3200+
# "a\\\rb" ==> (string "a" "b")
3201+
# "a\\\r\nb" ==> (string "a" "b")
3202+
# "a\\\n \tb" ==> (string "a" "b")
3203+
#
3204+
# Strings with only a single valid string chunk
3205+
# "str" ==> (string "str")
3206+
# "a\\\n" ==> (string "a")
3207+
# "a\\\r" ==> (string "a")
3208+
# "a\\\r\n" ==> (string "a")
3209+
string_kind = delim_k in KSet"\" \"\"\"" ? K"string" : K"cmdstring"
3210+
str_flags = (triplestr ? TRIPLE_STRING_FLAG : EMPTY_FLAGS) |
3211+
(raw ? RAW_STRING_FLAG : EMPTY_FLAGS)
3212+
emit(ps, mark, string_kind, str_flags)
32203213
end
32213214

32223215
function emit_braces(ps, mark, ckind, cflags)
@@ -3264,7 +3257,7 @@ function parse_atom(ps::ParseState, check_identifiers=true)
32643257
# Heuristic recovery
32653258
bump(ps)
32663259
else
3267-
# Being inside quote makes keywords into identifiers at at the
3260+
# Being inside quote makes keywords into identifiers at the
32683261
# first level of nesting
32693262
# :end ==> (quote end)
32703263
# :(end) ==> (quote (error (end)))
@@ -3366,9 +3359,9 @@ function parse_atom(ps::ParseState, check_identifiers=true)
33663359
elseif is_string_delim(leading_kind)
33673360
parse_string(ps, false)
33683361
elseif leading_kind in KSet"` ```"
3369-
# `` ==> (macrocall core_@cmd "")
3370-
# `cmd` ==> (macrocall core_@cmd "cmd")
3371-
# ```cmd``` ==> (macrocall core_@cmd "cmd"-s)
3362+
# `` ==> (macrocall core_@cmd (cmdstring-r ""))
3363+
# `cmd` ==> (macrocall core_@cmd (cmdstring-r "cmd"))
3364+
# ```cmd``` ==> (macrocall core_@cmd (cmdstring-sr "cmd"))
33723365
bump_invisible(ps, K"core_@cmd")
33733366
parse_string(ps, true)
33743367
emit(ps, mark, K"macrocall")

0 commit comments

Comments
 (0)