Skip to content

Commit acb609d

Browse files
authored
Fix crashes due to lexing ambiguity of string delimiters (#394)
There are some lexing ambiguities in primes vs cmd delimiters. We break these with a simple rule in the lexer, but there are edge cases of invalid or extremely strange syntax where this can be inconsistent with the parser. The following were some such cases which caused an assertion error in the parser. "var\"#\"``\$" "x in'``\$" This change avoids crashing in those cases, emitting an error instead. See also #25
1 parent 5769755 commit acb609d

File tree

2 files changed

+26
-7
lines changed

2 files changed

+26
-7
lines changed

src/parser.jl

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,10 @@ function is_both_unary_and_binary(t)
301301
)
302302
end
303303

304+
function is_string_macro_suffix(k)
305+
k == K"Identifier" || is_keyword(k) || is_word_operator(k) || is_number(k)
306+
end
307+
304308
# flisp: invalid-identifier?
305309
function is_valid_identifier(k)
306310
k = kind(k)
@@ -1707,7 +1711,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
17071711
parse_string(ps, true)
17081712
t = peek_token(ps)
17091713
k = kind(t)
1710-
if !preceding_whitespace(t) && (k == K"Identifier" || is_keyword(k) || is_word_operator(k) || is_number(k))
1714+
if !preceding_whitespace(t) && is_string_macro_suffix(k)
17111715
# Macro suffixes can include keywords and numbers
17121716
# x"s"y ==> (macrocall @x_str (string-r "s") "y")
17131717
# x"s"end ==> (macrocall @x_str (string-r "s") "end")
@@ -2344,7 +2348,7 @@ function parse_macro_name(ps::ParseState)
23442348
# @! x ==> (macrocall @! x)
23452349
# @.. x ==> (macrocall @.. x)
23462350
# @$ x ==> (macrocall @$ x)
2347-
# @var"#" x ==> (macrocall (var #) @$ x)
2351+
# @var"#" x ==> (macrocall (var @#) x)
23482352
bump_disallowed_space(ps)
23492353
mark = position(ps)
23502354
parse_atom(ps, false)
@@ -3182,7 +3186,13 @@ function parse_string(ps::ParseState, raw::Bool)
31823186
t = peek_full_token(ps)
31833187
k = kind(t)
31843188
if k == K"$"
3185-
@assert !raw # The lexer detects raw strings separately
3189+
if raw
3190+
# FIXME: This case is actually a tokenization error:
3191+
# The `K"$"` token should not occur when a raw string
3192+
# is being parsed, but this would require the lexer to know
3193+
# about the parse state. (see also parse_atom)
3194+
break
3195+
end
31863196
if prev_chunk_newline
31873197
# """\n$x\n a""" ==> (string-s x "\n" " a")
31883198
indent_ref_i = first_byte(t)
@@ -3526,11 +3536,16 @@ function parse_atom(ps::ParseState, check_identifiers=true)
35263536
# var"x"+ ==> x
35273537
# var"x") ==> x
35283538
# var"x"( ==> x
3529-
else
3539+
elseif is_string_macro_suffix(k)
35303540
# var"x"end ==> (var x (error-t))
35313541
# var"x"1 ==> (var x (error-t))
35323542
# var"x"y ==> (var x (error-t))
3533-
bump(ps, TRIVIA_FLAG, error="suffix not allowed after var\"...\" syntax")
3543+
bump(ps, TRIVIA_FLAG, error="suffix not allowed after `var\"...\"` syntax")
3544+
elseif k == K"`" || k == K"\"" || k == K"\"\"\"" || k == K"```"
3545+
# Disallow `var"#""str". To allow this we'd need to fix `raw`
3546+
# detection in lex_quote to be consistent with the parser.
3547+
bump_invisible(ps, K"error", TRIVIA_FLAG,
3548+
error="`var\"...\"` syntax not supported as string macro name")
35343549
end
35353550
emit(ps, mark, K"var")
35363551
elseif check_identifiers && is_closing_token(ps, leading_kind)

test/parser.jl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,9 +1012,13 @@ parsestmt_test_specs = [
10121012
"(abstract\ntype X end)" => "(wrapper (parens abstract (error-t type X)) (error-t end ✘))"
10131013
"(mutable\nstruct X end)" => "(wrapper (parens mutable (error-t struct X)) (error-t end ✘))"
10141014

1015-
# The following is currently broken but at least the parser shouldn't
1016-
# crash.
1015+
# Lexer vs parser: issues detecting which tokens are string delimiters and
1016+
# detecting raw vs non-raw strings. The old parser was tightly coupled to
1017+
# the lexer and the parser state was used to disambiguate these cases.
10171018
"x in' '" => "(call-i x in (char (error)))"
1019+
"x in'``\$" => "(call-i x in (call-i (juxtapose (char '`' (error-t)) (macrocall core_@cmd (cmdstring-r (error-t)))) \$ (error)))"
1020+
"var\"#\"`str`" => "(juxtapose (var # (error-t)) (macrocall core_@cmd (cmdstring-r \"str\")))"
1021+
"var\"#\"\"str\"" => "(juxtapose (var # (error-t)) (error-t) (string \"str\"))"
10181022
]
10191023

10201024
@testset "Parser does not crash on broken code" begin

0 commit comments

Comments
 (0)