Skip to content

Commit ec705ed

Browse files
authored
Avoid crashing on ambiguous tokenization of char vs adjoint (#251)
The tokenizer can't really solve this case; it needs help from the parser, i.e., tighter coupling of tokenizer and parser. However, the workarounds here avoid crashing the parser and even give the right result in most cases.
1 parent 8fb13c6 commit ec705ed

File tree

2 files changed

+37
-15
lines changed

2 files changed

+37
-15
lines changed

src/parser.jl

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3389,8 +3389,25 @@ function parse_atom(ps::ParseState, check_identifiers=true)
33893389
# char literal
33903390
bump(ps, TRIVIA_FLAG)
33913391
k = peek(ps)
3392-
if k == K"Char"
3393-
bump(ps)
3392+
if k == K"'"
3393+
# '' ==> (char (error))
3394+
bump_invisible(ps, K"error", error="empty character literal")
3395+
elseif k == K"EndMarker"
3396+
# ' ==> (char (error))
3397+
bump_invisible(ps, K"error", error="unterminated character literal")
3398+
else
3399+
if k == K"Char"
3400+
bump(ps)
3401+
else
3402+
# FIXME: This case is actually a tokenization error.
3403+
# Make a best-effort attempt to work around this for now by
3404+
# remapping the kind. This needs to be fixed by rewinding the
3405+
# tokenizer's buffer and re-tokenizing the next token as a
3406+
# char. (A lot of work for a very obscure edge case)
3407+
#
3408+
# x in'c' ==> (call-i x in (char 'c'))
3409+
bump(ps, remap_kind=K"Char")
3410+
end
33943411
if peek(ps) == K"'"
33953412
# 'a' ==> (char 'a')
33963413
# 'α' ==> (char 'α')
@@ -3401,15 +3418,12 @@ function parse_atom(ps::ParseState, check_identifiers=true)
34013418
bump_invisible(ps, K"error", TRIVIA_FLAG,
34023419
error="unterminated character literal")
34033420
end
3404-
elseif k == K"'"
3405-
# '' ==> (char (error))
3406-
bump_invisible(ps, K"error", error="empty character literal")
3407-
else
3408-
# ' ==> (char (error))
3409-
@check k == K"EndMarker"
3410-
bump_invisible(ps, K"error", error="unterminated character literal")
34113421
end
34123422
emit(ps, mark, K"char")
3423+
elseif leading_kind == K"Char"
3424+
# FIXME: This is a tokenization error and should be preceded by
3425+
# K"'". However this workaround is better than emitting a bare Char.
3426+
bump(ps, remap_kind=K"Identifier")
34133427
elseif leading_kind == K":"
34143428
# symbol/expression quote
34153429
# :foo ==> (quote-: foo)

test/parser.jl

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -948,19 +948,27 @@ tests = [
948948
end
949949
end
950950

951-
parseall_test_specs = [
951+
parsestmt_test_specs = [
952952
# whitespace before keywords in space-insensitive mode
953-
"(y::\nif x z end)" => "(toplevel (parens (::-i y (if x (block z)))))"
953+
"(y::\nif x z end)" => "(parens (::-i y (if x (block z))))"
954+
# parsing of tricky primes
955+
"x in'c'" => "(call-i x in (char 'c'))"
956+
"1where'c'" => "(where 1 (char 'c'))"
957+
":+'y'" => "(juxtapose (call-post (quote-: +) ') (call-post y '))"
954958

955959
# The following may not be ideal error recovery! But at least the parser
956960
# shouldn't crash
957-
"@(x y)" => "(toplevel (macrocall (parens @x (error-t y))))"
958-
"|(&\nfunction" => "(toplevel (call | (& (function (error (error)) (block (error)) (error-t))) (error-t)))"
961+
"@(x y)" => "(macrocall (parens @x (error-t y)))"
962+
"|(&\nfunction" => "(call | (& (function (error (error)) (block (error)) (error-t))) (error-t))"
963+
964+
# The following are currently broken but at least the parser shouldn't
965+
# crash.
966+
"x in' '" => "(call-i x in (char (error))) (error-t ')"
959967
]
960968

961969
@testset "Parser does not crash on broken code" begin
962-
@testset "$(repr(input))" for (input, output) in parseall_test_specs
963-
test_parse(JuliaSyntax.parse_toplevel, input, output)
970+
@testset "$(repr(input))" for (input, output) in parsestmt_test_specs
971+
test_parse(JuliaSyntax.parse_stmts, input, output)
964972
end
965973
end
966974

0 commit comments

Comments
 (0)