Skip to content

Commit eb9fd05

Browse files
authored
Emit sensible errors for invalid operator tokens (#176)
Here we emit invalid token errors during the token validation pass. This ensures any invalid token after parsing is guaranteed to have one single error emitted for it, independent of how it's handled by the parser. Add the special K"Error**" kind to allow us to emit a specific error for the `**` operator. Ensure we reject all the invalid operators which are rejected by the reference parser, including the following which were missing: * `..+` and similar * `<---` Also add an extra rule for parsing invalid binary operators (at some arbitrarily-chosen precedence) to improve the recovered parse tree.
1 parent 5ee03f2 commit eb9fd05

File tree

10 files changed

+101
-50
lines changed

10 files changed

+101
-50
lines changed

src/hooks.jl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,12 @@ function _incomplete_tag(n::SyntaxNode)
3535
return :none
3636
end
3737
# TODO: Check error hits last character
38-
if kind(c) == K"error" && begin
38+
if kind(c) == K"ErrorEofMultiComment"
39+
return :comment
40+
elseif kind(c) == K"error" && begin
3941
cs = children(c)
4042
length(cs) > 0
4143
end
42-
k1 = kind(cs[1])
43-
if k1 == K"ErrorEofMultiComment"
44-
return :comment
45-
end
4644
for cc in cs
4745
if kind(cc) == K"error"
4846
return :other

src/kinds.jl

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ const _kind_names =
1616
# Tokenization errors
1717
"ErrorEofMultiComment"
1818
"ErrorInvalidNumericConstant"
19-
"ErrorInvalidOperator"
2019
"ErrorInvalidInterpolationTerminator"
20+
"ErrorNumericOverflow"
21+
"ErrorInvalidEscapeSequence"
22+
"ErrorOverLongCharacter"
2123
# Generic error
2224
"error"
2325
"END_ERRORS"
@@ -94,6 +96,9 @@ const _kind_names =
9496
"END_DELIMITERS"
9597

9698
"BEGIN_OPS"
99+
"ErrorInvalidOperator"
100+
"Error**"
101+
97102
"..."
98103

99104
# Level 1
@@ -1009,8 +1014,11 @@ const _nonunique_kind_names = Set([
10091014

10101015
K"ErrorEofMultiComment"
10111016
K"ErrorInvalidNumericConstant"
1012-
K"ErrorInvalidOperator"
10131017
K"ErrorInvalidInterpolationTerminator"
1018+
K"ErrorNumericOverflow"
1019+
K"ErrorInvalidEscapeSequence"
1020+
K"ErrorOverLongCharacter"
1021+
K"ErrorInvalidOperator"
10141022

10151023
K"Integer"
10161024
K"BinInt"
@@ -1049,7 +1057,7 @@ end
10491057
#-------------------------------------------------------------------------------
10501058
# Predicates
10511059
is_contextual_keyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" < k < K"END_CONTEXTUAL_KEYWORDS"
1052-
is_error(k::Kind) = K"BEGIN_ERRORS" < k < K"END_ERRORS"
1060+
is_error(k::Kind) = K"BEGIN_ERRORS" < k < K"END_ERRORS" || k == K"ErrorInvalidOperator" || k == K"Error**"
10531061
is_keyword(k::Kind) = K"BEGIN_KEYWORDS" < k < K"END_KEYWORDS"
10541062
is_block_continuation_keyword(k::Kind) = K"BEGIN_BLOCK_CONTINUATION_KEYWORDS" < k < K"END_BLOCK_CONTINUATION_KEYWORDS"
10551063
is_literal(k::Kind) = K"BEGIN_LITERAL" < k < K"END_LITERAL"

src/parse_stream.jl

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ function Base.summary(head::SyntaxHead)
7272
end
7373

7474
function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true)
75-
str = is_error(kind(head)) ? "error" : untokenize(kind(head); unique=unique)::String
75+
str = (is_error(kind(head)) ? untokenize(kind(head); unique=false) :
76+
untokenize(kind(head); unique=unique))::String
7677
if is_dotted(head)
7778
str = "."*str
7879
end
@@ -850,7 +851,7 @@ end
850851
#-------------------------------------------------------------------------------
851852
# ParseStream Post-processing
852853

853-
function validate_literal_tokens(stream::ParseStream)
854+
function validate_tokens(stream::ParseStream)
854855
text = sourcetext(stream)
855856
toks = stream.tokens
856857
charbuf = IOBuffer()
@@ -860,7 +861,7 @@ function validate_literal_tokens(stream::ParseStream)
860861
fbyte = toks[i-1].next_byte
861862
nbyte = t.next_byte
862863
lbyte = prevind(text, t.next_byte)
863-
had_error = false
864+
error_kind = K"None"
864865
if k in KSet"Integer BinInt OctInt HexInt"
865866
# The following shouldn't be able to error...
866867
# parse_int_literal
@@ -882,7 +883,7 @@ function validate_literal_tokens(stream::ParseStream)
882883
elseif code == :overflow
883884
emit_diagnostic(stream, fbyte, lbyte,
884885
error="overflow in floating point literal")
885-
had_error = true
886+
error_kind = K"ErrorNumericOverflow"
886887
elseif underflow0
887888
emit_diagnostic(stream, fbyte, lbyte,
888889
warning="underflow to zero in floating point literal")
@@ -892,21 +893,30 @@ function validate_literal_tokens(stream::ParseStream)
892893
truncate(charbuf, 0)
893894
had_error = unescape_julia_string(charbuf, text, fbyte,
894895
nbyte, stream.diagnostics)
895-
if !had_error
896+
if had_error
897+
error_kind = K"ErrorInvalidEscapeSequence"
898+
else
896899
seek(charbuf,0)
897900
read(charbuf, Char)
898901
if !eof(charbuf)
899-
had_error = true
902+
error_kind = K"ErrorOverLongCharacter"
900903
emit_diagnostic(stream, fbyte, lbyte,
901904
error="character literal contains multiple characters")
902905
end
903906
end
904907
elseif k == K"String" && !has_flags(t, RAW_STRING_FLAG)
905908
had_error = unescape_julia_string(devnull, text, fbyte,
906909
nbyte, stream.diagnostics)
910+
if had_error
911+
error_kind = K"ErrorInvalidEscapeSequence"
912+
end
913+
elseif is_error(k) && k != K"error"
914+
# Emit messages for non-generic token errors
915+
emit_diagnostic(stream, fbyte, lbyte,
916+
error=Tokenize.TOKEN_ERROR_DESCRIPTION[k])
907917
end
908-
if had_error
909-
toks[i] = SyntaxToken(SyntaxHead(K"error", EMPTY_FLAGS),
918+
if error_kind != K"None"
919+
toks[i] = SyntaxToken(SyntaxHead(error_kind, EMPTY_FLAGS),
910920
t.orig_kind, t.next_byte)
911921
end
912922
end

src/parser.jl

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -829,15 +829,15 @@ end
829829
# flisp: parse-range
830830
function parse_range(ps::ParseState)
831831
mark = position(ps)
832-
parse_expr(ps)
832+
parse_invalid_ops(ps)
833833
initial_tok = peek_token(ps)
834834
initial_kind = kind(initial_tok)
835835
if initial_kind != K":" && is_prec_colon(initial_kind)
836836
# a..b ==> (call-i a .. b)
837837
# a … b ==> (call-i a … b)
838838
# a .… b ==> (dotcall-i a … b)
839839
bump_dotsplit(ps)
840-
parse_expr(ps)
840+
parse_invalid_ops(ps)
841841
emit(ps, mark, is_dotted(initial_tok) ? K"dotcall" : K"call", INFIX_FLAG)
842842
elseif initial_kind == K":" && ps.range_colon_enabled
843843
# a ? b : c:d ==> (? a b (call-i c : d))
@@ -864,7 +864,7 @@ function parse_range(ps::ParseState)
864864
bump(ps) # K"<" or K">"
865865
emit(ps, emark, K"error",
866866
error="Invalid `:$ks` found, maybe replace with `$ks:`")
867-
parse_expr(ps)
867+
parse_invalid_ops(ps)
868868
emit(ps, mark, K"call", INFIX_FLAG)
869869
break
870870
end
@@ -891,7 +891,7 @@ function parse_range(ps::ParseState)
891891
emit(ps, mark, K"call", INFIX_FLAG)
892892
return
893893
end
894-
parse_expr(ps)
894+
parse_invalid_ops(ps)
895895
if n_colons == 2
896896
emit(ps, mark, K"call", INFIX_FLAG)
897897
n_colons = 0
@@ -911,6 +911,23 @@ function parse_range(ps::ParseState)
911911
end
912912
end
913913

914+
# Parse invalid binary operators
915+
#
916+
# Having this is unnecessary, but it improves error messages and the
917+
# error-containing parse tree.
918+
#
919+
# a--b ==> (call-i a (error) b)
920+
function parse_invalid_ops(ps::ParseState)
921+
mark = position(ps)
922+
parse_expr(ps)
923+
while (t = peek_token(ps); kind(t) in KSet"ErrorInvalidOperator Error**")
924+
bump_trivia(ps)
925+
bump_dotsplit(ps)
926+
parse_expr(ps)
927+
emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
928+
end
929+
end
930+
914931
# a - b - c ==> (call-i (call-i a - b) - c)
915932
# a + b + c ==> (call-i a + b c)
916933
# a .+ b ==> (dotcall-i a + b)
@@ -3518,6 +3535,10 @@ function parse_atom(ps::ParseState, check_identifiers=true)
35183535
"premature end of input" :
35193536
"unexpected closing token"
35203537
bump_invisible(ps, K"error", error=msg)
3538+
elseif is_error(leading_kind)
3539+
# Errors for bad tokens are emitted in validate_tokens() rather than
3540+
# here.
3541+
bump(ps)
35213542
else
35223543
bump(ps, error="invalid syntax atom")
35233544
end

src/parser_api.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ function parse!(stream::ParseStream; rule::Symbol=:toplevel)
5151
else
5252
throw(ArgumentError("Unknown grammar rule $rule"))
5353
end
54-
validate_literal_tokens(stream)
54+
validate_tokens(stream)
5555
stream
5656
end
5757

src/tokenize.jl

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@ include("tokenize_utils.jl")
1616
TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
1717
K"ErrorEofMultiComment" => "unterminated multi-line comment #= ... =#",
1818
K"ErrorInvalidNumericConstant" => "invalid numeric constant",
19-
K"ErrorInvalidOperator" => "invalid operator",
2019
K"ErrorInvalidInterpolationTerminator" => "interpolated variable ends with invalid character; use `\$(...)` instead",
21-
K"error" => "unknown error",
20+
K"ErrorNumericOverflow"=>"overflow in numeric literal",
21+
K"ErrorInvalidEscapeSequence"=>"invalid string escape sequence",
22+
K"ErrorOverLongCharacter"=>"character literal contains multiple characters",
23+
K"ErrorInvalidOperator" => "invalid operator",
24+
K"Error**" => "use `x^y` instead of `x**y` for exponentiation, and `x...` instead of `**x` for splatting",
25+
K"error" => "unknown error token",
2226
)
2327

2428
struct Token
@@ -618,10 +622,14 @@ function lex_less(l::Lexer)
618622
return emit(l, K"<|")
619623
elseif dpeekchar(l) == ('-', '-')
620624
readchar(l); readchar(l)
621-
if accept(l, '>')
622-
return emit(l, K"<-->")
625+
if accept(l, '-')
626+
return emit_error(l, K"ErrorInvalidOperator")
623627
else
624-
return emit(l, K"<--")
628+
if accept(l, '>')
629+
return emit(l, K"<-->")
630+
else
631+
return emit(l, K"<--")
632+
end
625633
end
626634
else
627635
return emit(l, K"<")
@@ -713,7 +721,7 @@ end
713721

714722
function lex_star(l::Lexer)
715723
if accept(l, '*')
716-
return emit_error(l, K"ErrorInvalidOperator") # "**" is an invalid operator use ^
724+
return emit_error(l, K"Error**") # "**" is an invalid operator use ^
717725
elseif accept(l, '=')
718726
return emit(l, K"*=")
719727
end
@@ -811,7 +819,7 @@ function lex_digit(l::Lexer, kind)
811819
accept(l, "+-−")
812820
if accept_batch(l, isdigit)
813821
pc,ppc = dpeekchar(l)
814-
if pc === '.' && !dotop2(ppc, ' ')
822+
if pc === '.' && !dotop2(ppc)
815823
accept(l, '.')
816824
return emit_error(l, K"ErrorInvalidNumericConstant")
817825
end
@@ -829,7 +837,7 @@ function lex_digit(l::Lexer, kind)
829837
accept(l, "+-−")
830838
if accept_batch(l, isdigit)
831839
pc,ppc = dpeekchar(l)
832-
if pc === '.' && !dotop2(ppc, ' ')
840+
if pc === '.' && !dotop2(ppc)
833841
accept(l, '.')
834842
return emit_error(l, K"ErrorInvalidNumericConstant")
835843
end
@@ -959,7 +967,12 @@ function lex_dot(l::Lexer)
959967
if accept(l, '.')
960968
return emit(l, K"...")
961969
else
962-
return emit(l, K"..")
970+
if dotop2(peekchar(l))
971+
readchar(l)
972+
return emit_error(l, K"ErrorInvalidOperator")
973+
else
974+
return emit(l, K"..")
975+
end
963976
end
964977
elseif Base.isdigit(peekchar(l))
965978
return lex_digit(l, K"Float")

src/tokenize_utils.jl

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -147,25 +147,23 @@ readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char)
147147
0x0000ffe9 <= c <= 0x0000ffec
148148
end
149149

150-
function dotop2(pc, dpc)
150+
function dotop2(pc)
151151
dotop1(pc) ||
152152
pc =='+' ||
153153
pc =='-' ||
154+
pc =='−' ||
154155
pc =='*' ||
155156
pc =='/' ||
156157
pc =='\\' ||
157158
pc =='^' ||
158159
pc =='<' ||
159160
pc =='>' ||
160-
pc =='&' && dpc === '=' ||
161161
pc =='&' ||
162162
pc =='%' ||
163-
pc == '=' && dpc != '>' ||
164-
pc == '|' && dpc != '|' ||
165-
pc == '!' && dpc == '=' ||
163+
pc == '=' ||
164+
pc == '|' ||
166165
pc == '⊻' ||
167-
pc == '÷' ||
168-
pc == '=' && dpc == '>'
166+
pc == '÷'
169167
end
170168

171169
# suffix operators

test/parser.jl

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
function test_parse(production, code; v=v"1.6", expr=false)
22
stream = ParseStream(code, version=v)
33
production(ParseState(stream))
4-
JuliaSyntax.validate_literal_tokens(stream)
4+
JuliaSyntax.validate_tokens(stream)
55
t = build_tree(GreenNode, stream, wrap_toplevel_as_kind=K"None")
66
source = SourceFile(code)
77
s = SyntaxNode(source, t)
@@ -127,6 +127,9 @@ tests = [
127127
"x:y..." => "(... (call-i x : y))"
128128
"x..y..." => "(... (call-i x .. y))"
129129
],
130+
JuliaSyntax.parse_invalid_ops => [
131+
"a--b" => "(call-i a (ErrorInvalidOperator) b)"
132+
],
130133
JuliaSyntax.parse_expr => [
131134
"a - b - c" => "(call-i (call-i a - b) - c)"
132135
"a + b + c" => "(call-i a + b c)"
@@ -870,11 +873,11 @@ tests = [
870873
],
871874
JuliaSyntax.parse_atom => [
872875
# errors in literals
873-
"\"\\xqqq\"" => "(string (error))"
874-
"'ab'" => "(char (error))"
875-
"'\\xq'" => "(char (error))"
876-
"10.0e1000'" => "(error)"
877-
"10.0f100'" => "(error)"
876+
"\"\\xqqq\"" => "(string (ErrorInvalidEscapeSequence))"
877+
"'\\xq'" => "(char (ErrorInvalidEscapeSequence))"
878+
"'ab'" => "(char (ErrorOverLongCharacter))"
879+
"10.0e1000'" => "(ErrorNumericOverflow)"
880+
"10.0f100'" => "(ErrorNumericOverflow)"
878881
],
879882
JuliaSyntax.parse_docstring => [
880883
""" "notdoc" ] """ => "(string \"notdoc\")"

test/test_utils.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ for debugging.
311311
function itest_parse(production, code; version::VersionNumber=v"1.6")
312312
stream = ParseStream(code; version=version)
313313
production(JuliaSyntax.ParseState(stream))
314-
JuliaSyntax.validate_literal_tokens(stream)
314+
JuliaSyntax.validate_tokens(stream)
315315
t = JuliaSyntax.build_tree(GreenNode, stream, wrap_toplevel_as_kind=K"toplevel")
316316

317317
println(stdout, "# Code:\n$code\n")

test/tokenize.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ end
231231

232232
@test toks("#=# text=#") == ["#=# text=#"=>K"Comment"]
233233

234+
@test toks("#= #= =#") == ["#= #= =#"=>K"ErrorEofMultiComment"]
234235
@test toks("#=#==#=#") == ["#=#==#=#"=>K"Comment"]
235236
@test toks("#=#==#=") == ["#=#==#="=>K"ErrorEofMultiComment"]
236237
end
@@ -316,11 +317,6 @@ end
316317
@test length(collect(tokenize("x)"))) == 3
317318
end
318319

319-
@testset "errors" begin
320-
@test tok("#= #= =#", 1).kind == K"ErrorEofMultiComment"
321-
@test tok("aa **", 3).kind == K"ErrorInvalidOperator"
322-
end
323-
324320
@testset "xor_eq" begin
325321
@test tok("1 ⊻= 2", 3).kind==K"⊻="
326322
end
@@ -772,7 +768,11 @@ end
772768
test_error(tok("0b3",1), K"ErrorInvalidNumericConstant")
773769
test_error(tok("0op",1), K"ErrorInvalidNumericConstant")
774770
test_error(tok("--",1), K"ErrorInvalidOperator")
775-
test_error(tok("1**2",2), K"ErrorInvalidOperator")
771+
772+
@test toks("1**2") == ["1"=>K"Integer", "**"=>K"Error**", "2"=>K"Integer"]
773+
@test toks("a<---b") == ["a"=>K"Identifier", "<---"=>K"ErrorInvalidOperator", "b"=>K"Identifier"]
774+
@test toks("a..+b") == ["a"=>K"Identifier", "..+"=>K"ErrorInvalidOperator", "b"=>K"Identifier"]
775+
@test toks("a..−b") == ["a"=>K"Identifier", "..−"=>K"ErrorInvalidOperator", "b"=>K"Identifier"]
776776
end
777777

778778
@testset "hat suffix" begin

0 commit comments

Comments
 (0)