Fix various numeric literal token errors (#196)

c42f · web-flow · commit dae2d23afcd4 · 2023-02-14T14:24:42.000+10:00
A fairly big refactor of numeric literal tokenization error cases and a
couple of other tokenizer errors ported from the flisp code.

* For hexfloat, emit a more specific error when the `p` suffix is
  missing.
* For octal, hex and binary, add errors for trailing invalid digits or
  identifier characters like `0b123` and `0xenomorph`
* Emit an error for ambiguous numeric constants with dot suffix vs
  juxtaposition like `1.(`
* Emit an error for underscore directly after dot as in `1._`
* Emit an error for hexfloat without digits `0x.p0`
* Add an invalid operator error for `<---` for compatibility with
  the reference parser.
diff --git a/src/kinds.jl b/src/kinds.jl
@@ -16,7 +16,9 @@ const _kind_names =
         # Tokenization errors
         "ErrorEofMultiComment"
         "ErrorInvalidNumericConstant"
+        "ErrorHexFloatMustContainP"
         "ErrorAmbiguousNumericConstant"
+        "ErrorAmbiguousNumericDotMultiply"
         "ErrorInvalidInterpolationTerminator"
         "ErrorNumericOverflow"
         "ErrorInvalidEscapeSequence"
@@ -1016,7 +1018,9 @@ const _nonunique_kind_names = Set([
 
     K"ErrorEofMultiComment"
     K"ErrorInvalidNumericConstant"
+    K"ErrorHexFloatMustContainP"
     K"ErrorAmbiguousNumericConstant"
+    K"ErrorAmbiguousNumericDotMultiply"
     K"ErrorInvalidInterpolationTerminator"
     K"ErrorNumericOverflow"
     K"ErrorInvalidEscapeSequence"
@@ -1061,7 +1065,9 @@ end
 _token_error_descriptions = Dict{Kind, String}(
     K"ErrorEofMultiComment" => "unterminated multi-line comment #= ... =#",
     K"ErrorInvalidNumericConstant" => "invalid numeric constant",
+    K"ErrorHexFloatMustContainP" => "hex float literal must contain `p` or `P`",
     K"ErrorAmbiguousNumericConstant" => "ambiguous `.` syntax; add whitespace to clarify (eg `1.+2` might be `1.0+2` or `1 .+ 2`)",
+    K"ErrorAmbiguousNumericDotMultiply" => "numeric constant cannot be implicitly multiplied because it ends with `.`",
     K"ErrorInvalidInterpolationTerminator" => "interpolated variable ends with invalid character; use `\$(...)` instead",
     K"ErrorNumericOverflow"=>"overflow in numeric literal",
     K"ErrorInvalidEscapeSequence"=>"invalid string escape sequence",
diff --git a/src/parser.jl b/src/parser.jl
@@ -1103,9 +1103,6 @@ function is_juxtapose(ps, prev_k, t)
          !(is_block_form(prev_k)         ||
            is_syntactic_unary_op(prev_k) ||
            is_initial_reserved_word(ps, prev_k) )))  &&
-    # https://github.com/JuliaLang/julia/issues/16356
-    # 0xenomorph  ==>  0x0e
-    !(prev_k in KSet"BinInt HexInt OctInt" && (k == K"Identifier" || is_keyword(k))) &&
     (!is_operator(k) || is_radical_op(k))            &&
     !is_closing_token(ps, k)                         &&
     !is_initial_reserved_word(ps, k)
diff --git a/src/tokenize.jl b/src/tokenize.jl
@@ -614,6 +614,8 @@ function lex_less(l::Lexer)
         else
             if accept(l, '>')
                 return emit(l, K"<-->")
+            elseif accept(l, '-')
+                return emit_error(l, K"ErrorInvalidOperator")
             else
                 return emit(l, K"<--")
             end
@@ -772,33 +774,13 @@ function lex_digit(l::Lexer, kind)
             return emit_error(l, K"ErrorInvalidNumericConstant")
         elseif is_operator_start_char(ppc) && ppc !== ':'
             readchar(l)
-            return emit_error(l, K"ErrorAmbiguousNumericConstant")
-        elseif (!(isdigit(ppc) ||
-            iswhitespace(ppc) ||
-            is_identifier_start_char(ppc)
-            || ppc == '('
-            || ppc == ')'
-            || ppc == '['
-            || ppc == ']'
-            || ppc == '{'
-            || ppc == '}'
-            || ppc == ','
-            || ppc == ';'
-            || ppc == '@'
-            || ppc == '`'
-            || ppc == '"'
-            || ppc == ':'
-            || ppc == '?'
-            || ppc == '#'
-            || ppc == EOF_CHAR))
-            kind = K"Integer"
-
-            return emit(l, kind)
+            return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
         end
         readchar(l)
 
         kind = K"Float"
-        accept_number(l, isdigit)
+        accept(l, '_') && return emit_error(l, K"ErrorInvalidNumericConstant") # `1._`
+        had_fraction_digs = accept_number(l, isdigit)
         pc, ppc = dpeekchar(l)
         if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
             kind = pc == 'f' ? K"Float32" : K"Float"
@@ -807,17 +789,20 @@ function lex_digit(l::Lexer, kind)
             if accept_batch(l, isdigit)
                 pc,ppc = dpeekchar(l)
                 if pc === '.' && !dotop2(ppc)
-                    accept(l, '.')
-                    return emit_error(l, K"ErrorInvalidNumericConstant")
+                    readchar(l)
+                    return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
                 end
             else
-                return emit_error(l, K"ErrorInvalidNumericConstant")
+                return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
             end
-        elseif pc == '.' && (is_identifier_start_char(ppc) || ppc == EOF_CHAR)
+        elseif pc == '.' && ppc != '.' && !is_operator_start_char(ppc)
             readchar(l)
-            return emit_error(l, K"ErrorInvalidNumericConstant")
+            return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
+        elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
+                                      pc == '(' || pc == '[' || pc == '{' ||
+                                      pc == '@' || pc == '`' || pc == '"')
+            return emit_error(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
         end
-
     elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
         kind = pc == 'f' ? K"Float32" : K"Float"
         readchar(l)
@@ -826,44 +811,54 @@ function lex_digit(l::Lexer, kind)
             pc,ppc = dpeekchar(l)
             if pc === '.' && !dotop2(ppc)
                 accept(l, '.')
-                return emit_error(l, K"ErrorInvalidNumericConstant")
+                return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
             end
         else
-            return emit_error(l, K"ErrorInvalidNumericConstant")
+            return emit_error(l, K"ErrorInvalidNumericConstant") # `1e+`
         end
     elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
         kind == K"Integer"
+        is_bin_oct_hex_int = false
         if pc == 'x'
             kind = K"HexInt"
             isfloat = false
             readchar(l)
-            !(ishex(ppc) || ppc == '.') && return emit_error(l, K"ErrorInvalidNumericConstant")
-            accept_number(l, ishex)
+            had_digits = accept_number(l, ishex)
             pc,ppc = dpeekchar(l)
             if pc == '.' && ppc != '.'
                 readchar(l)
-                accept_number(l, ishex)
+                had_digits |= accept_number(l, ishex)
                 isfloat = true
             end
             if accept(l, "pP")
                 kind = K"Float"
                 accept(l, "+-−")
-                if !accept_number(l, isdigit)
-                    return emit_error(l, K"ErrorInvalidNumericConstant")
+                if !accept_number(l, isdigit) || !had_digits
+                    return emit_error(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
                 end
             elseif isfloat
-                return emit_error(l, K"ErrorInvalidNumericConstant")
+                return emit_error(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
             end
+            is_bin_oct_hex_int = !isfloat
         elseif pc == 'b'
-            !isbinary(ppc) && return emit_error(l, K"ErrorInvalidNumericConstant")
             readchar(l)
-            accept_number(l, isbinary)
+            had_digits = accept_number(l, isbinary)
             kind = K"BinInt"
+            is_bin_oct_hex_int = true
         elseif pc == 'o'
-            !isoctal(ppc) && return emit_error(l, K"ErrorInvalidNumericConstant")
             readchar(l)
-            accept_number(l, isoctal)
+            had_digits = accept_number(l, isoctal)
             kind = K"OctInt"
+            is_bin_oct_hex_int = true
+        end
+        if is_bin_oct_hex_int
+            pc = peekchar(l)
+            if !had_digits || isdigit(pc) || is_identifier_start_char(pc)
+                accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
+                # `0x` `0xg` `0x_` `0x-`
+                # `0b123` `0o78p` `0xenomorph` `0xaα`
+                return emit_error(l, K"ErrorInvalidNumericConstant")
+            end
         end
     end
     return emit(l, kind)
diff --git a/src/utils.jl b/src/utils.jl
@@ -121,3 +121,7 @@ function _fl_parse_string(text::AbstractString, filename::AbstractString,
     ex, offset+1
 end
 
+# Convenience methods mirroring `JuliaSyntax.parse(Expr, ...)`: accept a leading `Expr` type and forward all remaining arguments.
+fl_parse(::Type{Expr}, args...; kws...) = fl_parse(args...; kws...)
+fl_parseall(::Type{Expr}, args...; kws...) = fl_parseall(args...; kws...)
+
diff --git a/test/parser.jl b/test/parser.jl
@@ -193,7 +193,6 @@ tests = [
         "sqrt(2)2"  =>  "(call sqrt 2)"
         "x' y"      =>  "(call-post x ')"
         "x 'y"      =>  "x"
-        "0xenomorph" => "0x0e"
     ],
     JuliaSyntax.parse_unary => [
         ":T"       => "(quote T)"
@@ -950,17 +949,6 @@ broken_tests = [
         "@!x"   => "(macrocall @! x)"
         "@..x"  => "(macrocall @.. x)"
         "@.x"   => "(macrocall @__dot__ x)"
-        # Invalid numeric literals, not juxtaposition
-        "0b12" => "(error \"0b12\")"
-        "0xex" => "(error \"0xex\")"
-        # Bad character literals
-        "'\\xff'"  => "(error '\\xff')"
-        "'\\x80'"  => "(error '\\x80')"
-        "'ab'"     => "(error 'ab')"
-    ]
-    JuliaSyntax.parse_juxtapose => [
-        # Want: "numeric constant \"10.\" cannot be implicitly multiplied because it ends with \".\""
-        "10.x" => "(error (call * 10.0 x))"
     ]
 ]
 
diff --git a/test/tokenize.jl b/test/tokenize.jl