
Commit f310f60

Authored by Kristoffer Carlsson
get rid of the TokenError field in Token (#17)
1 parent d193baa commit f310f60

File tree: 8 files changed (+57, -51 lines)


README.md

Lines changed: 3 additions & 4 deletions
````diff
@@ -405,7 +405,7 @@ name of compatibility, perhaps with a warning.)
   broken-looking AST like `(macrocall (. A (quote (. B @x))))`. It should
   probably be rejected.
 * Operator prefix call syntax doesn't work in the cases like `+(a;b,c)` where
-  keyword parameters are separated by commas. A tuple is produced instead.
+  keyword parameters are separated by commas. A tuple is produced instead.
 * `const` and `global` allow chained assignment, but the right hand side is not
   constant. `a` const here but not `b`.
   ```
@@ -698,7 +698,7 @@ interface. Could we have `Expr2` wrap `SyntaxNode`?
   tree library (rowan) for representing of a non-rust toy language is here
   https://dev.to/cad97/lossless-syntax-trees-280c
 
-  Not all the design decisions in `rust-analyzer` are finalized but the
+  Not all the design decisions in `rust-analyzer` are finalized but the
   [architecture document](https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/architecture.md)
   is a fantastic source of design inspiration.
 
@@ -772,7 +772,7 @@ The tree datastructure design here is tricky:
    parentheses in `2*(x + y)` and the explicit vs implicit multiplication
    symbol in `2*x` vs `2x`.
 
-2. There's various type of *analyses*
+2. There's various type of *analyses*
   - There's many useful ways to augment a syntax tree depending on use case.
   - Analysis algorithms should be able to act on any tree type, ignoring
     but carrying augmentations which they don't know about.
@@ -983,4 +983,3 @@ indentation from the syntax tree? Source formatting involves a big pile of
 heuristics to get something which "looks nice"... and ML systems have become
 very good at heuristics. Also, we've got huge piles of training data — just
 choose some high quality, tastefully hand-formatted libraries.
-
````

Tokenize/benchmark/lex_base.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ function speed_test()
         while !Tokenize.Lexers.eof(l)
             t = Tokenize.Lexers.next_token(l)
             tot_tokens += 1
-            if t.kind == Tokens.ERROR
+            if Tokens.iserror(t.kind)
                 tot_errors += 1
             end
         end
```
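
Downstream, the migration is mechanical: test against the error *range* with `Tokens.iserror` instead of comparing to the single `ERROR` kind. A minimal sketch of the benchmark's inner loop as a standalone function (the input string is made up for illustration, and the vendored `Tokenize` module is assumed to be loadable as in the benchmark):

```julia
using Tokenize
import Tokenize: Tokens

function count_lex_errors(str::AbstractString)
    l = Tokenize.Lexers.Lexer(str)
    n = 0
    while !Tokenize.Lexers.eof(l)
        t = Tokenize.Lexers.next_token(l)
        # The error cause now *is* the token kind, so a range membership
        # check replaces the old `t.kind == Tokens.ERROR` comparison.
        Tokens.iserror(t.kind) && (n += 1)
    end
    return n
end

count_lex_errors("0b3 + aa ** bb")  # expect 2: a bad numeric constant and `**`
```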

Tokenize/src/lexer.jl

Lines changed: 11 additions & 8 deletions
```diff
@@ -3,7 +3,7 @@ module Lexers
 include("utilities.jl")
 
 import ..Tokens
-import ..Tokens: Token, Kind, TokenError, UNICODE_OPS, EMPTY_TOKEN, isliteral
+import ..Tokens: Token, Kind, UNICODE_OPS, EMPTY_TOKEN, isliteral
 
 import ..Tokens: FUNCTION, ABSTRACT, IDENTIFIER, BAREMODULE, BEGIN, BREAK, CATCH, CONST, CONTINUE,
     DO, ELSE, ELSEIF, END, EXPORT, FALSE, FINALLY, FOR, FUNCTION, GLOBAL, LET, LOCAL, IF,
@@ -52,6 +52,7 @@ mutable struct Lexer{IO_t <: IO}
     charspos::Tuple{Int,Int,Int,Int}
     doread::Bool
     dotop::Bool
+    errored::Bool
 end
 
 function Lexer(io::IO)
@@ -80,7 +81,7 @@ function Lexer(io::IO)
     end
     Lexer(io, position(io), 1, 1, position(io), 1, 1, position(io),
           Tokens.ERROR, Vector{StringState}(), IOBuffer(),
-          (c1,c2,c3,c4), (p1,p2,p3,p4), false, false)
+          (c1,c2,c3,c4), (p1,p2,p3,p4), false, false, false)
 end
 Lexer(str::AbstractString) = Lexer(IOBuffer(str))
 
@@ -243,11 +244,11 @@ Consumes all following characters until `accept(l, f)` is `false`.
 end
 
 """
-    emit(l::Lexer, kind::Kind, err::TokenError=Tokens.NO_ERR)
+    emit(l::Lexer, kind::Kind)
 
 Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
 """
-function emit(l::Lexer, kind::Kind, err::TokenError = Tokens.NO_ERR)
+function emit(l::Lexer, kind::Kind)
     suffix = false
     if optakessuffix(kind)
         while isopsuffix(peekchar(l))
@@ -256,20 +257,22 @@ function emit(l::Lexer, kind::Kind, err::TokenError = Tokens.NO_ERR)
         end
     end
 
-    tok = Token(kind, startpos(l), position(l) - 1, err, l.dotop, suffix)
+    tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix)
 
     l.dotop = false
     l.last_token = kind
     return tok
 end
 
 """
-    emit_error(l::Lexer, err::TokenError=Tokens.UNKNOWN)
+    emit_error(l::Lexer, err::Kind=Tokens.ERROR)
 
 Returns an `ERROR` token with error `err` and starts a new `Token`.
 """
-function emit_error(l::Lexer, err::TokenError = Tokens.UNKNOWN)
-    return emit(l, Tokens.ERROR, err)
+function emit_error(l::Lexer, err::Kind = Tokens.ERROR)
+    l.errored = true
+    @assert Tokens.iserror(err)
+    return emit(l, err)
 end
 
 
```
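
Call sites in the lexing rules change shape accordingly: where a rule used to pair the generic `ERROR` kind with a `TokenError` payload, it now passes the specific error kind straight through. A hypothetical rule showing the before/after (the function name is illustrative, not one of the real rules in this file):

```julia
# Hypothetical lexing rule, for illustration only.
function lex_bad_exponent(l::Lexer)
    # Before this commit:
    #     return emit(l, Tokens.ERROR, Tokens.INVALID_NUMERIC_CONSTANT)
    # After: the cause is the kind itself. emit_error also records the
    # failure on the lexer (l.errored = true) and asserts the kind lies
    # in the error range before delegating to the plain emit.
    return emit_error(l, Tokens.INVALID_NUMERIC_CONSTANT)
end
```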

Tokenize/src/token.jl

Lines changed: 6 additions & 17 deletions
```diff
@@ -10,7 +10,7 @@ include("token_kinds.jl")
 iskeyword(k::Kind) = begin_keywords < k < end_keywords
 isliteral(k::Kind) = begin_literal < k < end_literal
 isoperator(k::Kind) = begin_ops < k < end_ops
-
+iserror(k::Kind) = begin_errors < k < end_errors
 iscontextualkeyword(k::Kind) = begin_contextual_keywords < k < end_contextual_keywords
 
 function iswordoperator(k::Kind)
@@ -32,46 +32,35 @@ function _add_kws()
 end
 _add_kws()
 
-# TODO: more
-@enum(TokenError,
-    NO_ERR,
-    EOF_MULTICOMMENT,
-    EOF_CHAR,
-    INVALID_NUMERIC_CONSTANT,
-    INVALID_OPERATOR,
-    INVALID_INTERPOLATION_TERMINATOR,
-    UNKNOWN,
-)
-
 # Error kind => description
-TOKEN_ERROR_DESCRIPTION = Dict{TokenError, String}(
+TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
     EOF_MULTICOMMENT => "unterminated multi-line comment #= ... =#",
     EOF_CHAR => "unterminated character literal",
     INVALID_NUMERIC_CONSTANT => "invalid numeric constant",
     INVALID_OPERATOR => "invalid operator",
     INVALID_INTERPOLATION_TERMINATOR => "interpolated variable ends with invalid character; use `\$(...)` instead",
-    UNKNOWN => "unknown",
+    ERROR => "unknown error",
 )
 
 struct Token
     kind::Kind
     # Offsets into a string or buffer
     startbyte::Int # The byte where the token start in the buffer
     endbyte::Int # The byte where the token ended in the buffer
-    token_error::TokenError
     dotop::Bool
     suffix::Bool
 end
 function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, NO_ERR, false, false)
+    Token(kind, startbyte, endbyte, false, false)
 end
-Token() = Token(ERROR, 0, 0, UNKNOWN, false, false)
+Token() = Token(ERROR, 0, 0, false, false)
 
 const EMPTY_TOKEN = Token()
 
 function kind(t::Token)
     isoperator(t.kind) && return OP
     iskeyword(t.kind) && return KEYWORD
+    iserror(t.kind) && return ERROR
     return t.kind
 end
 exactkind(t::Token) = t.kind
```
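
With the `token_error` field gone, a token's exact failure lives in its `kind` field, and the two accessors divide the work: `kind(t)` folds every error kind into the generic `ERROR` bucket, mirroring what it already does with `OP` and `KEYWORD`, while `exactkind(t)` returns the stored kind untouched. A sketch, assuming the names are in scope inside the `Tokens` module:

```julia
# Token(kind, startbyte, endbyte, dotop, suffix) per the new struct layout.
t = Token(INVALID_NUMERIC_CONSTANT, 0, 1, false, false)

kind(t)          # ERROR: the coarse category, like OP for operators
exactkind(t)     # INVALID_NUMERIC_CONSTANT: the precise cause
iserror(t.kind)  # true
```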

Tokenize/src/token_kinds.jl

Lines changed: 9 additions & 1 deletion
```diff
@@ -1,14 +1,22 @@
 @enum(Kind::UInt16,
     NONE, # Placeholder; never emitted by lexer
     ENDMARKER, # EOF
-    ERROR,
     COMMENT, # aadsdsa, #= fdsf #=
     WHITESPACE, # '\n \t'
     IDENTIFIER, # foo, Σxx
     AT_SIGN, # @
     COMMA, #,
     SEMICOLON, # ;
 
+    begin_errors,
+    EOF_MULTICOMMENT,
+    EOF_CHAR,
+    INVALID_NUMERIC_CONSTANT,
+    INVALID_OPERATOR,
+    INVALID_INTERPOLATION_TERMINATOR,
+    ERROR,
+    end_errors,
+
     begin_keywords,
     KEYWORD, # general
     BAREMODULE,
```
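
Placing the error kinds between the `begin_errors`/`end_errors` sentinels is what lets `iserror` be a single range comparison on the enum's integer value, the same pattern the file already uses for keywords, literals, and operators. The generic `ERROR` sits inside the range on purpose, so it still classifies as an error:

```julia
# From token.jl above; the sentinels are markers, never emitted by the lexer.
iserror(k::Kind) = begin_errors < k < end_errors

iserror(INVALID_OPERATOR)  # true
iserror(ERROR)             # true, the generic kind is inside the range
iserror(COMMENT)           # false
```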

Tokenize/test/lexer.jl

Lines changed: 25 additions & 19 deletions
```diff
@@ -183,7 +183,7 @@ end
     test_roundtrip("1234.0 .+1", Tokens.FLOAT, "1234.0")
     test_roundtrip("1234.f(a)", Tokens.FLOAT, "1234.")
     test_roundtrip("1234 .f(a)", Tokens.INTEGER, "1234")
-    test_roundtrip("1234.0.f(a)", Tokens.ERROR, "1234.0.")
+    test_roundtrip("1234.0.f(a)", Tokens.INVALID_NUMERIC_CONSTANT, "1234.0.")
     test_roundtrip("1234.0 .f(a)", Tokens.FLOAT, "1234.0")
 end
 
@@ -280,9 +280,9 @@
 end
 
 @testset "errors" begin
-    @test tok("#= #= =#", 1).kind == T.ERROR
-    @test tok("'dsadsa", 1).kind == T.ERROR
-    @test tok("aa **", 3).kind == T.ERROR
+    @test tok("#= #= =#", 1).kind == T.EOF_MULTICOMMENT
+    @test tok("'dsadsa", 1).kind == T.EOF_CHAR
+    @test tok("aa **", 3).kind == T.INVALID_OPERATOR
 end
 
 @testset "xor_eq" begin
@@ -501,9 +501,10 @@
     str = """ "\$x෴" """
     ts = collect(tokenize(str))
     @test ts[4] ~ (T.IDENTIFIER , "x" , str)
-    @test ts[5] ~ (T.ERROR , "" , str)
+    @test ts[5] ~ (T.INVALID_INTERPOLATION_TERMINATOR , "" , str)
     @test ts[6] ~ (T.STRING , "" , str)
-    @test ts[5].token_error == Tokens.INVALID_INTERPOLATION_TERMINATOR
+    @test Tokens.iserror(ts[5].kind)
+    @test ts[5].kind == Tokens.INVALID_INTERPOLATION_TERMINATOR
 end
 end
 
@@ -650,10 +651,10 @@
 end
 
 @testset "hex/bin/octal errors" begin
-    @test tok("0x").kind == T.ERROR
-    @test tok("0b").kind == T.ERROR
-    @test tok("0o").kind == T.ERROR
-    @test tok("0x 2", 1).kind == T.ERROR
+    @test tok("0x").kind == T.INVALID_NUMERIC_CONSTANT
+    @test tok("0b").kind == T.INVALID_NUMERIC_CONSTANT
+    @test tok("0o").kind == T.INVALID_NUMERIC_CONSTANT
+    @test tok("0x 2", 1).kind == T.INVALID_NUMERIC_CONSTANT
     @test tok("0x.1p1").kind == T.FLOAT
 end
 
@@ -716,15 +717,20 @@
     @test tok("outer", 1).kind==T.OUTER
 end
 
+function test_error(tok, kind)
+    @test Tokens.iserror(tok.kind)
+    @test tok.kind == kind
+end
+
 @testset "token errors" begin
-    @test tok("1.2e2.3",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("1.2.",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("1.2.f",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("0xv",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("0b3",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("0op",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("--",1).token_error === Tokens.INVALID_OPERATOR
-    @test tok("1**2",2).token_error === Tokens.INVALID_OPERATOR
+    test_error(tok("1.2e2.3",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("1.2.",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("1.2.f",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("0xv",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("0b3",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("0op",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("--",1), Tokens.INVALID_OPERATOR)
+    test_error(tok("1**2",2), Tokens.INVALID_OPERATOR)
 end
 
 @testset "hat suffix" begin
@@ -765,7 +771,7 @@ end
 
 @testset "invalid float" begin
     s = ".0."
-    @test collect(tokenize(s))[1].kind == Tokens.ERROR
+    @test collect(tokenize(s))[1].kind == Tokens.INVALID_NUMERIC_CONSTANT
 end
 
 @testset "allow prime after end" begin
```
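
Read together, the updated tests show the new user-facing flow: the exact failure is read straight off the token's kind, and `TOKEN_ERROR_DESCRIPTION`, now keyed by `Kind`, still maps it to a message. A usage sketch along the same lines (assuming the vendored `Tokenize` is reachable as in the tests):

```julia
using Tokenize
import Tokenize: Tokens

t = first(collect(Tokenize.tokenize("1.2e2.3")))
Tokens.iserror(t.kind)                      # true
t.kind == Tokens.INVALID_NUMERIC_CONSTANT   # true
Tokens.TOKEN_ERROR_DESCRIPTION[t.kind]      # "invalid numeric constant"
```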

src/parse_stream.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -61,7 +61,7 @@ function Base.summary(head::SyntaxHead)
 end
 
 function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true)
-    str = untokenize(kind(head); unique=unique)
+    str = is_error(kind(head)) ? "error" : untokenize(kind(head); unique=unique)
     if is_dotted(head)
         str = "."*str
     end
```
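
The guard is needed because the specific error kinds have no canonical source text for `untokenize(kind)` to return, so an error head now renders with a plain "error" label (still picking up the `.` prefix when the head is dotted). The equivalent logic in isolation, with an illustrative name rather than the real `SyntaxHead` API:

```julia
# Illustrative stand-alone version of the fallback.
head_name(k) = is_error(k) ? "error" : untokenize(k)
```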

src/tokens.jl

Lines changed: 1 addition & 0 deletions
```diff
@@ -43,6 +43,7 @@ kind(raw::TzTokens.Token) = TzTokens.exactkind(raw)
 # Some renaming for naming consistency
 is_literal(k) = TzTokens.isliteral(kind(k))
 is_keyword(k) = TzTokens.iskeyword(kind(k))
+is_error(k) = TzTokens.iserror(kind(k))
 is_contextual_keyword(k) = TzTokens.iscontextualkeyword(kind(k))
 is_operator(k) = TzTokens.isoperator(kind(k))
 is_word_operator(k) = TzTokens.iswordoperator(kind(k))
```
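
These thin wrappers normalize through `kind` first (note `kind(raw::TzTokens.Token) = TzTokens.exactkind(raw)` in the hunk context), so a raw token can be passed directly; whether a bare `Kind` also works depends on a `kind(k::Kind) = k` identity method that is assumed here, not shown in the diff:

```julia
raw = first(collect(Tokenize.tokenize("--")))  # lexes as INVALID_OPERATOR
is_error(raw)  # true: kind(raw) yields the exact kind, which is in the error range
```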
