
Commit f310f60

Authored by Kristoffer Carlsson
get rid of the TokenError field in Token (#17)
1 parent d193baa commit f310f60

File tree: 8 files changed (+57, -51 lines)


README.md

Lines changed: 3 additions & 4 deletions
````diff
@@ -405,7 +405,7 @@ name of compatibility, perhaps with a warning.)
   broken-looking AST like `(macrocall (. A (quote (. B @x))))`. It should
   probably be rejected.
 * Operator prefix call syntax doesn't work in the cases like `+(a;b,c)` where
-  keyword parameters are separated by commas. A tuple is produced instead.
+  keyword parameters are separated by commas. A tuple is produced instead.
 * `const` and `global` allow chained assignment, but the right hand side is not
   constant. `a` const here but not `b`.
   ```
@@ -698,7 +698,7 @@ interface. Could we have `Expr2` wrap `SyntaxNode`?
   tree library (rowan) for representing of a non-rust toy language is here
   https://dev.to/cad97/lossless-syntax-trees-280c
 
-  Not all the design decisions in `rust-analyzer` are finalized but the
+  Not all the design decisions in `rust-analyzer` are finalized but the
   [architecture document](https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/architecture.md)
   is a fantastic source of design inspiration.
 
@@ -772,7 +772,7 @@ The tree datastructure design here is tricky:
    parentheses in `2*(x + y)` and the explicit vs implicit multiplication
    symbol in `2*x` vs `2x`.
 
-2. There's various type of *analyses*
+2. There's various type of *analyses*
   - There's many useful ways to augment a syntax tree depending on use case.
   - Analysis algorithms should be able to act on any tree type, ignoring
     but carrying augmentations which they don't know about.
@@ -983,4 +983,3 @@ indentation from the syntax tree? Source formatting involves a big pile of
 heuristics to get something which "looks nice"... and ML systems have become
 very good at heuristics. Also, we've got huge piles of training data — just
 choose some high quality, tastefully hand-formatted libraries.
-
````

Tokenize/benchmark/lex_base.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ function speed_test()
         while !Tokenize.Lexers.eof(l)
             t = Tokenize.Lexers.next_token(l)
             tot_tokens += 1
-            if t.kind == Tokens.ERROR
+            if Tokens.iserror(t.kind)
                 tot_errors += 1
             end
         end
```
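
Downstream, the migration is mechanical: test against the error *range* with `Tokens.iserror` instead of comparing to the single `ERROR` kind. A minimal sketch of the benchmark's inner loop as a standalone function (the input string is made up for illustration, and the vendored `Tokenize` module is assumed to be loadable as in the benchmark):

```julia
using Tokenize
import Tokenize: Tokens

function count_lex_errors(str::AbstractString)
    l = Tokenize.Lexers.Lexer(str)
    n = 0
    while !Tokenize.Lexers.eof(l)
        t = Tokenize.Lexers.next_token(l)
        # The error cause now *is* the token kind, so a range membership
        # check replaces the old `t.kind == Tokens.ERROR` comparison.
        Tokens.iserror(t.kind) && (n += 1)
    end
    return n
end

count_lex_errors("0b3 + aa ** bb")  # expect 2: a bad numeric constant and `**`
```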

Tokenize/src/lexer.jl

Lines changed: 11 additions & 8 deletions
```diff
@@ -3,7 +3,7 @@ module Lexers
 include("utilities.jl")
 
 import ..Tokens
-import ..Tokens: Token, Kind, TokenError, UNICODE_OPS, EMPTY_TOKEN, isliteral
+import ..Tokens: Token, Kind, UNICODE_OPS, EMPTY_TOKEN, isliteral
 
 import ..Tokens: FUNCTION, ABSTRACT, IDENTIFIER, BAREMODULE, BEGIN, BREAK, CATCH, CONST, CONTINUE,
     DO, ELSE, ELSEIF, END, EXPORT, FALSE, FINALLY, FOR, FUNCTION, GLOBAL, LET, LOCAL, IF,
@@ -52,6 +52,7 @@ mutable struct Lexer{IO_t <: IO}
     charspos::Tuple{Int,Int,Int,Int}
     doread::Bool
     dotop::Bool
+    errored::Bool
 end
 
 function Lexer(io::IO)
@@ -80,7 +81,7 @@ function Lexer(io::IO)
     end
     Lexer(io, position(io), 1, 1, position(io), 1, 1, position(io),
           Tokens.ERROR, Vector{StringState}(), IOBuffer(),
-          (c1,c2,c3,c4), (p1,p2,p3,p4), false, false)
+          (c1,c2,c3,c4), (p1,p2,p3,p4), false, false, false)
 end
 Lexer(str::AbstractString) = Lexer(IOBuffer(str))
 
@@ -243,11 +244,11 @@ Consumes all following characters until `accept(l, f)` is `false`.
 end
 
 """
-    emit(l::Lexer, kind::Kind, err::TokenError=Tokens.NO_ERR)
+    emit(l::Lexer, kind::Kind)
 
 Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
 """
-function emit(l::Lexer, kind::Kind, err::TokenError = Tokens.NO_ERR)
+function emit(l::Lexer, kind::Kind)
     suffix = false
     if optakessuffix(kind)
         while isopsuffix(peekchar(l))
@@ -256,20 +257,22 @@ function emit(l::Lexer, kind::Kind, err::TokenError = Tokens.NO_ERR)
         end
     end
 
-    tok = Token(kind, startpos(l), position(l) - 1, err, l.dotop, suffix)
+    tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix)
 
     l.dotop = false
     l.last_token = kind
     return tok
 end
 
 """
-    emit_error(l::Lexer, err::TokenError=Tokens.UNKNOWN)
+    emit_error(l::Lexer, err::Kind=Tokens.ERROR)
 
 Returns an `ERROR` token with error `err` and starts a new `Token`.
 """
-function emit_error(l::Lexer, err::TokenError = Tokens.UNKNOWN)
-    return emit(l, Tokens.ERROR, err)
+function emit_error(l::Lexer, err::Kind = Tokens.ERROR)
+    l.errored = true
+    @assert Tokens.iserror(err)
+    return emit(l, err)
 end
 
 
```
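
Call sites in the lexing rules change shape accordingly: where a rule used to pair the generic `ERROR` kind with a `TokenError` payload, it now passes the specific error kind straight through. A hypothetical rule showing the before/after (the function name is illustrative, not one of the real rules in this file):

```julia
# Hypothetical lexing rule, for illustration only.
function lex_bad_exponent(l::Lexer)
    # Before this commit:
    #     return emit(l, Tokens.ERROR, Tokens.INVALID_NUMERIC_CONSTANT)
    # After: the cause is the kind itself. emit_error also records the
    # failure on the lexer (l.errored = true) and asserts the kind lies
    # in the error range before delegating to the plain emit.
    return emit_error(l, Tokens.INVALID_NUMERIC_CONSTANT)
end
```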

Tokenize/src/token.jl

Lines changed: 6 additions & 17 deletions
```diff
@@ -10,7 +10,7 @@ include("token_kinds.jl")
 iskeyword(k::Kind) = begin_keywords < k < end_keywords
 isliteral(k::Kind) = begin_literal < k < end_literal
 isoperator(k::Kind) = begin_ops < k < end_ops
-
+iserror(k::Kind) = begin_errors < k < end_errors
 iscontextualkeyword(k::Kind) = begin_contextual_keywords < k < end_contextual_keywords
 
 function iswordoperator(k::Kind)
@@ -32,46 +32,35 @@ function _add_kws()
 end
 _add_kws()
 
-# TODO: more
-@enum(TokenError,
-    NO_ERR,
-    EOF_MULTICOMMENT,
-    EOF_CHAR,
-    INVALID_NUMERIC_CONSTANT,
-    INVALID_OPERATOR,
-    INVALID_INTERPOLATION_TERMINATOR,
-    UNKNOWN,
-)
-
 # Error kind => description
-TOKEN_ERROR_DESCRIPTION = Dict{TokenError, String}(
+TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
     EOF_MULTICOMMENT => "unterminated multi-line comment #= ... =#",
     EOF_CHAR => "unterminated character literal",
     INVALID_NUMERIC_CONSTANT => "invalid numeric constant",
     INVALID_OPERATOR => "invalid operator",
     INVALID_INTERPOLATION_TERMINATOR => "interpolated variable ends with invalid character; use `\$(...)` instead",
-    UNKNOWN => "unknown",
+    ERROR => "unknown error",
 )
 
 struct Token
     kind::Kind
     # Offsets into a string or buffer
     startbyte::Int # The byte where the token start in the buffer
     endbyte::Int # The byte where the token ended in the buffer
-    token_error::TokenError
     dotop::Bool
     suffix::Bool
 end
 function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, NO_ERR, false, false)
+    Token(kind, startbyte, endbyte, false, false)
 end
-Token() = Token(ERROR, 0, 0, UNKNOWN, false, false)
+Token() = Token(ERROR, 0, 0, false, false)
 
 const EMPTY_TOKEN = Token()
 
 function kind(t::Token)
     isoperator(t.kind) && return OP
     iskeyword(t.kind) && return KEYWORD
+    iserror(t.kind) && return ERROR
     return t.kind
 end
 exactkind(t::Token) = t.kind
```
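
With the `token_error` field gone, a token's exact failure lives in its `kind` field, and the two accessors divide the work: `kind(t)` folds every error kind into the generic `ERROR` bucket, mirroring what it already does with `OP` and `KEYWORD`, while `exactkind(t)` returns the stored kind untouched. A sketch, assuming the names are in scope inside the `Tokens` module:

```julia
# Token(kind, startbyte, endbyte, dotop, suffix) per the new struct layout.
t = Token(INVALID_NUMERIC_CONSTANT, 0, 1, false, false)

kind(t)          # ERROR: the coarse category, like OP for operators
exactkind(t)     # INVALID_NUMERIC_CONSTANT: the precise cause
iserror(t.kind)  # true
```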

Tokenize/src/token_kinds.jl

Lines changed: 9 additions & 1 deletion
```diff
@@ -1,14 +1,22 @@
 @enum(Kind::UInt16,
     NONE, # Placeholder; never emitted by lexer
     ENDMARKER, # EOF
-    ERROR,
     COMMENT, # aadsdsa, #= fdsf #=
     WHITESPACE, # '\n \t'
     IDENTIFIER, # foo, Σxx
     AT_SIGN, # @
     COMMA, #,
     SEMICOLON, # ;
 
+    begin_errors,
+    EOF_MULTICOMMENT,
+    EOF_CHAR,
+    INVALID_NUMERIC_CONSTANT,
+    INVALID_OPERATOR,
+    INVALID_INTERPOLATION_TERMINATOR,
+    ERROR,
+    end_errors,
+
     begin_keywords,
     KEYWORD, # general
     BAREMODULE,
```
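
Placing the error kinds between the `begin_errors`/`end_errors` sentinels is what lets `iserror` be a single range comparison on the enum's integer value, the same pattern the file already uses for keywords, literals, and operators. The generic `ERROR` sits inside the range on purpose, so it still classifies as an error:

```julia
# From token.jl above; the sentinels are markers, never emitted by the lexer.
iserror(k::Kind) = begin_errors < k < end_errors

iserror(INVALID_OPERATOR)  # true
iserror(ERROR)             # true, the generic kind is inside the range
iserror(COMMENT)           # false
```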

Tokenize/test/lexer.jl

Lines changed: 25 additions & 19 deletions
```diff
@@ -183,7 +183,7 @@ end
     test_roundtrip("1234.0 .+1", Tokens.FLOAT, "1234.0")
     test_roundtrip("1234.f(a)", Tokens.FLOAT, "1234.")
     test_roundtrip("1234 .f(a)", Tokens.INTEGER, "1234")
-    test_roundtrip("1234.0.f(a)", Tokens.ERROR, "1234.0.")
+    test_roundtrip("1234.0.f(a)", Tokens.INVALID_NUMERIC_CONSTANT, "1234.0.")
     test_roundtrip("1234.0 .f(a)", Tokens.FLOAT, "1234.0")
 end
 
@@ -280,9 +280,9 @@
 end
 
 @testset "errors" begin
-    @test tok("#= #= =#", 1).kind == T.ERROR
-    @test tok("'dsadsa", 1).kind == T.ERROR
-    @test tok("aa **", 3).kind == T.ERROR
+    @test tok("#= #= =#", 1).kind == T.EOF_MULTICOMMENT
+    @test tok("'dsadsa", 1).kind == T.EOF_CHAR
+    @test tok("aa **", 3).kind == T.INVALID_OPERATOR
 end
 
 @testset "xor_eq" begin
@@ -501,9 +501,10 @@
     str = """ "\$x෴" """
     ts = collect(tokenize(str))
     @test ts[4] ~ (T.IDENTIFIER , "x" , str)
-    @test ts[5] ~ (T.ERROR , "" , str)
+    @test ts[5] ~ (T.INVALID_INTERPOLATION_TERMINATOR , "" , str)
     @test ts[6] ~ (T.STRING , "" , str)
-    @test ts[5].token_error == Tokens.INVALID_INTERPOLATION_TERMINATOR
+    @test Tokens.iserror(ts[5].kind)
+    @test ts[5].kind == Tokens.INVALID_INTERPOLATION_TERMINATOR
 end
 end
 
@@ -650,10 +651,10 @@
 end
 
 @testset "hex/bin/octal errors" begin
-    @test tok("0x").kind == T.ERROR
-    @test tok("0b").kind == T.ERROR
-    @test tok("0o").kind == T.ERROR
-    @test tok("0x 2", 1).kind == T.ERROR
+    @test tok("0x").kind == T.INVALID_NUMERIC_CONSTANT
+    @test tok("0b").kind == T.INVALID_NUMERIC_CONSTANT
+    @test tok("0o").kind == T.INVALID_NUMERIC_CONSTANT
+    @test tok("0x 2", 1).kind == T.INVALID_NUMERIC_CONSTANT
     @test tok("0x.1p1").kind == T.FLOAT
 end
 
@@ -716,15 +717,20 @@
     @test tok("outer", 1).kind==T.OUTER
 end
 
+function test_error(tok, kind)
+    @test Tokens.iserror(tok.kind)
+    @test tok.kind == kind
+end
+
 @testset "token errors" begin
-    @test tok("1.2e2.3",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("1.2.",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("1.2.f",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("0xv",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("0b3",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("0op",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
-    @test tok("--",1).token_error === Tokens.INVALID_OPERATOR
-    @test tok("1**2",2).token_error === Tokens.INVALID_OPERATOR
+    test_error(tok("1.2e2.3",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("1.2.",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("1.2.f",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("0xv",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("0b3",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("0op",1), Tokens.INVALID_NUMERIC_CONSTANT)
+    test_error(tok("--",1), Tokens.INVALID_OPERATOR)
+    test_error(tok("1**2",2), Tokens.INVALID_OPERATOR)
 end
 
 @testset "hat suffix" begin
@@ -765,7 +771,7 @@ end
 
 @testset "invalid float" begin
     s = ".0."
-    @test collect(tokenize(s))[1].kind == Tokens.ERROR
+    @test collect(tokenize(s))[1].kind == Tokens.INVALID_NUMERIC_CONSTANT
 end
 
 @testset "allow prime after end" begin
```
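
Read together, the updated tests show the new user-facing flow: the exact failure is read straight off the token's kind, and `TOKEN_ERROR_DESCRIPTION`, now keyed by `Kind`, still maps it to a message. A usage sketch along the same lines (assuming the vendored `Tokenize` is reachable as in the tests):

```julia
using Tokenize
import Tokenize: Tokens

t = first(collect(Tokenize.tokenize("1.2e2.3")))
Tokens.iserror(t.kind)                      # true
t.kind == Tokens.INVALID_NUMERIC_CONSTANT   # true
Tokens.TOKEN_ERROR_DESCRIPTION[t.kind]      # "invalid numeric constant"
```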

src/parse_stream.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -61,7 +61,7 @@ function Base.summary(head::SyntaxHead)
 end
 
 function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true)
-    str = untokenize(kind(head); unique=unique)
+    str = is_error(kind(head)) ? "error" : untokenize(kind(head); unique=unique)
     if is_dotted(head)
         str = "."*str
     end
```
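
The guard is needed because the specific error kinds have no canonical source text for `untokenize(kind)` to return, so an error head now renders with a plain "error" label (still picking up the `.` prefix when the head is dotted). The equivalent logic in isolation, with an illustrative name rather than the real `SyntaxHead` API:

```julia
# Illustrative stand-alone version of the fallback.
head_name(k) = is_error(k) ? "error" : untokenize(k)
```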

src/tokens.jl

Lines changed: 1 addition & 0 deletions
```diff
@@ -43,6 +43,7 @@ kind(raw::TzTokens.Token) = TzTokens.exactkind(raw)
 # Some renaming for naming consistency
 is_literal(k) = TzTokens.isliteral(kind(k))
 is_keyword(k) = TzTokens.iskeyword(kind(k))
+is_error(k) = TzTokens.iserror(kind(k))
 is_contextual_keyword(k) = TzTokens.iscontextualkeyword(kind(k))
 is_operator(k) = TzTokens.isoperator(kind(k))
 is_word_operator(k) = TzTokens.iswordoperator(kind(k))
```
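
These thin wrappers normalize through `kind` first (note `kind(raw::TzTokens.Token) = TzTokens.exactkind(raw)` in the hunk context), so a raw token can be passed directly; whether a bare `Kind` also works depends on a `kind(k::Kind) = k` identity method that is assumed here, not shown in the diff:

```julia
raw = first(collect(Tokenize.tokenize("--")))  # lexes as INVALID_OPERATOR
is_error(raw)  # true: kind(raw) yields the exact kind, which is in the error range
```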
