Commit c279c8e

Consolidate lexer code & dedupe Kind predicates
* Consolidate all Kind-based predicates into kinds.jl, deduplicating the
  functions from Tokenize that were duplicated in JuliaSyntax. The names are
  lightly restyled here for consistency with the other predicates in
  JuliaSyntax.jl.
* Move all of Tokenize into a pair of files and remove the sub-modules, as
  these no longer achieve much now that we use K_str and don't need a
  namespace to contain the token kinds.
1 parent 3223487 commit c279c8e
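
A minimal sketch (not part of the commit) of the renamed predicates in use, assuming they are importable directly from JuliaSyntax the same way the lexer.jl diff below imports them:

# Old Tokenize.Tokens name   -> new JuliaSyntax name
#   isliteral                -> is_literal
#   iserror                  -> is_error
#   iscontextualkeyword      -> is_contextual_keyword
#   iswordoperator           -> is_word_operator
using JuliaSyntax: @K_str, is_error, is_literal   # assumed import path

@assert is_error(K"ErrorEofChar")   # an error kind from the lexer's error table
@assert !is_literal(K"error")       # K"error" is an error kind, not a literal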

11 files changed, +873 -922 lines

src/JuliaSyntax.jl (2 additions, 3 deletions)

@@ -6,11 +6,10 @@ using Mmap
 include("utils.jl")
 
 include("kinds.jl")
+
 # Lexing uses a significantly modified version of Tokenize.jl
 include("Tokenize/Tokenize.jl")
-using .Tokenize.Tokens: Token
-const TzTokens = Tokenize.Tokens
-include("tokens.jl")
+using .Tokenize: Token
 
 # Source and diagnostics
 include("source_files.jl")

src/Tokenize/Tokenize.jl (0 additions, 13 deletions)

@@ -1,15 +1,2 @@
-module Tokenize
-
-if isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
-    @eval Base.Experimental.@optlevel 1
-end
-
-include("token.jl")
 include("lexer.jl")
 
-import .Lexers: tokenize
-import .Tokens: untokenize
-
-export tokenize, untokenize, Tokens
-
-end # module

src/Tokenize/lexer.jl (60 additions, 11 deletions)

@@ -1,12 +1,61 @@
-module Lexers
+module Tokenize
 
-import ..Tokens
-import ..Tokens: @K_str, Token, Kind, UNICODE_OPS, EMPTY_TOKEN,
-    isliteral, iserror, iscontextualkeyword, iswordoperator
+export tokenize, untokenize, Tokens
+
+using ..JuliaSyntax: Kind, @K_str
+
+import ..JuliaSyntax: kind,
+    is_literal, is_error, is_contextual_keyword, is_word_operator
+
+import Base.eof
 
 include("utilities.jl")
 
-export tokenize
+#-------------------------------------------------------------------------------
+# Tokens
+
+# Error kind => description
+TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
+    K"ErrorEofMultiComment" => "unterminated multi-line comment #= ... =#",
+    K"ErrorEofChar" => "unterminated character literal",
+    K"ErrorInvalidNumericConstant" => "invalid numeric constant",
+    K"ErrorInvalidOperator" => "invalid operator",
+    K"ErrorInvalidInterpolationTerminator" => "interpolated variable ends with invalid character; use `\$(...)` instead",
+    K"error" => "unknown error",
+)
+
+struct Token
+    kind::Kind
+    # Offsets into a string or buffer
+    startbyte::Int # The byte where the token start in the buffer
+    endbyte::Int # The byte where the token ended in the buffer
+    dotop::Bool
+    suffix::Bool
+end
+function Token(kind::Kind, startbyte::Int, endbyte::Int)
+    Token(kind, startbyte, endbyte, false, false)
+end
+Token() = Token(K"error", 0, 0, false, false)
+
+const EMPTY_TOKEN = Token()
+
+kind(t::Token) = t.kind
+
+startbyte(t::Token) = t.startbyte
+endbyte(t::Token) = t.endbyte
+
+
+function untokenize(t::Token, str::String)
+    String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
+end
+
+function Base.show(io::IO, t::Token)
+    print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
+    print(io, rpad(kind(t), 15, " "))
+end
+
+#-------------------------------------------------------------------------------
+# Lexer
 
 @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F')
 @inline isbinary(c::Char) = c == '0' || c == '1'
@@ -266,7 +315,7 @@ Returns an `K"error"` token with error `err` and starts a new `Token`.
 """
 function emit_error(l::Lexer, err::Kind = K"error")
     l.errored = true
-    @assert iserror(err)
+    @assert is_error(err)
     return emit(l, err)
 end
 
@@ -838,14 +887,14 @@ end
 
 function lex_prime(l, doemit = true)
     if l.last_token == K"Identifier" ||
-        iscontextualkeyword(l.last_token) ||
-        iswordoperator(l.last_token) ||
+        is_contextual_keyword(l.last_token) ||
+        is_word_operator(l.last_token) ||
         l.last_token == K"." ||
         l.last_token == K")" ||
         l.last_token == K"]" ||
        l.last_token == K"}" ||
         l.last_token == K"'" ||
-        l.last_token == K"end" || isliteral(l.last_token)
+        l.last_token == K"end" || is_literal(l.last_token)
         return emit(l, K"'")
     else
         if accept(l, '\'')
@@ -888,8 +937,8 @@ end
 # A '"' has been consumed
 function lex_quote(l::Lexer)
     raw = l.last_token == K"Identifier" ||
-          iscontextualkeyword(l.last_token) ||
-          iswordoperator(l.last_token)
+          is_contextual_keyword(l.last_token) ||
+          is_word_operator(l.last_token)
     pc, dpc = dpeekchar(l)
     triplestr = pc == '"' && dpc == '"'
     push!(l.string_states, StringState(triplestr, raw, '"', 0))
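
Given the Token layout and untokenize definition in this diff, byte offsets are zero-based and inclusive, hence the 1 .+ shift onto Julia's 1-based codeunits. A hypothetical round trip (import paths assumed):

using JuliaSyntax: @K_str
using JuliaSyntax.Tokenize: Token, untokenize   # assumed import path

str = "foo = 1"
t = Token(K"Identifier", 0, 2)        # startbyte..endbyte = 0..2 covers "foo"
@assert untokenize(t, str) == "foo"   # slices codeunits(str)[1 .+ (0:2)]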

src/Tokenize/token.jl (0 additions, 65 deletions)

@@ -1,65 +0,0 @@
-module Tokens
-
-using ...JuliaSyntax: Kind, @K_str
-
-import Base.eof
-
-export Token
-
-include("token_kinds.jl")
-
-
-iskeyword(k::Kind) = K"BEGIN_KEYWORDS" < k < K"END_KEYWORDS"
-isliteral(k::Kind) = K"BEGIN_LITERAL" < k < K"END_LITERAL"
-isoperator(k::Kind) = K"BEGIN_OPS" < k < K"END_OPS"
-iserror(k::Kind) = K"BEGIN_ERRORS" < k < K"END_ERRORS"
-iscontextualkeyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" < k < K"END_CONTEXTUAL_KEYWORDS"
-
-function iswordoperator(k::Kind)
-    # Keyword-like operators
-    k == K"in" ||
-    k == K"isa" ||
-    k == K"where"
-end
-
-# Error kind => description
-TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
-    K"ErrorEofMultiComment" => "unterminated multi-line comment #= ... =#",
-    K"ErrorEofChar" => "unterminated character literal",
-    K"ErrorInvalidNumericConstant" => "invalid numeric constant",
-    K"ErrorInvalidOperator" => "invalid operator",
-    K"ErrorInvalidInterpolationTerminator" => "interpolated variable ends with invalid character; use `\$(...)` instead",
-    K"error" => "unknown error",
-)
-
-struct Token
-    kind::Kind
-    # Offsets into a string or buffer
-    startbyte::Int # The byte where the token start in the buffer
-    endbyte::Int # The byte where the token ended in the buffer
-    dotop::Bool
-    suffix::Bool
-end
-function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, false, false)
-end
-Token() = Token(K"error", 0, 0, false, false)
-
-const EMPTY_TOKEN = Token()
-
-exactkind(t::Token) = t.kind
-
-startbyte(t::Token) = t.startbyte
-endbyte(t::Token) = t.endbyte
-
-
-function untokenize(t::Token, str::String)
-    String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
-end
-
-function Base.show(io::IO, t::Token)
-    print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
-    print(io, rpad(exactkind(t), 15, " "))
-end
-
-end # module
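
The predicates deleted above are the ones the commit message says move into src/kinds.jl, whose diff is not shown on this page. A sketch of their presumed post-commit form, with bodies copied from the deleted file and names restyled to snake_case:

# Presumed contents of src/kinds.jl after this commit (assumption: that
# file's diff is not captured here; bodies are from the deleted token.jl):
is_keyword(k::Kind)  = K"BEGIN_KEYWORDS" < k < K"END_KEYWORDS"
is_literal(k::Kind)  = K"BEGIN_LITERAL" < k < K"END_LITERAL"
is_operator(k::Kind) = K"BEGIN_OPS" < k < K"END_OPS"
is_error(k::Kind)    = K"BEGIN_ERRORS" < k < K"END_ERRORS"
is_contextual_keyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" < k < K"END_CONTEXTUAL_KEYWORDS"

# Keyword-like operators
is_word_operator(k::Kind) = k == K"in" || k == K"isa" || k == K"where"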
