Commit 36909cd

Basic after-parse tokenization interface (#221)
Implement a `tokenize()` function which retrieves the tokens *after* parsing. Going through the parser isn't hugely more expensive than plain tokenization, and allows us to be more precise and complete. For example, it automatically:

* Determines when contextual keywords are keywords vs identifiers. For example, the `outer` in `outer = 1` is an identifier, but a keyword in `for outer i = 1:10`.
* Validates numeric literals (e.g. detecting overflow cases like `10e1000` and flagging them as errors).
* Splits or combines ambiguous tokens. For example, making the `...` in `import ...A` three separate `.` tokens.
1 parent 2720980 commit 36909cd
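To make the first point concrete, here is a minimal usage sketch of the new interface (the function names are those added in `src/parser_api.jl` below; the expected kinds are taken from the tests added in `test/parser_api.jl`, not from a real run):

```julia
using JuliaSyntax

text = "outer = 1"
# tokenize() runs the full parser and then recovers the token stream from it.
toks = JuliaSyntax.tokenize(text)

# Pair each token's kind with the text it covers.
[JuliaSyntax.kind(t) => JuliaSyntax.untokenize(t, text) for t in toks]
# Expected, per the new tests: `outer` is an Identifier here, not a keyword.
# [K"Identifier" => "outer", K"Whitespace" => " ", K"=" => "=",
#  K"Whitespace" => " ", K"Integer" => "1"]
```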

File tree: 7 files changed (+116 / -54 lines)


src/JuliaSyntax.jl

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@ include("kinds.jl")
 
 # Lexing uses a significantly modified version of Tokenize.jl
 include("tokenize.jl")
-using .Tokenize: Token
 
 # Source and diagnostics
 include("source_files.jl")

src/parser_api.jl

Lines changed: 50 additions & 0 deletions
@@ -148,3 +148,53 @@ parse(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _pars
 parseall(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:toplevel, false, T, text, index; kws...)
 parseatom(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:atom, false, T, text, index; kws...)
 
+#-------------------------------------------------------------------------------
+# Tokens interface
+"""
+Token type resulting from calling `tokenize(text)`
+
+Use
+* `kind(tok)` to get the token kind
+* `untokenize(tok, text)` to retrieve the text
+* Predicates like `is_error(tok)` to query token categories and flags
+"""
+struct Token
+    head::SyntaxHead
+    range::UnitRange{UInt32}
+end
+
+Token() = Token(SyntaxHead(K"None", EMPTY_FLAGS), 0:0)
+
+head(t::Token) = t.head
+
+"""
+    tokenize(text)
+
+Returns the tokenized UTF-8 encoded `text` as a vector of `Token`s. The
+text for the token can be retrieved by using `untokenize()`. The full text can be
+reconstructed with, for example, `join(untokenize.(tokenize(text), text))`.
+
+This interface works on UTF-8 encoded string or buffer data only.
+"""
+function tokenize(text)
+    ps = ParseStream(text)
+    parse!(ps, rule=:toplevel)
+    ts = ps.tokens
+    output_tokens = Token[]
+    for i = 2:length(ts)
+        if kind(ts[i]) == K"TOMBSTONE"
+            continue
+        end
+        r = ts[i-1].next_byte:ts[i].next_byte-1
+        push!(output_tokens, Token(head(ts[i]), r))
+    end
+    output_tokens
+end
+
+function untokenize(token::Token, text::AbstractString)
+    text[first(token.range):thisind(text, last(token.range))]
+end
+
+function untokenize(token::Token, text::Vector{UInt8})
+    text[token.range]
+end
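A short round-trip sketch of the functions added above, following the docstring's `join(untokenize.(tokenize(text), text))` example (the `Ref(buf)` broadcast mirrors the buffer test added in `test/parser_api.jl`):

```julia
using JuliaSyntax

text = "a .+ β"
toks = JuliaSyntax.tokenize(text)

# Each Token stores a byte range into the UTF-8 text, and consecutive tokens
# cover the input without gaps, so joining the per-token text rebuilds it.
@assert join(JuliaSyntax.untokenize.(toks, text)) == text

# The Vector{UInt8} method returns byte slices instead of substrings.
buf = Vector{UInt8}(text)
@assert reduce(vcat, JuliaSyntax.untokenize.(JuliaSyntax.tokenize(buf), Ref(buf))) == buf
```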

src/tokenize.jl

Lines changed: 19 additions & 20 deletions
@@ -12,32 +12,32 @@ include("tokenize_utils.jl")
 #-------------------------------------------------------------------------------
 # Tokens
 
-struct Token
+struct RawToken
     kind::Kind
     # Offsets into a string or buffer
     startbyte::Int # The byte where the token start in the buffer
     endbyte::Int # The byte where the token ended in the buffer
     dotop::Bool
     suffix::Bool
 end
-function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, false, false)
+function RawToken(kind::Kind, startbyte::Int, endbyte::Int)
+    RawToken(kind, startbyte, endbyte, false, false)
 end
-Token() = Token(K"error", 0, 0, false, false)
+RawToken() = RawToken(K"error", 0, 0, false, false)
 
-const EMPTY_TOKEN = Token()
+const EMPTY_TOKEN = RawToken()
 
-kind(t::Token) = t.kind
+kind(t::RawToken) = t.kind
 
-startbyte(t::Token) = t.startbyte
-endbyte(t::Token) = t.endbyte
+startbyte(t::RawToken) = t.startbyte
+endbyte(t::RawToken) = t.endbyte
 
 
-function untokenize(t::Token, str::String)
+function untokenize(t::RawToken, str::String)
     String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
 end
 
-function Base.show(io::IO, t::Token)
+function Base.show(io::IO, t::RawToken)
     print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
     print(io, rpad(kind(t), 15, " "))
 end

@@ -108,18 +108,17 @@ end
 Lexer(str::AbstractString) = Lexer(IOBuffer(str))
 
 """
-    tokenize(x, T = Token)
+    tokenize(x)
 
 Returns an `Iterable` containing the tokenized input. Can be reverted by e.g.
-`join(untokenize.(tokenize(x)))`. Setting `T` chooses the type of token
-produced by the lexer (`Token` or `Token`).
+`join(untokenize.(tokenize(x)))`.
 """
 tokenize(x) = Lexer(x)
 
 # Iterator interface
 Base.IteratorSize(::Type{<:Lexer}) = Base.SizeUnknown()
 Base.IteratorEltype(::Type{<:Lexer}) = Base.HasEltype()
-Base.eltype(::Type{<:Lexer}) = Token
+Base.eltype(::Type{<:Lexer}) = RawToken
 
 
 function Base.iterate(l::Lexer)

@@ -142,7 +141,7 @@ end
 """
     startpos(l::Lexer)
 
-Return the latest `Token`'s starting position.
+Return the latest `RawToken`'s starting position.
 """
 startpos(l::Lexer) = l.token_startpos
 

@@ -193,7 +192,7 @@ Base.seek(l::Lexer, pos) = seek(l.io, pos)
 """
     start_token!(l::Lexer)
 
-Updates the lexer's state such that the next `Token` will start at the current
+Updates the lexer's state such that the next `RawToken` will start at the current
 position.
 """
 function start_token!(l::Lexer)

@@ -251,7 +250,7 @@ end
 """
     emit(l::Lexer, kind::Kind)
 
-Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
+Returns a `RawToken` of kind `kind` with contents `str` and starts a new `RawToken`.
 """
 function emit(l::Lexer, kind::Kind, maybe_op=true)
     suffix = false

@@ -262,7 +261,7 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
         end
     end
 
-    tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix)
+    tok = RawToken(kind, startpos(l), position(l) - 1, l.dotop, suffix)
 
     l.dotop = false
     l.last_token = kind

@@ -272,7 +271,7 @@ end
 """
     emit_error(l::Lexer, err::Kind)
 
-Returns an `K"error"` token with error `err` and starts a new `Token`.
+Returns an `K"error"` token with error `err` and starts a new `RawToken`.
 """
 function emit_error(l::Lexer, err::Kind)
     @assert is_error(err)

@@ -283,7 +282,7 @@ end
 """
     next_token(l::Lexer)
 
-Returns the next `Token`.
+Returns the next `RawToken`.
 """
 function next_token(l::Lexer, start = true)
     start && start_token!(l)
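For contrast with the eager, parser-backed `tokenize` in `parser_api.jl`, the renamed raw lexer remains a lazy iterator; a minimal sketch using the names imported in `test/tokenize.jl` (the expected value in the trailing comment is an assumption, not output from a real run):

```julia
using JuliaSyntax.Tokenize: tokenize, untokenize, RawToken

text = "a .+ β"
# Tokenize.tokenize returns a lazy Lexer; collect it to materialize RawTokens.
raw = collect(tokenize(text))
@assert eltype(raw) == RawToken

# RawTokens carry start/end byte offsets, so per-token text can be recovered.
untokenize(raw[1], text)   # expected: "a" (the leading identifier)
```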

test/fuzz_test.jl

Lines changed: 1 addition & 30 deletions
@@ -1,4 +1,5 @@
 using JuliaSyntax
+using JuliaSyntax: tokenize
 
 # Parser fuzz testing tools.
 

@@ -882,36 +883,6 @@ const cutdown_tokens = [
 ""
 ]
 
-#-------------------------------------------------------------------------------
-
-# Rough tokenization interface.
-# TODO: We should have something like this in parser_api.jl
-
-struct Token2
-    head::JuliaSyntax.SyntaxHead
-    range::UnitRange{UInt32}
-end
-
-function tokenize(text::String)
-    ps = JuliaSyntax.ParseStream(text)
-    JuliaSyntax.parse!(ps, rule=:toplevel)
-    ts = ps.tokens
-    output_tokens = Token2[]
-    for i = 2:length(ts)
-        if JuliaSyntax.kind(ts[i]) == JuliaSyntax.K"TOMBSTONE"
-            continue
-        end
-        r = ts[i-1].next_byte:thisind(text, ts[i].next_byte-1)
-        push!(output_tokens, Token2(JuliaSyntax.head(ts[i]), r))
-    end
-    output_tokens
-end
-
-function split_tokens(text::String)
-    [@view text[t.range] for t in tokenize(text)]
-end
-
-
 #-------------------------------------------------------------------------------
 
 function parser_throws_exception(str)

test/parser_api.jl

Lines changed: 41 additions & 0 deletions
@@ -124,3 +124,44 @@ end
 \e[90m# └┘ ── \e[0;0m\e[91minvalid operator\e[0;0m"""
     end
 end
+
+tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
+
+@testset "tokenize() API" begin
+    # tokenize() is eager
+    @test tokenize("aba") isa Vector{JuliaSyntax.Token}
+
+    # . is a separate token from + in `.+`
+    @test tokensplit("a .+ β") == [
+        K"Identifier" => "a",
+        K"Whitespace" => " ",
+        K"." => ".",
+        K"+" => "+",
+        K"Whitespace" => " ",
+        K"Identifier" => "β",
+    ]
+
+    # Contextual keywords become identifiers where necessary
+    @test tokensplit("outer = 1") == [
+        K"Identifier" => "outer",
+        K"Whitespace" => " ",
+        K"=" => "=",
+        K"Whitespace" => " ",
+        K"Integer" => "1",
+    ]
+
+    # A predicate based on flags()
+    @test JuliaSyntax.is_suffixed(tokenize("+₁")[1])
+
+    # Buffer interface
+    @test tokenize(Vector{UInt8}("a + b")) == tokenize("a + b")
+
+    buf = Vector{UInt8}("a-β")
+    @test untokenize.(tokenize(buf), Ref(buf,)) == [
+        Vector{UInt8}("a"),
+        Vector{UInt8}("-"),
+        Vector{UInt8}("β")
+    ]
+
+    @test kind(JuliaSyntax.Token()) == K"None"
+end

test/test_utils.jl

Lines changed: 3 additions & 1 deletion
@@ -27,7 +27,9 @@ using .JuliaSyntax:
     child,
     fl_parseall,
     fl_parse,
-    highlight
+    highlight,
+    tokenize,
+    untokenize
 
 if VERSION < v"1.6"
     # Compat stuff which might not be in Base for older versions

test/tokenize.jl

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ using JuliaSyntax.Tokenize:
     Tokenize,
     tokenize,
    untokenize,
-    Token
+    RawToken
 
 tok(str, i = 1) = collect(tokenize(str))[i]
 

@@ -321,7 +321,7 @@ end
    @test String(take!(io)) == "1-5 String "
 end
 
-~(tok::Token, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2]
+~(tok::RawToken, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2]
 
 @testset "raw strings" begin
     str = raw""" str"x $ \ y" """
