Commit 2f43f69

Basic after-parse tokenization interface (#221)
Implement a `tokenize()` function which retrieves the tokens *after* parsing. Going through the parser isn't hugely more expensive than plain tokenization, and allows us to be more precise and complete. For example, it automatically:

* Determines when contextual keywords are keywords vs identifiers. For example, the `outer` in `outer = 1` is an identifier, but a keyword in `for outer i = 1:10`.
* Validates numeric literals (e.g. detecting overflow cases like `10e1000` and flagging them as errors).
* Splits or combines ambiguous tokens. For example, making the `...` in `import ...A` three separate `.` tokens.
1 parent fec6f3f commit 2f43f69
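
A quick illustration of the new interface (a sketch assembled from the docstring and tests in this diff, not a captured REPL session):

using JuliaSyntax: tokenize, untokenize, kind

text = "outer = 1"
[kind(t) => untokenize(t, text) for t in tokenize(text)]
# [K"Identifier" => "outer", K"Whitespace" => " ", K"=" => "=",
#  K"Whitespace" => " ", K"Integer" => "1"]

# In `for outer i = 1:10` the same word lexes as a keyword, and the full
# source text always round-trips through the token ranges:
loop = "for outer i = 1:10\nend"
join(untokenize.(tokenize(loop), loop)) == loop   # true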

File tree: 7 files changed (+116 / -54 lines)

src/JuliaSyntax.jl

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@ include("kinds.jl")
 
 # Lexing uses a significantly modified version of Tokenize.jl
 include("tokenize.jl")
-using .Tokenize: Token
 
 # Source and diagnostics
 include("source_files.jl")

src/parser_api.jl

Lines changed: 50 additions & 0 deletions
@@ -148,3 +148,53 @@ parse(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _pars
 parseall(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:toplevel, false, T, text, index; kws...)
 parseatom(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:atom, false, T, text, index; kws...)
 
+#-------------------------------------------------------------------------------
+# Tokens interface
+"""
+Token type resulting from calling `tokenize(text)`
+
+Use
+* `kind(tok)` to get the token kind
+* `untokenize(tok, text)` to retrieve the text
+* Predicates like `is_error(tok)` to query token categories and flags
+"""
+struct Token
+    head::SyntaxHead
+    range::UnitRange{UInt32}
+end
+
+Token() = Token(SyntaxHead(K"None", EMPTY_FLAGS), 0:0)
+
+head(t::Token) = t.head
+
+"""
+    tokenize(text)
+
+Returns the tokenized UTF-8 encoded `text` as a vector of `Token`s. The
+text for the token can be retrieved by using `untokenize()`. The full text can be
+reconstructed with, for example, `join(untokenize.(tokenize(text), text))`.
+
+This interface works on UTF-8 encoded string or buffer data only.
+"""
+function tokenize(text)
+    ps = ParseStream(text)
+    parse!(ps, rule=:toplevel)
+    ts = ps.tokens
+    output_tokens = Token[]
+    for i = 2:length(ts)
+        if kind(ts[i]) == K"TOMBSTONE"
+            continue
+        end
+        r = ts[i-1].next_byte:ts[i].next_byte-1
+        push!(output_tokens, Token(head(ts[i]), r))
+    end
+    output_tokens
+end
+
+function untokenize(token::Token, text::AbstractString)
+    text[first(token.range):thisind(text, last(token.range))]
+end
+
+function untokenize(token::Token, text::Vector{UInt8})
+    text[token.range]
+end
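
A brief note on the two `untokenize` methods added above (my own illustration, not part of the diff): the string method clamps the end of the byte range to a valid character index via `thisind`, while the buffer method simply slices raw bytes.

using JuliaSyntax: tokenize, untokenize

str = "α + 1"                        # "α" occupies two bytes
t = tokenize(str)[1]                  # the leading identifier token
untokenize(t, str)                    # "α" (string indexing, end adjusted by thisind)
untokenize(t, Vector{UInt8}(str))     # UInt8[0xce, 0xb1], the raw bytes of "α"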

src/tokenize.jl

Lines changed: 19 additions & 20 deletions
@@ -12,32 +12,32 @@ include("tokenize_utils.jl")
 #-------------------------------------------------------------------------------
 # Tokens
 
-struct Token
+struct RawToken
     kind::Kind
     # Offsets into a string or buffer
     startbyte::Int # The byte where the token start in the buffer
    endbyte::Int # The byte where the token ended in the buffer
     dotop::Bool
     suffix::Bool
 end
-function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, false, false)
+function RawToken(kind::Kind, startbyte::Int, endbyte::Int)
+    RawToken(kind, startbyte, endbyte, false, false)
 end
-Token() = Token(K"error", 0, 0, false, false)
+RawToken() = RawToken(K"error", 0, 0, false, false)
 
-const EMPTY_TOKEN = Token()
+const EMPTY_TOKEN = RawToken()
 
-kind(t::Token) = t.kind
+kind(t::RawToken) = t.kind
 
-startbyte(t::Token) = t.startbyte
-endbyte(t::Token) = t.endbyte
+startbyte(t::RawToken) = t.startbyte
+endbyte(t::RawToken) = t.endbyte
 
 
-function untokenize(t::Token, str::String)
+function untokenize(t::RawToken, str::String)
     String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
 end
 
-function Base.show(io::IO, t::Token)
+function Base.show(io::IO, t::RawToken)
     print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
     print(io, rpad(kind(t), 15, " "))
 end
@@ -108,18 +108,17 @@ end
 Lexer(str::AbstractString) = Lexer(IOBuffer(str))
 
 """
-    tokenize(x, T = Token)
+    tokenize(x)
 
 Returns an `Iterable` containing the tokenized input. Can be reverted by e.g.
-`join(untokenize.(tokenize(x)))`. Setting `T` chooses the type of token
-produced by the lexer (`Token` or `Token`).
+`join(untokenize.(tokenize(x)))`.
 """
 tokenize(x) = Lexer(x)
 
 # Iterator interface
 Base.IteratorSize(::Type{<:Lexer}) = Base.SizeUnknown()
 Base.IteratorEltype(::Type{<:Lexer}) = Base.HasEltype()
-Base.eltype(::Type{<:Lexer}) = Token
+Base.eltype(::Type{<:Lexer}) = RawToken
 
 
 function Base.iterate(l::Lexer)
@@ -142,7 +141,7 @@ end
 """
     startpos(l::Lexer)
 
-Return the latest `Token`'s starting position.
+Return the latest `RawToken`'s starting position.
 """
 startpos(l::Lexer) = l.token_startpos
 
@@ -193,7 +192,7 @@ Base.seek(l::Lexer, pos) = seek(l.io, pos)
 """
     start_token!(l::Lexer)
 
-Updates the lexer's state such that the next `Token` will start at the current
+Updates the lexer's state such that the next `RawToken` will start at the current
 position.
 """
 function start_token!(l::Lexer)
@@ -251,7 +250,7 @@ end
 """
     emit(l::Lexer, kind::Kind)
 
-Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
+Returns a `RawToken` of kind `kind` with contents `str` and starts a new `RawToken`.
 """
 function emit(l::Lexer, kind::Kind, maybe_op=true)
     suffix = false
@@ -262,7 +261,7 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
         end
     end
 
-    tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix)
+    tok = RawToken(kind, startpos(l), position(l) - 1, l.dotop, suffix)
 
     l.dotop = false
     l.last_token = kind
@@ -272,7 +271,7 @@ end
 """
     emit_error(l::Lexer, err::Kind)
 
-Returns an `K"error"` token with error `err` and starts a new `Token`.
+Returns an `K"error"` token with error `err` and starts a new `RawToken`.
 """
 function emit_error(l::Lexer, err::Kind)
     @assert is_error(err)
@@ -283,7 +282,7 @@ end
 """
     next_token(l::Lexer)
 
-Returns the next `Token`.
+Returns the next `RawToken`.
 """
 function next_token(l::Lexer, start = true)
     start && start_token!(l)
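
For contrast with the new eager `JuliaSyntax.tokenize`, the renamed low-level lexer remains lazy; a minimal sketch of its use (my own, based on the `RawToken` fields and `untokenize` method shown above):

using JuliaSyntax.Tokenize: tokenize, untokenize

str = "x + 42"
for t in tokenize(str)                 # each t is a RawToken with byte offsets
    println(t.kind, " => ", repr(untokenize(t, str)))
end
join(untokenize.(collect(tokenize(str)), str)) == str   # input round-trips, per the docstring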

test/fuzz_test.jl

Lines changed: 1 addition & 30 deletions
@@ -1,4 +1,5 @@
 using JuliaSyntax
+using JuliaSyntax: tokenize
 
 # Parser fuzz testing tools.
 
@@ -882,36 +883,6 @@ const cutdown_tokens = [
     ""
 ]
 
-#-------------------------------------------------------------------------------
-
-# Rough tokenization interface.
-# TODO: We should have something like this in parser_api.jl
-
-struct Token2
-    head::JuliaSyntax.SyntaxHead
-    range::UnitRange{UInt32}
-end
-
-function tokenize(text::String)
-    ps = JuliaSyntax.ParseStream(text)
-    JuliaSyntax.parse!(ps, rule=:toplevel)
-    ts = ps.tokens
-    output_tokens = Token2[]
-    for i = 2:length(ts)
-        if JuliaSyntax.kind(ts[i]) == JuliaSyntax.K"TOMBSTONE"
-            continue
-        end
-        r = ts[i-1].next_byte:thisind(text, ts[i].next_byte-1)
-        push!(output_tokens, Token2(JuliaSyntax.head(ts[i]), r))
-    end
-    output_tokens
-end
-
-function split_tokens(text::String)
-    [@view text[t.range] for t in tokenize(text)]
-end
-
-
 #-------------------------------------------------------------------------------
 
 function parser_throws_exception(str)
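
As an aside (my sketch, not in the commit): the deleted `split_tokens` helper can now be expressed directly against the new public interface:

using JuliaSyntax: tokenize, untokenize

# Equivalent of the removed fuzz-test helper, built on the new API.
split_tokens(text::String) = [untokenize(t, text) for t in tokenize(text)]

split_tokens("a .+ β")   # ["a", " ", ".", "+", " ", "β"], cf. the tokensplit test below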

test/parser_api.jl

Lines changed: 41 additions & 0 deletions
@@ -98,3 +98,44 @@
         @test parseshow("1f1000", ignore_errors=true) == "(ErrorNumericOverflow)"
     end
 end
+
+tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
+
+@testset "tokenize() API" begin
+    # tokenize() is eager
+    @test tokenize("aba") isa Vector{JuliaSyntax.Token}
+
+    # . is a separate token from + in `.+`
+    @test tokensplit("a .+ β") == [
+        K"Identifier" => "a",
+        K"Whitespace" => " ",
+        K"." => ".",
+        K"+" => "+",
+        K"Whitespace" => " ",
+        K"Identifier" => "β",
+    ]
+
+    # Contextual keywords become identifiers where necessary
+    @test tokensplit("outer = 1") == [
+        K"Identifier" => "outer",
+        K"Whitespace" => " ",
+        K"=" => "=",
+        K"Whitespace" => " ",
+        K"Integer" => "1",
+    ]
+
+    # A predicate based on flags()
+    @test JuliaSyntax.is_suffixed(tokenize("+₁")[1])
+
+    # Buffer interface
+    @test tokenize(Vector{UInt8}("a + b")) == tokenize("a + b")
+
+    buf = Vector{UInt8}("a-β")
+    @test untokenize.(tokenize(buf), Ref(buf,)) == [
+        Vector{UInt8}("a"),
+        Vector{UInt8}("-"),
+        Vector{UInt8}("β")
+    ]
+
+    @test kind(JuliaSyntax.Token()) == K"None"
+end
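
One behaviour mentioned in the commit message but not exercised above is the splitting of ambiguous tokens; a hypothetical check (my own sketch, not part of the diff) might look like:

# Per the commit message, the `...` in `import ...A` is split into three
# separate `.` tokens after parsing (illustrative only; not verified here).
toks = tokenize("import ...A")
count(t -> kind(t) == K".", toks)   # expected: 3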

test/test_utils.jl

Lines changed: 3 additions & 1 deletion
@@ -26,7 +26,9 @@ using .JuliaSyntax:
     children,
     child,
     fl_parseall,
-    fl_parse
+    fl_parse,
+    tokenize,
+    untokenize
 
 if VERSION < v"1.6"
     # Compat stuff which might not be in Base for older versions

test/tokenize.jl

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ using JuliaSyntax.Tokenize:
     Tokenize,
     tokenize,
     untokenize,
-    Token
+    RawToken
 
 tok(str, i = 1) = collect(tokenize(str))[i]
 
@@ -321,7 +321,7 @@ end
     @test String(take!(io)) == "1-5 String "
 end
 
-~(tok::Token, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2]
+~(tok::RawToken, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2]
 
 @testset "raw strings" begin
     str = raw""" str"x $ \ y" """
