Commit 36909cd

Basic after-parse tokenization interface (#221)
Implement a `tokenize()` function which retrieves the tokens *after* parsing. Going through the parser isn't hugely more expensive than plain tokenization, and allows us to be more precise and complete. For example, it automatically:

* Determines when contextual keywords are keywords vs identifiers. For example, the `outer` in `outer = 1` is an identifier, but a keyword in `for outer i = 1:10`.
* Validates numeric literals (e.g. detecting overflow cases like `10e1000` and flagging them as errors).
* Splits or combines ambiguous tokens. For example, making the `...` in `import ...A` three separate `.` tokens.
1 parent 2720980 commit 36909cd
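To make the first point concrete, here is a minimal usage sketch of the new interface (the function names are those added in `src/parser_api.jl` below; the expected kinds are taken from the tests added in `test/parser_api.jl`, not from a real run):

```julia
using JuliaSyntax

text = "outer = 1"
# tokenize() runs the full parser and then recovers the token stream from it.
toks = JuliaSyntax.tokenize(text)

# Pair each token's kind with the text it covers.
[JuliaSyntax.kind(t) => JuliaSyntax.untokenize(t, text) for t in toks]
# Expected, per the new tests: `outer` is an Identifier here, not a keyword.
# [K"Identifier" => "outer", K"Whitespace" => " ", K"=" => "=",
#  K"Whitespace" => " ", K"Integer" => "1"]
```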

File tree: 7 files changed (+116 / -54 lines)


src/JuliaSyntax.jl

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@ include("kinds.jl")
 
 # Lexing uses a significantly modified version of Tokenize.jl
 include("tokenize.jl")
-using .Tokenize: Token
 
 # Source and diagnostics
 include("source_files.jl")

src/parser_api.jl

Lines changed: 50 additions & 0 deletions
@@ -148,3 +148,53 @@ parse(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _pars
 parseall(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:toplevel, false, T, text, index; kws...)
 parseatom(::Type{T}, text::AbstractString, index::Integer; kws...) where {T} = _parse(:atom, false, T, text, index; kws...)
 
+#-------------------------------------------------------------------------------
+# Tokens interface
+"""
+Token type resulting from calling `tokenize(text)`
+
+Use
+* `kind(tok)` to get the token kind
+* `untokenize(tok, text)` to retrieve the text
+* Predicates like `is_error(tok)` to query token categories and flags
+"""
+struct Token
+    head::SyntaxHead
+    range::UnitRange{UInt32}
+end
+
+Token() = Token(SyntaxHead(K"None", EMPTY_FLAGS), 0:0)
+
+head(t::Token) = t.head
+
+"""
+    tokenize(text)
+
+Returns the tokenized UTF-8 encoded `text` as a vector of `Token`s. The
+text for the token can be retrieved by using `untokenize()`. The full text can be
+reconstructed with, for example, `join(untokenize.(tokenize(text), text))`.
+
+This interface works on UTF-8 encoded string or buffer data only.
+"""
+function tokenize(text)
+    ps = ParseStream(text)
+    parse!(ps, rule=:toplevel)
+    ts = ps.tokens
+    output_tokens = Token[]
+    for i = 2:length(ts)
+        if kind(ts[i]) == K"TOMBSTONE"
+            continue
+        end
+        r = ts[i-1].next_byte:ts[i].next_byte-1
+        push!(output_tokens, Token(head(ts[i]), r))
+    end
+    output_tokens
+end
+
+function untokenize(token::Token, text::AbstractString)
+    text[first(token.range):thisind(text, last(token.range))]
+end
+
+function untokenize(token::Token, text::Vector{UInt8})
+    text[token.range]
+end
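A short round-trip sketch of the functions added above, following the docstring's `join(untokenize.(tokenize(text), text))` example (the `Ref(buf)` broadcast mirrors the buffer test added in `test/parser_api.jl`):

```julia
using JuliaSyntax

text = "a .+ β"
toks = JuliaSyntax.tokenize(text)

# Each Token stores a byte range into the UTF-8 text, and consecutive tokens
# cover the input without gaps, so joining the per-token text rebuilds it.
@assert join(JuliaSyntax.untokenize.(toks, text)) == text

# The Vector{UInt8} method returns byte slices instead of substrings.
buf = Vector{UInt8}(text)
@assert reduce(vcat, JuliaSyntax.untokenize.(JuliaSyntax.tokenize(buf), Ref(buf))) == buf
```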

src/tokenize.jl

Lines changed: 19 additions & 20 deletions
@@ -12,32 +12,32 @@ include("tokenize_utils.jl")
 #-------------------------------------------------------------------------------
 # Tokens
 
-struct Token
+struct RawToken
     kind::Kind
     # Offsets into a string or buffer
     startbyte::Int # The byte where the token start in the buffer
     endbyte::Int # The byte where the token ended in the buffer
     dotop::Bool
     suffix::Bool
 end
-function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, false, false)
+function RawToken(kind::Kind, startbyte::Int, endbyte::Int)
+    RawToken(kind, startbyte, endbyte, false, false)
 end
-Token() = Token(K"error", 0, 0, false, false)
+RawToken() = RawToken(K"error", 0, 0, false, false)
 
-const EMPTY_TOKEN = Token()
+const EMPTY_TOKEN = RawToken()
 
-kind(t::Token) = t.kind
+kind(t::RawToken) = t.kind
 
-startbyte(t::Token) = t.startbyte
-endbyte(t::Token) = t.endbyte
+startbyte(t::RawToken) = t.startbyte
+endbyte(t::RawToken) = t.endbyte
 
 
-function untokenize(t::Token, str::String)
+function untokenize(t::RawToken, str::String)
     String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
 end
 
-function Base.show(io::IO, t::Token)
+function Base.show(io::IO, t::RawToken)
     print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
     print(io, rpad(kind(t), 15, " "))
 end

@@ -108,18 +108,17 @@ end
 Lexer(str::AbstractString) = Lexer(IOBuffer(str))
 
 """
-    tokenize(x, T = Token)
+    tokenize(x)
 
 Returns an `Iterable` containing the tokenized input. Can be reverted by e.g.
-`join(untokenize.(tokenize(x)))`. Setting `T` chooses the type of token
-produced by the lexer (`Token` or `Token`).
+`join(untokenize.(tokenize(x)))`.
 """
 tokenize(x) = Lexer(x)
 
 # Iterator interface
 Base.IteratorSize(::Type{<:Lexer}) = Base.SizeUnknown()
 Base.IteratorEltype(::Type{<:Lexer}) = Base.HasEltype()
-Base.eltype(::Type{<:Lexer}) = Token
+Base.eltype(::Type{<:Lexer}) = RawToken
 
 
 function Base.iterate(l::Lexer)

@@ -142,7 +141,7 @@ end
 """
     startpos(l::Lexer)
 
-Return the latest `Token`'s starting position.
+Return the latest `RawToken`'s starting position.
 """
 startpos(l::Lexer) = l.token_startpos
 

@@ -193,7 +192,7 @@ Base.seek(l::Lexer, pos) = seek(l.io, pos)
 """
     start_token!(l::Lexer)
 
-Updates the lexer's state such that the next `Token` will start at the current
+Updates the lexer's state such that the next `RawToken` will start at the current
 position.
 """
 function start_token!(l::Lexer)

@@ -251,7 +250,7 @@ end
 """
     emit(l::Lexer, kind::Kind)
 
-Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
+Returns a `RawToken` of kind `kind` with contents `str` and starts a new `RawToken`.
 """
 function emit(l::Lexer, kind::Kind, maybe_op=true)
     suffix = false

@@ -262,7 +261,7 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
         end
     end
 
-    tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix)
+    tok = RawToken(kind, startpos(l), position(l) - 1, l.dotop, suffix)
 
     l.dotop = false
     l.last_token = kind

@@ -272,7 +271,7 @@ end
 """
     emit_error(l::Lexer, err::Kind)
 
-Returns an `K"error"` token with error `err` and starts a new `Token`.
+Returns an `K"error"` token with error `err` and starts a new `RawToken`.
 """
 function emit_error(l::Lexer, err::Kind)
     @assert is_error(err)

@@ -283,7 +282,7 @@ end
 """
     next_token(l::Lexer)
 
-Returns the next `Token`.
+Returns the next `RawToken`.
 """
 function next_token(l::Lexer, start = true)
     start && start_token!(l)
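For contrast with the eager, parser-backed `tokenize` in `parser_api.jl`, the renamed raw lexer remains a lazy iterator; a minimal sketch using the names imported in `test/tokenize.jl` (the expected value in the trailing comment is an assumption, not output from a real run):

```julia
using JuliaSyntax.Tokenize: tokenize, untokenize, RawToken

text = "a .+ β"
# Tokenize.tokenize returns a lazy Lexer; collect it to materialize RawTokens.
raw = collect(tokenize(text))
@assert eltype(raw) == RawToken

# RawTokens carry start/end byte offsets, so per-token text can be recovered.
untokenize(raw[1], text)   # expected: "a" (the leading identifier)
```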

test/fuzz_test.jl

Lines changed: 1 addition & 30 deletions
@@ -1,4 +1,5 @@
 using JuliaSyntax
+using JuliaSyntax: tokenize
 
 # Parser fuzz testing tools.
 

@@ -882,36 +883,6 @@ const cutdown_tokens = [
 ""
 ]
 
-#-------------------------------------------------------------------------------
-
-# Rough tokenization interface.
-# TODO: We should have something like this in parser_api.jl
-
-struct Token2
-    head::JuliaSyntax.SyntaxHead
-    range::UnitRange{UInt32}
-end
-
-function tokenize(text::String)
-    ps = JuliaSyntax.ParseStream(text)
-    JuliaSyntax.parse!(ps, rule=:toplevel)
-    ts = ps.tokens
-    output_tokens = Token2[]
-    for i = 2:length(ts)
-        if JuliaSyntax.kind(ts[i]) == JuliaSyntax.K"TOMBSTONE"
-            continue
-        end
-        r = ts[i-1].next_byte:thisind(text, ts[i].next_byte-1)
-        push!(output_tokens, Token2(JuliaSyntax.head(ts[i]), r))
-    end
-    output_tokens
-end
-
-function split_tokens(text::String)
-    [@view text[t.range] for t in tokenize(text)]
-end
-
-
 #-------------------------------------------------------------------------------
 
 function parser_throws_exception(str)

test/parser_api.jl

Lines changed: 41 additions & 0 deletions
@@ -124,3 +124,44 @@ end
 \e[90m# └┘ ── \e[0;0m\e[91minvalid operator\e[0;0m"""
     end
 end
+
+tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
+
+@testset "tokenize() API" begin
+    # tokenize() is eager
+    @test tokenize("aba") isa Vector{JuliaSyntax.Token}
+
+    # . is a separate token from + in `.+`
+    @test tokensplit("a .+ β") == [
+        K"Identifier" => "a",
+        K"Whitespace" => " ",
+        K"." => ".",
+        K"+" => "+",
+        K"Whitespace" => " ",
+        K"Identifier" => "β",
+    ]
+
+    # Contextual keywords become identifiers where necessary
+    @test tokensplit("outer = 1") == [
+        K"Identifier" => "outer",
+        K"Whitespace" => " ",
+        K"=" => "=",
+        K"Whitespace" => " ",
+        K"Integer" => "1",
+    ]
+
+    # A predicate based on flags()
+    @test JuliaSyntax.is_suffixed(tokenize("+₁")[1])
+
+    # Buffer interface
+    @test tokenize(Vector{UInt8}("a + b")) == tokenize("a + b")
+
+    buf = Vector{UInt8}("a-β")
+    @test untokenize.(tokenize(buf), Ref(buf,)) == [
+        Vector{UInt8}("a"),
+        Vector{UInt8}("-"),
+        Vector{UInt8}("β")
+    ]
+
+    @test kind(JuliaSyntax.Token()) == K"None"
+end

test/test_utils.jl

Lines changed: 3 additions & 1 deletion
@@ -27,7 +27,9 @@ using .JuliaSyntax:
     child,
     fl_parseall,
     fl_parse,
-    highlight
+    highlight,
+    tokenize,
+    untokenize
 
 if VERSION < v"1.6"
     # Compat stuff which might not be in Base for older versions

test/tokenize.jl

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ using JuliaSyntax.Tokenize:
     Tokenize,
     tokenize,
    untokenize,
-    Token
+    RawToken
 
 tok(str, i = 1) = collect(tokenize(str))[i]
 

@@ -321,7 +321,7 @@ end
    @test String(take!(io)) == "1-5 String "
 end
 
-~(tok::Token, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2]
+~(tok::RawToken, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2]
 
 @testset "raw strings" begin
     str = raw""" str"x $ \ y" """
