diff --git a/Project.toml b/Project.toml index 231dab6f..1abbf2f7 100644 --- a/Project.toml +++ b/Project.toml @@ -7,8 +7,6 @@ version = "1.0.2" Serialization = "1.0" julia = "1.0" -[deps] - [extras] Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" diff --git a/docs/src/design.md b/docs/src/design.md index fb2a06c2..968a0e11 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -56,7 +56,7 @@ We use a hand-written lexer (a heavily modified version of The main parser innovation is the `ParseStream` interface which provides a stream-like I/O interface for writing the parser. The parser does not depend on or produce any concrete tree data structure as part of the parsing -phase but the output spans can be post-processed into various tree data +phase but the output nodes can be post-processed into various tree data structures as required. This is like the design of rust-analyzer though with a simpler implementation. @@ -64,35 +64,39 @@ Parsing proceeds by recursive descent; * The parser consumes a flat list of lexed tokens as *input* using `peek()` to examine tokens and `bump()` to consume them. -* The parser produces a flat list of text spans as *output* using `bump()` to - transfer tokens to the output and `position()`/`emit()` for nonterminal ranges. +* The parser produces a flat list of `RawGreenNode`s as *output* using `bump()` to + transfer tokens to the output and `position()`/`emit()` for nonterminal nodes. * Diagnostics are emitted as separate text spans * Whitespace and comments are automatically `bump()`ed and don't need to be handled explicitly. The exception is syntactically relevant newlines in space sensitive mode. * Parser modes are passed down the call tree using `ParseState`. -The output spans track the byte range, a syntax "kind" stored as an integer -tag, and some flags. 
The kind tag makes the spans a [sum -type](https://blog.waleedkhan.name/union-vs-sum-types/) but where the type is -tracked explicitly outside of Julia's type system. +The output nodes track the byte range, a syntax "kind" stored as an integer +tag, and some flags. Each node also stores either the number of child nodes +(for non-terminals) or the original token kind (for terminals). The kind tag +makes the nodes a [sum type](https://blog.waleedkhan.name/union-vs-sum-types/) +but where the type is tracked explicitly outside of Julia's type system. -For lossless parsing the output spans must cover the entire input text. Using +For lossless parsing the output nodes must cover the entire input text. Using `bump()`, `position()` and `emit()` in a natural way also ensures that: -* Spans are cleanly nested with children contained entirely within their parents -* Siblings spans are emitted in source order -* Parent spans are emitted after all their children. +* Nodes are cleanly nested with children contained entirely within their parents +* Sibling nodes are emitted in source order +* Parent nodes are emitted after all their children. -These properties make the output spans naturally isomorphic to a +These properties make the output nodes a post-order traversal of a ["green tree"](#raw-syntax-tree--green-tree) -in the terminology of C#'s Roslyn compiler. +in the terminology of C#'s Roslyn compiler, with the tree structure +implicit in the node spans. ### Tree construction -The `build_tree` function performs a depth-first traversal of the `ParseStream` -output spans allowing it to be assembled into a concrete tree data structure, -for example using the `GreenNode` data type. We further build on top of this to -define `build_tree` for the AST type `SyntaxNode` and for normal Julia `Expr`. +The `build_tree` function uses the implicit tree structure in the `ParseStream` +output to assemble concrete tree data structures. 
Since the output is already +a post-order traversal of `RawGreenNode`s with node spans encoding parent-child +relationships, tree construction is straightforward. We build on top of this to +define `build_tree` for various tree types including `GreenNode`, the AST type +`SyntaxNode`, and for normal Julia `Expr`. ### Error recovery diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl index 9afff872..3c276984 100644 --- a/src/JuliaSyntax.jl +++ b/src/JuliaSyntax.jl @@ -73,7 +73,7 @@ export @K_str, kind export SyntaxNode -@_public GreenNode, +@_public GreenNode, RedTreeCursor, GreenTreeCursor, span # Helper utilities @@ -95,7 +95,8 @@ include("parser_api.jl") include("literal_parsing.jl") # Tree data structures -include("green_tree.jl") +include("tree_cursors.jl") +include("green_node.jl") include("syntax_tree.jl") include("expr.jl") diff --git a/src/expr.jl b/src/expr.jl index 0a6f16a7..dc802b4e 100644 --- a/src/expr.jl +++ b/src/expr.jl @@ -28,7 +28,7 @@ macro isexpr(ex, head, nargs) length($(esc(ex)).args) == $(esc(nargs))) end -function _reorder_parameters!(args::Vector{Any}, params_pos) +function _reorder_parameters!(args::Vector{Any}, params_pos::Int) p = 0 for i = length(args):-1:1 ai = args[i] @@ -48,7 +48,7 @@ function _reorder_parameters!(args::Vector{Any}, params_pos) insert!(args, params_pos, pop!(args)) end -function _strip_parens(ex) +function _strip_parens(ex::Expr) while true if @isexpr(ex, :parens) if length(ex.args) == 1 @@ -63,37 +63,9 @@ function _strip_parens(ex) end end -# Get Julia value of leaf node as it would be represented in `Expr` form -function _expr_leaf_val(node::SyntaxNode) - node.val -end -function _leaf_to_Expr(source, txtbuf, txtbuf_offset, head, srcrange, node) - k = kind(head) - if k == K"MacroName" && view(source, srcrange) == "." - return Symbol("@__dot__") - elseif is_error(k) - return k == K"error" ? - Expr(:error) : - Expr(:error, "$(_token_error_descriptions[k]): `$(source[srcrange])`") - else - val = isnothing(node) ? 
- parse_julia_literal(txtbuf, head, srcrange .+ txtbuf_offset) : - _expr_leaf_val(node) - if val isa Union{Int128,UInt128,BigInt} - # Ignore the values of large integers and convert them back to - # symbolic/textural form for compatibility with the Expr - # representation of these. - str = replace(source[srcrange], '_'=>"") - macname = val isa Int128 ? Symbol("@int128_str") : - val isa UInt128 ? Symbol("@uint128_str") : - Symbol("@big_str") - return Expr(:macrocall, GlobalRef(Core, macname), nothing, str) - else - return val - end - end -end +reverse_nontrivia_children(cursor::RedTreeCursor) = Iterators.filter(should_include_node, Iterators.reverse(cursor)) +reverse_nontrivia_children(cursor::SyntaxNode) = Iterators.filter(should_include_node, Iterators.reverse(children(cursor))) # Julia string literals in a `K"string"` node may be split into several chunks # interspersed with trivia in two situations: @@ -102,89 +74,110 @@ end # # This function concatenating adjacent string chunks together as done in the # reference parser. -function _string_to_Expr(args) +function _string_to_Expr(cursor::Union{RedTreeCursor, SyntaxNode}, source::SourceFile, txtbuf::Vector{UInt8}, txtbuf_offset::UInt32) + ret = Expr(:string) args2 = Any[] i = 1 - while i <= length(args) - if args[i] isa String - if i < length(args) && args[i+1] isa String + it = reverse_nontrivia_children(cursor) + r = iterate(it) + while r !== nothing + (child, state) = r + ex = node_to_expr(child, source, txtbuf, txtbuf_offset) + if isa(ex, String) + # This branch combines consecutive string chunks together. + # It's unrolled once to avoid unnecessary allocations. 
+ r = iterate(it, state) + if r === nothing + pushfirst!(ret.args, ex) + continue + end + (child, state) = r + ex2 = node_to_expr(child, source, txtbuf, txtbuf_offset) + if !isa(ex2, String) + pushfirst!(ret.args, ex) + ex = ex2 + # Fall through to process `ex` (!::String) + else + strings = String[ex2, ex] # Note: reversed order since we're iterating backwards + r = iterate(it, state) + while r !== nothing + (child, state) = r + ex = node_to_expr(child, source, txtbuf, txtbuf_offset) + isa(ex, String) || break + pushfirst!(strings, ex) + r = iterate(it, state) + end buf = IOBuffer() - while i <= length(args) && args[i] isa String - write(buf, args[i]::String) - i += 1 + for s in strings + write(buf, s) end - push!(args2, String(take!(buf))) - else - push!(args2, args[i]) - i += 1 + pushfirst!(ret.args, String(take!(buf))) + r === nothing && break + # Fall through to process `ex` (!::String) end - else - ex = args[i] - if @isexpr(ex, :parens, 1) - ex = _strip_parens(ex) - if ex isa String - # Wrap interpolated literal strings in (string) so we can - # distinguish them from the surrounding text (issue #38501) - # Ie, "$("str")" vs "str" - # https://github.com/JuliaLang/julia/pull/38692 - ex = Expr(:string, ex) - end + end + # ex not a string + if @isexpr(ex, :parens, 1) + ex = _strip_parens(ex) + if ex isa String + # Wrap interpolated literal strings in (string) so we can + # distinguish them from the surrounding text (issue #38501) + # Ie, "$("str")" vs "str" + # https://github.com/JuliaLang/julia/pull/38692 + ex = Expr(:string, ex) end - push!(args2, ex) - i += 1 end + @assert ex !== nothing + pushfirst!(ret.args, ex) + r = iterate(it, state) end - if length(args2) == 1 && args2[1] isa String + + if length(ret.args) == 1 && ret.args[1] isa String # If there's a single string remaining after joining, we unwrap # to give a string literal. 
# """\n a\n b""" ==> "a\nb" - return only(args2) + return only(ret.args) else # This only happens when the kind is K"string" or when an error has occurred. - return Expr(:string, args2...) + return ret end end # Shared fixups for Expr children in cases where the type of the parent node # affects the child layout. -function _fixup_Expr_children!(head, loc, args) +function fixup_Expr_child(head::SyntaxHead, @nospecialize(arg), first::Bool) + isa(arg, Expr) || return arg k = kind(head) eq_to_kw_in_call = ((k == K"call" || k == K"dotcall") && is_prefix_call(head)) || k == K"ref" eq_to_kw_in_params = k != K"vect" && k != K"curly" && k != K"braces" && k != K"ref" coalesce_dot = k in KSet"call dotcall curly" || - (k == K"quote" && flags(head) == COLON_QUOTE) - for i in 1:length(args) - arg = args[i] - was_parens = @isexpr(arg, :parens) - arg = _strip_parens(arg) - if @isexpr(arg, :(=)) && eq_to_kw_in_call && i > 1 - arg = Expr(:kw, arg.args...) - elseif k != K"parens" && @isexpr(arg, :., 1) && arg.args[1] isa Tuple - h, a = arg.args[1]::Tuple{SyntaxHead,Any} - arg = ((!was_parens && coalesce_dot && i == 1) || - (k == K"comparison" && iseven(i)) || - is_syntactic_operator(h)) ? - Symbol(".", a) : Expr(:., a) - elseif @isexpr(arg, :parameters) && eq_to_kw_in_params - pargs = arg.args - for j = 1:length(pargs) - pj = pargs[j] - if @isexpr(pj, :(=)) - pargs[j] = Expr(:kw, pj.args...) - end + (k == K"quote" && has_flags(head, COLON_QUOTE)) + was_parens = @isexpr(arg, :parens) + arg = _strip_parens(arg) + if @isexpr(arg, :(=)) && eq_to_kw_in_call && !first + arg = Expr(:kw, arg.args...) + elseif k != K"parens" && @isexpr(arg, :., 1) && arg.args[1] isa Tuple + # This undoes the "Hack" below" + h, a = arg.args[1]::Tuple{SyntaxHead,Any} + arg = ((!was_parens && coalesce_dot && first) || + is_syntactic_operator(h)) ? 
+ Symbol(".", a) : Expr(:., a) + elseif @isexpr(arg, :parameters) && eq_to_kw_in_params + pargs = arg.args + for j = 1:length(pargs) + pj = pargs[j] + if @isexpr(pj, :(=)) + pargs[j] = Expr(:kw, pj.args...) end - elseif k == K"let" && i == 1 && @isexpr(arg, :block) - filter!(a -> !(a isa LineNumberNode), arg.args) end - args[i] = arg end - return args + return arg end # Remove the `do` block from the final position in a function/macro call arg list -function _extract_do_lambda!(args) +function _extract_do_lambda!(args::Vector{Any}) if length(args) > 1 && Meta.isexpr(args[end], :do_lambda) do_ex = pop!(args)::Expr return Expr(:->, do_ex.args...) @@ -193,7 +186,7 @@ function _extract_do_lambda!(args) end end -function _append_iterspec!(args, ex) +function _append_iterspec!(args::Vector{Any}, @nospecialize(ex)) if @isexpr(ex, :iteration) for iter in ex.args::Vector{Any} push!(args, Expr(:(=), iter.args...)) @@ -204,48 +197,131 @@ function _append_iterspec!(args, ex) return args end +function parseargs!(retexpr::Expr, loc::LineNumberNode, cursor::Union{RedTreeCursor, SyntaxNode}, source::SourceFile, txtbuf::Vector{UInt8}, txtbuf_offset::UInt32) + args = retexpr.args + firstchildhead = head(cursor) + firstchildrange::UnitRange{UInt32} = byte_range(cursor) + itr = reverse_nontrivia_children(cursor) + r = iterate(itr) + while r !== nothing + (child, state) = r + r = iterate(itr, state) + expr = node_to_expr(child, source, txtbuf, txtbuf_offset) + @assert expr !== nothing + firstchildhead = head(child) + firstchildrange = byte_range(child) + pushfirst!(args, fixup_Expr_child(head(cursor), expr, r === nothing)) + end + return (firstchildhead, firstchildrange) +end + # Convert internal node of the JuliaSyntax parse tree to an Expr -function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, args) - k = kind(head) - if (k == K"var" || k == K"char") && length(args) == 1 - # Ideally we'd like `@check length(args) == 1` as an invariant for all - # K"var" and 
K"char" nodes, but this discounts having embedded error - # nodes when ignore_errors=true is set. - return args[1] - elseif k == K"string" - return _string_to_Expr(args) +function node_to_expr(cursor::Union{RedTreeCursor, SyntaxNode}, source::SourceFile, txtbuf::Vector{UInt8}, txtbuf_offset::UInt32=UInt32(0)) + if !should_include_node(cursor) + return nothing + end + + nodehead = head(cursor) + k = kind(cursor) + srcrange::UnitRange{UInt32} = byte_range(cursor) + if is_leaf(cursor) + if k == K"MacroName" && view(source, srcrange) == "." + return Symbol("@__dot__") + elseif is_error(k) + return k == K"error" ? + Expr(:error) : + Expr(:error, "$(_token_error_descriptions[k]): `$(source[srcrange])`") + else + val = parse_julia_literal(txtbuf, head(cursor), srcrange .+ txtbuf_offset) + if val isa Union{Int128,UInt128,BigInt} + # Ignore the values of large integers and convert them back to + # symbolic/textural form for compatibility with the Expr + # representation of these. + str = replace(source[srcrange], '_'=>"") + macname = val isa Int128 ? Symbol("@int128_str") : + val isa UInt128 ? Symbol("@uint128_str") : + Symbol("@big_str") + return Expr(:macrocall, GlobalRef(Core, macname), nothing, str) + else + return val + end + end + end + + if k == K"string" + return _string_to_Expr(cursor, source, txtbuf, txtbuf_offset) end loc = source_location(LineNumberNode, source, first(srcrange)) - endloc = source_location(LineNumberNode, source, last(srcrange)) if k == K"cmdstring" - return Expr(:macrocall, GlobalRef(Core, Symbol("@cmd")), loc, _string_to_Expr(args)) + return Expr(:macrocall, GlobalRef(Core, Symbol("@cmd")), loc, + _string_to_Expr(cursor, source, txtbuf, txtbuf_offset)) end - _fixup_Expr_children!(head, loc, args) - - headstr = untokenize(head, include_flag_suff=false) + headstr = untokenize(nodehead, include_flag_suff=false) headsym = !isnothing(headstr) ? Symbol(headstr) : error("Can't untokenize head of kind $(k)") + retexpr = Expr(headsym) - if k == K"?" 
- headsym = :if + # Block gets special handling for extra line number nodes + if k == K"block" || (k == K"toplevel" && !has_flags(nodehead, TOPLEVEL_SEMICOLONS_FLAG)) + args = retexpr.args + for child in reverse_nontrivia_children(cursor) + expr = node_to_expr(child, source, txtbuf, txtbuf_offset) + @assert expr !== nothing + # K"block" does not have special first-child handling, so we do not need to keep track of that here + pushfirst!(args, fixup_Expr_child(head(cursor), expr, false)) + pushfirst!(args, source_location(LineNumberNode, source, first(byte_range(child)))) + end + isempty(args) && push!(args, loc) + if k == K"block" && has_flags(nodehead, PARENS_FLAG) + popfirst!(args) + end + return retexpr + end + + # Now recurse to parse all arguments + (firstchildhead, firstchildrange) = parseargs!(retexpr, loc, cursor, source, txtbuf, txtbuf_offset) + + return _node_to_expr(retexpr, loc, srcrange, + firstchildhead, firstchildrange, + nodehead, source) +end + +# Split out from the above for codesize reasons, to avoid specialization on multiple +# tree types. +@noinline function _node_to_expr(retexpr::Expr, loc::LineNumberNode, + srcrange::UnitRange{UInt32}, + firstchildhead::SyntaxHead, + firstchildrange::UnitRange{UInt32}, + nodehead::SyntaxHead, + source::SourceFile) + args = retexpr.args + k = kind(nodehead) + endloc = source_location(LineNumberNode, source, last(srcrange)) + if (k == K"var" || k == K"char") && length(retexpr.args) == 1 + # `var` and `char` nodes have a single argument which is the value. + # However, errors can add additional error tokens which we represent + # as e.g. `Expr(:var, ..., Expr(:error))`. + return retexpr.args[1] + elseif k == K"?" 
+ retexpr.head = :if elseif k == K"op=" && length(args) == 3 lhs = args[1] op = args[2] rhs = args[3] headstr = string(args[2], '=') - if is_dotted(head) + if is_dotted(nodehead) headstr = '.'*headstr end - headsym = Symbol(headstr) - args = Any[lhs, rhs] + retexpr.head = Symbol(headstr) + retexpr.args = Any[lhs, rhs] elseif k == K"macrocall" if length(args) >= 2 a2 = args[2] - if @isexpr(a2, :macrocall) && kind(childheads[1]) == K"CmdMacroName" + if @isexpr(a2, :macrocall) && kind(firstchildhead) == K"CmdMacroName" # Fix up for custom cmd macros like foo`x` args[2] = a2.args[3] end @@ -254,54 +330,41 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, _reorder_parameters!(args, 2) insert!(args, 2, loc) if do_lambda isa Expr - return Expr(:do, Expr(headsym, args...), do_lambda) - end - elseif k == K"block" || (k == K"toplevel" && !has_flags(head, TOPLEVEL_SEMICOLONS_FLAG)) - if isempty(args) - push!(args, loc) - else - resize!(args, 2*length(args)) - for i = length(childranges):-1:1 - args[2*i] = args[i] - args[2*i-1] = source_location(LineNumberNode, source, first(childranges[i])) - end - end - if k == K"block" && has_flags(head, PARENS_FLAG) - popfirst!(args) + return Expr(:do, retexpr, do_lambda) end elseif k == K"doc" - headsym = :macrocall - args = [GlobalRef(Core, Symbol("@doc")), loc, args...] + retexpr.head = :macrocall + retexpr.args = [GlobalRef(Core, Symbol("@doc")), loc, args...] elseif k == K"dotcall" || k == K"call" # Julia's standard `Expr` ASTs have children stored in a canonical # order which is often not always source order. We permute the children # here as necessary to get the canonical order. 
- if is_infix_op_call(head) || is_postfix_op_call(head) + if is_infix_op_call(nodehead) || is_postfix_op_call(nodehead) args[2], args[1] = args[1], args[2] end # Lower (call x ') to special ' head - if is_postfix_op_call(head) && args[1] == Symbol("'") + if is_postfix_op_call(nodehead) && args[1] == Symbol("'") popfirst!(args) - headsym = Symbol("'") + retexpr.head = Symbol("'") end do_lambda = _extract_do_lambda!(args) # Move parameters blocks to args[2] _reorder_parameters!(args, 2) - if headsym === :dotcall + if retexpr.head === :dotcall funcname = args[1] - if is_prefix_call(head) - headsym = :. - args = Any[funcname, Expr(:tuple, args[2:end]...)] + if is_prefix_call(nodehead) + retexpr.head = :. + retexpr.args = Any[funcname, Expr(:tuple, args[2:end]...)] else # operator calls - headsym = :call + retexpr.head = :call if funcname isa Symbol args[1] = Symbol(:., funcname) end # else funcname could be an Expr(:error), just propagate it end end if do_lambda isa Expr - return Expr(:do, Expr(headsym, args...), do_lambda) + return Expr(:do, retexpr, do_lambda) end elseif k == K"." if length(args) == 2 @@ -312,7 +375,7 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, elseif length(args) == 1 # Hack: Here we preserve the head of the operator to determine whether # we need to coalesce it with the dot into a single symbol later on. - args[1] = (childheads[1], args[1]) + args[1] = (firstchildhead, args[1]) end elseif k == K"ref" || k == K"curly" # Move parameters blocks to args[2] @@ -335,11 +398,11 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, if @isexpr(a2, :braces) a2a = a2.args _reorder_parameters!(a2a, 2) - args = Any[args[1], a2a...] + retexpr.args = Any[args[1], a2a...] 
end end elseif k == K"catch" - if kind(childheads[1]) == K"Placeholder" + if kind(firstchildhead) == K"Placeholder" args[1] = false end elseif k == K"try" @@ -367,7 +430,8 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, @assert false "Illegal $a subclause in `try`" end end - args = Any[try_, catch_var, catch_] + empty!(args) + push!(args, try_, catch_var, catch_) if finally_ !== false || else_ !== false push!(args, finally_) if else_ !== false @@ -389,13 +453,13 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, return gen elseif k == K"filter" @assert length(args) == 2 - args = _append_iterspec!(Any[args[2]], args[1]) + retexpr.args = _append_iterspec!(Any[args[2]], args[1]) elseif k == K"nrow" || k == K"ncat" # For lack of a better place, the dimension argument to nrow/ncat # is stored in the flags - pushfirst!(args, numeric_flags(flags(head))) + pushfirst!(args, numeric_flags(flags(nodehead))) elseif k == K"typed_ncat" - insert!(args, 2, numeric_flags(flags(head))) + insert!(args, 2, numeric_flags(flags(nodehead))) elseif k == K"elseif" # Block for conditional's source location args[1] = Expr(:block, loc, args[1]) @@ -406,8 +470,8 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, # compatibility. We should consider deleting this special case in # the future as a minor change. if length(a1.args) == 1 && - (!has_flags(childheads[1], PARENS_FLAG) || - !has_flags(childheads[1], TRAILING_COMMA_FLAG)) && + (!has_flags(firstchildhead, PARENS_FLAG) || + !has_flags(firstchildhead, TRAILING_COMMA_FLAG)) && !Meta.isexpr(a1.args[1], :parameters) # `(a) -> c` is parsed without tuple on lhs in Expr form args[1] = a1.args[1] @@ -419,7 +483,7 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, else a111 = only(a11.args) assgn = @isexpr(a111, :kw) ? Expr(:(=), a111.args...) 
: a111 - argloc = source_location(LineNumberNode, source, last(childranges[1])) + argloc = source_location(LineNumberNode, source, last(firstchildrange)) args[1] = Expr(:block, a1.args[2], argloc, assgn) end end @@ -433,12 +497,12 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, end elseif k == K"function" if length(args) > 1 - if has_flags(head, SHORT_FORM_FUNCTION_FLAG) + if has_flags(nodehead, SHORT_FORM_FUNCTION_FLAG) a2 = args[2] if !@isexpr(a2, :block) args[2] = Expr(:block, a2) end - headsym = :(=) + retexpr.head = :(=) else a1 = args[1] if @isexpr(a1, :tuple) @@ -451,31 +515,36 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, end end end - pushfirst!((args[2]::Expr).args, loc) + arg2 = args[2] + # Only push if this is an Expr - could be an ErrorVal + isa(arg2, Expr) && pushfirst!(arg2.args, loc) end elseif k == K"macro" if length(args) > 1 pushfirst!((args[2]::Expr).args, loc) end elseif k == K"module" - pushfirst!(args, !has_flags(head, BARE_MODULE_FLAG)) + pushfirst!(args, !has_flags(nodehead, BARE_MODULE_FLAG)) pushfirst!((args[3]::Expr).args, loc) elseif k == K"inert" return QuoteNode(only(args)) - elseif k == K"quote" && length(args) == 1 - a1 = only(args) - if !(a1 isa Expr || a1 isa QuoteNode || a1 isa Bool) - # Flisp parser does an optimization here: simple values are stored - # as inert QuoteNode rather than in `Expr(:quote)` quasiquote - return QuoteNode(a1) + elseif k == K"quote" + if length(args) == 1 + a1 = only(args) + if !(a1 isa Expr || a1 isa QuoteNode || a1 isa Bool) + # Flisp parser does an optimization here: simple values are stored + # as inert QuoteNode rather than in `Expr(:quote)` quasiquote + return QuoteNode(a1) + end end elseif k == K"do" # Temporary head which is picked up by _extract_do_lambda - headsym = :do_lambda + retexpr.head = :do_lambda elseif k == K"let" a1 = args[1] if @isexpr(a1, :block) a1a = (args[1]::Expr).args + filter!(a -> !(a isa 
LineNumberNode), a1a) # Ugly logic to strip the Expr(:block) in certain cases for compatibility if length(a1a) == 1 a = a1a[1] @@ -489,17 +558,17 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, a1 = args[1] if @isexpr(a1, :const) # Normalize `local const` to `const local` - args[1] = Expr(headsym, (a1::Expr).args...) - headsym = :const + args[1] = Expr(retexpr.head, (a1::Expr).args...) + retexpr.head = :const elseif @isexpr(a1, :tuple) # Normalize `global (x, y)` to `global x, y` - args = a1.args + retexpr.args = a1.args end end elseif k == K"return" && isempty(args) push!(args, nothing) elseif k == K"juxtapose" - headsym = :call + retexpr.head = :call pushfirst!(args, :*) elseif k == K"struct" @assert args[2].head == :block @@ -515,9 +584,9 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, end end args[2] = fields - pushfirst!(args, has_flags(head, MUTABLE_FLAG)) + pushfirst!(args, has_flags(nodehead, MUTABLE_FLAG)) elseif k == K"importpath" - headsym = :. + retexpr.head = :. for i = 1:length(args) ai = args[i] if ai isa QuoteNode @@ -529,72 +598,41 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, elseif k == K"wrapper" # This should only happen for errors wrapped next to what should have # been single statements or atoms - represent these as blocks. - headsym = :block + retexpr.head = :block + elseif k == K"comparison" + for i = 2:2:length(args) + arg = args[i] + if @isexpr(arg, :., 1) + args[i] = Symbol(".", arg.args[1]) + end + end end - return Expr(headsym, args...) -end - - -# Stack entry for build_tree Expr conversion. -# We'd use `Tuple{UnitRange{Int},SyntaxHead,Any}` instead, but that's an -# abstract type due to the `Any` and tuple covariance which destroys -# performance. 
-struct _BuildExprStackEntry - srcrange::UnitRange{Int} - head::SyntaxHead - ex::Any + return retexpr end function build_tree(::Type{Expr}, stream::ParseStream; filename=nothing, first_line=1, kws...) source = SourceFile(stream, filename=filename, first_line=first_line) txtbuf = unsafe_textbuf(stream) - args = Any[] - childranges = UnitRange{Int}[] - childheads = SyntaxHead[] - entry = build_tree(_BuildExprStackEntry, stream; kws...) do head, srcrange, nodechildren - if is_trivia(head) && !is_error(head) - return nothing - end - k = kind(head) - if isnothing(nodechildren) - ex = _leaf_to_Expr(source, txtbuf, 0, head, srcrange, nothing) - else - resize!(childranges, length(nodechildren)) - resize!(childheads, length(nodechildren)) - resize!(args, length(nodechildren)) - for (i,c) in enumerate(nodechildren) - childranges[i] = c.srcrange - childheads[i] = c.head - args[i] = c.ex - end - ex = _internal_node_to_Expr(source, srcrange, head, childranges, childheads, args) - end - return _BuildExprStackEntry(srcrange, head, ex) - end - loc = source_location(LineNumberNode, source, first(entry.srcrange)) - only(_fixup_Expr_children!(SyntaxHead(K"None",EMPTY_FLAGS), loc, Any[entry.ex])) -end - -function _to_expr(node) - file = sourcefile(node) - if is_leaf(node) - txtbuf_offset, txtbuf = _unsafe_wrap_substring(sourcetext(file)) - return _leaf_to_Expr(file, txtbuf, txtbuf_offset, head(node), byte_range(node), node) + cursor = RedTreeCursor(stream) + wrapper_head = SyntaxHead(K"wrapper",EMPTY_FLAGS) + if has_toplevel_siblings(cursor) + entry = Expr(:block) + for child in + Iterators.filter(should_include_node, reverse_toplevel_siblings(cursor)) + pushfirst!(entry.args, fixup_Expr_child(wrapper_head, node_to_expr(child, source, txtbuf), false)) + end + length(entry.args) == 1 && (entry = only(entry.args)) + else + entry = fixup_Expr_child(wrapper_head, node_to_expr(cursor, source, txtbuf), false) end - cs = children(node) - args = Any[_to_expr(c) for c in cs] - 
_internal_node_to_Expr(file, byte_range(node), head(node), byte_range.(cs), head.(cs), args) -end - -function to_expr(node) - ex = _to_expr(node) - loc = source_location(LineNumberNode, node) - only(_fixup_Expr_children!(SyntaxHead(K"None",EMPTY_FLAGS), loc, Any[ex])) + return entry end function Base.Expr(node::SyntaxNode) - to_expr(node) + source = sourcefile(node) + txtbuf_offset, txtbuf = _unsafe_wrap_substring(sourcetext(source)) + wrapper_head = SyntaxHead(K"wrapper",EMPTY_FLAGS) + return fixup_Expr_child(wrapper_head, node_to_expr(node, source, txtbuf, UInt32(txtbuf_offset)), false) end - diff --git a/src/green_tree.jl b/src/green_node.jl similarity index 67% rename from src/green_tree.jl rename to src/green_node.jl index 4164529a..61bdbb01 100644 --- a/src/green_tree.jl +++ b/src/green_node.jl @@ -1,24 +1,10 @@ """ - GreenNode(head, span) - GreenNode(head, children...) + struct GreenNode -A "green tree" is a lossless syntax tree which overlays all the source text. -The most basic properties of a green tree are that: - -* Nodes cover a contiguous span of bytes in the text -* Sibling nodes are ordered in the same order as the text - -As implementation choices, we choose that: - -* Nodes are immutable and don't know their parents or absolute position, so can - be cached and reused -* Nodes are homogeneously typed at the language level so they can be stored - concretely, with the `head` defining the node type. Normally this would - include a "syntax kind" enumeration, but it can also include flags and record - information the parser knew about the layout of the child nodes. -* For simplicity and uniformity, leaf nodes cover a single token in the source. - This is like rust-analyzer, but different from Roslyn where leaves can - include syntax trivia. +An explicit pointer-y representation of the green tree produced by the parser. +See [`RawGreenNode`](@ref) for documentation on working with the implicit green +tree directly. 
However, this representation is useful for introspection as it +provides O(1) access to the children (as well as forward iteration). """ struct GreenNode{Head} head::Head @@ -46,7 +32,7 @@ span(node::GreenNode) = node.span Base.getindex(node::GreenNode, i::Int) = children(node)[i] Base.getindex(node::GreenNode, rng::UnitRange) = view(children(node), rng) Base.firstindex(node::GreenNode) = 1 -Base.lastindex(node::GreenNode) = length(children(node)) +Base.lastindex(node::GreenNode) = children(node) === nothing ? 0 : length(children(node)) """ Get absolute position and span of the child of `node` at the given tree `path`. @@ -132,10 +118,38 @@ function Base.show(io::IO, ::MIME"text/plain", node::GreenNode, str::AbstractStr _show_green_node(io, node, "", 1, str, show_trivia) end -function build_tree(::Type{GreenNode}, stream::ParseStream; kws...) - build_tree(GreenNode{SyntaxHead}, stream; kws...) do h, srcrange, cs - span = length(srcrange) - isnothing(cs) ? GreenNode(h, span) : - GreenNode(h, span, collect(GreenNode{SyntaxHead}, cs)) +function GreenNode(cursor::GreenTreeCursor) + chead = head(cursor) + T = typeof(chead) + if is_leaf(cursor) + return GreenNode{T}(head(cursor), span(cursor), nothing) + else + children = GreenNode{T}[] + for child in reverse(cursor) + pushfirst!(children, GreenNode(child)) + end + return GreenNode{T}(head(cursor), span(cursor), children) + end +end + +function build_tree(T::Type{GreenNode}, stream::ParseStream; kws...) + cursor = GreenTreeCursor(stream) + if has_toplevel_siblings(cursor) + # There are multiple toplevel nodes, e.g. because we're using this + # to test a partial parse. 
Wrap everything in K"wrapper" + all_processed = 0 + local cs + for child in reverse_toplevel_siblings(cursor) + c = GreenNode(child) + if !@isdefined(cs) + cs = GreenNode{SyntaxHead}[c] + else + pushfirst!(cs, c) + end + end + @assert length(cs) != 1 + return GreenNode(SyntaxHead(K"wrapper", NON_TERMINAL_FLAG), stream.next_byte-1, cs) + else + return GreenNode(cursor) end end diff --git a/src/kinds.jl b/src/kinds.jl index c7d27e35..9d8999c7 100644 --- a/src/kinds.jl +++ b/src/kinds.jl @@ -27,7 +27,7 @@ primitive type Kind 16 end const _kind_str_to_int = Dict{String,UInt16}() const _kind_int_to_str = Dict{UInt16,String}() const _kind_modules = Dict{Int,Union{Symbol,Module}}( - 0=>:JuliaSyntax, + 0=>nameof(@__MODULE__), 1=>:JuliaLowering, 2=>:JuliaSyntaxFormatter ) @@ -49,7 +49,7 @@ function Kind(s::AbstractString) Kind(i) end -Base.string(x::Kind) = _kind_int_to_str[reinterpret(UInt16, x)] +Base.string(x::Kind) = get(_kind_int_to_str, reinterpret(UInt16, x), "") Base.print(io::IO, x::Kind) = print(io, string(x)) Base.isless(x::Kind, y::Kind) = reinterpret(UInt16, x) < reinterpret(UInt16, y) @@ -127,7 +127,7 @@ end """ register_kinds!(mod, module_id, names) -Register custom `Kind`s with the given `names`, belonging to a module `mod`. +Register custom `Kind`s with the given `names`, belonging to a module `mod`. `names` is an array of arbitrary strings. In order for kinds to be represented by a small number of bits, some nontrivial diff --git a/src/parse_stream.jl b/src/parse_stream.jl index 0c57c2a4..1000fdaa 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -71,6 +71,11 @@ Set for K"module" when it's not bare (`module`, not `baremodule`) """ const BARE_MODULE_FLAG = RawFlags(1<<5) +""" +Set for nodes that are non-terminals +""" +const NON_TERMINAL_FLAG = RawFlags(1<<7) + # Flags holding the dimension of an nrow or other UInt8 not held in the source # TODO: Given this is only used for nrow/ncat, we could actually use all the flags? 
const NUMERIC_FLAGS = RawFlags(RawFlags(0xff)<<8) @@ -282,25 +287,105 @@ preceding_whitespace(tok::SyntaxToken) = tok.preceding_whitespace #------------------------------------------------------------------------------- """ -Range in the source text which will become a node in the tree. Can be either a -token (leaf node of the tree) or an interior node, depending on how the -start_mark compares to previous nodes. -""" -struct TaggedRange - head::SyntaxHead # Kind,flags - # The following field is used for one of two things: - # - For leaf nodes it's an index in the tokens array - # - For non-leaf nodes it points to the index of the first child - first_token::UInt32 - last_token::UInt32 + RawGreenNode(head::SyntaxHead, byte_span::UInt32, orig_kind::Kind) # Terminal + RawGreenNode(head::SyntaxHead, byte_span::UInt32, nchildren::UInt32) # Non-terminal + +A "green tree" is a lossless syntax tree which overlays all the source text. +The most basic properties of a green tree are that: + +* Nodes cover a contiguous span of bytes in the text +* Sibling nodes are ordered in the same order as the text + +As implementation choices, we choose that: + +* Nodes are immutable and don't know their parents or absolute position, so can + be cached and reused +* Nodes are homogeneously typed at the language level so they can be stored + concretely, with the `head` defining the node type. Normally this would + include a "syntax kind" enumeration, but it can also include flags and record + information the parser knew about the layout of the child nodes. +* For simplicity and uniformity, leaf nodes cover a single token in the source. + This is like rust-analyzer, but different from Roslyn where leaves can + include syntax trivia. +* The parser produces a single buffer of `RawGreenNode` which encodes the tree. + There are higher level accessors, which make working with this tree easier. 
+""" +struct RawGreenNode + head::SyntaxHead # Kind,flags + byte_span::UInt32 # Number of bytes covered by this range + # If NON_TERMINAL_FLAG is set, this is the total number of child nodes + # Otherwise this is a terminal node (i.e. a token) and this is orig_kind + node_span_or_orig_kind::UInt32 + + # Constructor for terminal nodes (tokens) + function RawGreenNode(head::SyntaxHead, byte_span::Integer, orig_kind::Kind) + @assert (flags(head) & NON_TERMINAL_FLAG) == 0 + new(head, UInt32(byte_span), UInt32(reinterpret(UInt16, orig_kind))) + end + + # Constructor for non-terminal nodes - automatically sets NON_TERMINAL_FLAG + function RawGreenNode(head::SyntaxHead, byte_span::Integer, node_span::Integer) + h = SyntaxHead(kind(head), flags(head) | NON_TERMINAL_FLAG) + new(h, UInt32(byte_span), UInt32(node_span)) + end + + global reset_node + function reset_node(node::RawGreenNode, kind, flags) + new(_reset_node_head(node, kind, flags), + getfield(node, :byte_span), + getfield(node, :node_span_or_orig_kind)) + end end -head(range::TaggedRange) = range.head +function _reset_node_head(node, k, f) + if !isnothing(f) + f = RawFlags(f) + @assert (f & NON_TERMINAL_FLAG) == 0 + f |= flags(node) & NON_TERMINAL_FLAG + else + f = flags(node) + end + h = SyntaxHead(isnothing(k) ? 
kind(node) : k, f) +end + +Base.summary(node::RawGreenNode) = summary(node.head) +function Base.show(io::IO, node::RawGreenNode) + print(io, summary(node), " (", node.byte_span, " bytes,") + if is_terminal(node) + print(io, " orig_kind=", node.orig_kind, ")") + else + print(io, " ", node.node_span, " children)") + end +end + +function Base.getproperty(rgn::RawGreenNode, name::Symbol) + if name === :node_span + has_flags(getfield(rgn, :head), NON_TERMINAL_FLAG) || return UInt32(0) # Leaf nodes have no children + return getfield(rgn, :node_span_or_orig_kind) + elseif name === :orig_kind + has_flags(getfield(rgn, :head), NON_TERMINAL_FLAG) && error("Cannot access orig_kind for non-terminal node") + return Kind(getfield(rgn, :node_span_or_orig_kind)) + end + getfield(rgn, name) +end + +head(range::RawGreenNode) = range.head + +# Helper functions for unified output +is_terminal(node::RawGreenNode) = !has_flags(node.head, NON_TERMINAL_FLAG) +is_non_terminal(node::RawGreenNode) = has_flags(node.head, NON_TERMINAL_FLAG) #------------------------------------------------------------------------------- struct ParseStreamPosition - token_index::UInt32 # Index of last token in output - range_index::UInt32 + """ + The current position in the byte stream, i.e. the byte at `byte_index` is + the first byte of the next token to be parsed. + """ + byte_index::UInt32 + """ + The total number of nodes (terminal + non-terminal) in the output so far. + """ + node_index::UInt32 end const NO_POSITION = ParseStreamPosition(0, 0) @@ -349,10 +434,9 @@ mutable struct ParseStream lookahead_index::Int # Pool of stream positions for use as working space in parsing position_pool::Vector{Vector{ParseStreamPosition}} - # Buffer of finalized tokens - tokens::Vector{SyntaxToken} - # Parser output as an ordered sequence of ranges, parent nodes after children. 
- ranges::Vector{TaggedRange} + output::Vector{RawGreenNode} + # Current byte position in the output (the next byte to be written) + next_byte::Int # Parsing diagnostics (errors/warnings etc) diagnostics::Vector{Diagnostic} # Counter for number of peek()s we've done without making progress via a bump() @@ -372,17 +456,16 @@ mutable struct ParseStream # numbers. This means we're inexact for old dev versions but that seems # like an acceptable tradeoff. ver = (version.major, version.minor) - # Initial sentinel token containing the first byte of the first real token. - sentinel = SyntaxToken(SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS), - K"TOMBSTONE", false, next_byte) + # Initial sentinel node (covering all ignored bytes before the first token) + sentinel = RawGreenNode(SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS), next_byte-1, K"TOMBSTONE") new(text_buf, text_root, lexer, Vector{SyntaxToken}(), 1, Vector{Vector{ParseStreamPosition}}(), - SyntaxToken[sentinel], - Vector{TaggedRange}(), + RawGreenNode[sentinel], + next_byte, # Initialize next_byte from the parameter Vector{Diagnostic}(), 0, ver) @@ -427,7 +510,7 @@ function ParseStream(io::IO; version=VERSION) end function Base.show(io::IO, mime::MIME"text/plain", stream::ParseStream) - println(io, "ParseStream at position $(_next_byte(stream))") + println(io, "ParseStream at position $(stream.next_byte)") end function show_diagnostics(io::IO, stream::ParseStream) @@ -448,19 +531,11 @@ function release_positions(stream, positions) end #------------------------------------------------------------------------------- -# Return true when a token was emitted last at stream position `pos` +# Return true when a terminal (token) was emitted last at stream position `pos` function token_is_last(stream, pos) - return pos.range_index == 0 || - pos.token_index > stream.ranges[pos.range_index].last_token -end - -# Compute the first byte of a token at given index `i` -function token_first_byte(stream, i) - stream.tokens[i-1].next_byte -end - 
-function token_last_byte(stream::ParseStream, i) - stream.tokens[i].next_byte - 1 + # In the unified structure, check if the node at pos is a terminal + return pos.node_index > 0 && pos.node_index <= length(stream.output) && + is_terminal(stream.output[pos.node_index]) end function lookahead_token_first_byte(stream, i) @@ -507,7 +582,7 @@ end # Return the index of the next byte of the input function _next_byte(stream) - last(stream.tokens).next_byte + stream.next_byte end # Find the index of the next nontrivia token @@ -571,7 +646,7 @@ end @noinline function _parser_stuck_error(stream) # Optimization: emit unlikely errors in a separate function - error("The parser seems stuck at byte $(_next_byte(stream))") + error("The parser seems stuck at byte $(stream.next_byte)") end """ @@ -644,18 +719,19 @@ Retroactively inspecting or modifying the parser's output can be confusing, so using this function should be avoided where possible. """ function peek_behind(stream::ParseStream, pos::ParseStreamPosition) - if token_is_last(stream, pos) && pos.token_index > 0 - t = stream.tokens[pos.token_index] - return (kind=kind(t), - flags=flags(t), - orig_kind=t.orig_kind, - is_leaf=true) - elseif !isempty(stream.ranges) && pos.range_index > 0 - r = stream.ranges[pos.range_index] - return (kind=kind(r), - flags=flags(r), - orig_kind=K"None", - is_leaf=false) + if pos.node_index > 0 && pos.node_index <= length(stream.output) + node = stream.output[pos.node_index] + if is_terminal(node) + return (kind=kind(node), + flags=flags(node), + orig_kind=node.orig_kind, + is_leaf=true) + else + return (kind=kind(node), + flags=flags(node), + orig_kind=K"None", + is_leaf=false) + end else return (kind=K"None", flags=EMPTY_FLAGS, @@ -664,70 +740,57 @@ function peek_behind(stream::ParseStream, pos::ParseStreamPosition) end end +""" + first_child_position(stream::ParseStream, pos::ParseStreamPosition) + +Find the first non-trivia child of this node (in the GreenTree/RedTree sense) and return +its 
position. +""" function first_child_position(stream::ParseStream, pos::ParseStreamPosition) - ranges = stream.ranges - @assert pos.range_index > 0 - parent = ranges[pos.range_index] - # Find the first nontrivia range which is a child of this range but not a - # child of the child - c = 0 - for i = pos.range_index-1:-1:1 - if ranges[i].first_token < parent.first_token - break - end - if (c == 0 || ranges[i].first_token < ranges[c].first_token) && !is_trivia(ranges[i]) - c = i - end + output = stream.output + @assert pos.node_index > 0 + cursor = RedTreeCursor(GreenTreeCursor(output, pos.node_index), pos.byte_index-UInt32(1)) + candidate = nothing + for child in reverse(cursor) + is_trivia(child) && continue + candidate = child end - # Find first nontrivia token - t = 0 - for i = parent.first_token:parent.last_token - if !is_trivia(stream.tokens[i]) - t = i - break + candidate !== nothing && return ParseStreamPosition(candidate.byte_end+UInt32(1), candidate.green.position) + + # No children found - return the first non-trivia *token* (even if it + # is the child of a non-terminal trivia node (e.g. an error)). + byte_end = pos.byte_index + for i in pos.node_index-1:-1:(pos.node_index - treesize(cursor)) + node = output[i] + if is_terminal(node) + if !is_trivia(node) + return ParseStreamPosition(byte_end, i) + end + byte_end -= node.byte_span end end - if c == 0 || (t != 0 && ranges[c].first_token > t) - # Return leaf node at `t` - return ParseStreamPosition(t, 0) - else - # Return interior node at `c` - return ParseStreamPosition(ranges[c].last_token, c) - end + # Still none found. 
Return a sentinel value + return ParseStreamPosition(0, 0) end -function last_child_position(stream::ParseStream, pos::ParseStreamPosition) - ranges = stream.ranges - @assert pos.range_index > 0 - parent = ranges[pos.range_index] - # Find the last nontrivia range which is a child of this range - c = 0 - if pos.range_index > 1 - i = pos.range_index-1 - if ranges[i].first_token >= parent.first_token - # Valid child of current range - c = i - end - end - - # Find last nontrivia token - t = 0 - for i = parent.last_token:-1:parent.first_token - if !is_trivia(stream.tokens[i]) - t = i - break - end - end +""" + last_child_position(stream::ParseStream, pos::ParseStreamPosition) - if c == 0 || (t != 0 && ranges[c].last_token < t) - # Return leaf node at `t` - return ParseStreamPosition(t, 0) - else - # Return interior node at `c` - return ParseStreamPosition(ranges[c].last_token, c) + Find the last non-trivia child of this node (in the GreenTree/RedTree sense) and + return its position (i.e. the position as if that child had been the last thing parsed).
+""" +function last_child_position(stream::ParseStream, pos::ParseStreamPosition) + output = stream.output + @assert pos.node_index > 0 + cursor = RedTreeCursor(GreenTreeCursor(output, pos.node_index), pos.byte_index-1) + candidate = nothing + for child in reverse(cursor) + is_trivia(child) && continue + return ParseStreamPosition(child.byte_end+UInt32(1), child.green.position) end + return ParseStreamPosition(0, 0) end # Get last position in stream "of interest", skipping @@ -736,24 +799,34 @@ end # * whitespace (if skip_trivia=true) function peek_behind_pos(stream::ParseStream; skip_trivia::Bool=true, skip_parens::Bool=true) - token_index = lastindex(stream.tokens) - range_index = lastindex(stream.ranges) + # Work backwards through the output + node_idx = length(stream.output) + byte_idx = stream.next_byte + + # Skip parens nodes if requested if skip_parens - while range_index >= firstindex(stream.ranges) && - kind(stream.ranges[range_index]) == K"parens" - range_index -= 1 + while node_idx > 0 + node = stream.output[node_idx] + if is_non_terminal(node) && kind(node) == K"parens" + node_idx -= 1 + else + break + end end end - last_token_in_nonterminal = range_index == 0 ? 0 : - stream.ranges[range_index].last_token - while token_index > last_token_in_nonterminal - t = stream.tokens[token_index] - if kind(t) != K"TOMBSTONE" && (!skip_trivia || !is_trivia(t)) + + # Skip trivia if requested + while node_idx > 0 + node = stream.output[node_idx] + if kind(node) == K"TOMBSTONE" || (skip_trivia && is_trivia(node)) + node_idx -= 1 + byte_idx -= node.byte_span + else break end - token_index -= 1 end - return ParseStreamPosition(token_index, range_index) + + return ParseStreamPosition(byte_idx, node_idx) end function peek_behind(stream::ParseStream; kws...) 
@@ -767,7 +840,7 @@ end # Bump up until the `n`th token # flags and remap_kind are applied to any non-trivia tokens -function _bump_until_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None") +function _bump_until_n(stream::ParseStream, n::Integer, new_flags, remap_kind=K"None") if n < stream.lookahead_index return end @@ -777,13 +850,28 @@ function _bump_until_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None if k == K"EndMarker" break end - f = flags | (@__MODULE__).flags(tok) + f = new_flags | flags(tok) is_trivia = is_whitespace(k) is_trivia && (f |= TRIVIA_FLAG) outk = (is_trivia || remap_kind == K"None") ? k : remap_kind h = SyntaxHead(outk, f) - push!(stream.tokens, - SyntaxToken(h, kind(tok), tok.preceding_whitespace, tok.next_byte)) + + # Calculate byte span for this token + if i == stream.lookahead_index + # First token in this batch - calculate span from current stream position + prev_byte = stream.next_byte + else + # Subsequent tokens - use previous token's next_byte + prev_byte = stream.lookahead[i-1].next_byte + end + byte_span = Int(tok.next_byte) - Int(prev_byte) + + # Create terminal RawGreenNode + node = RawGreenNode(h, byte_span, kind(tok)) + push!(stream.output, node) + + # Update next_byte + stream.next_byte += byte_span end stream.lookahead_index = n + 1 # Defuse the time bomb @@ -838,9 +926,12 @@ example, `2x` means `2*x` via the juxtaposition rules. """ function bump_invisible(stream::ParseStream, kind, flags=EMPTY_FLAGS; error=nothing) - b = _next_byte(stream) + b = stream.next_byte h = SyntaxHead(kind, flags) - push!(stream.tokens, SyntaxToken(h, (@__MODULE__).kind(h), false, b)) + # Zero-width token + node = RawGreenNode(h, 0, kind) + push!(stream.output, node) + # No need to update next_byte for zero-width token if !isnothing(error) emit_diagnostic(stream, b:b-1, error=error) end @@ -858,8 +949,14 @@ whitespace if necessary with bump_trivia. 
function bump_glue(stream::ParseStream, kind, flags) i = stream.lookahead_index h = SyntaxHead(kind, flags) - push!(stream.tokens, SyntaxToken(h, kind, false, - stream.lookahead[i+1].next_byte)) + # Calculate byte span for glued tokens + start_byte = stream.next_byte + end_byte = stream.lookahead[i+1].next_byte + byte_span = end_byte - start_byte + + node = RawGreenNode(h, byte_span, kind) + push!(stream.output, node) + stream.next_byte += byte_span stream.lookahead_index += 2 stream.peek_count = 0 return position(stream) @@ -887,24 +984,23 @@ simpler one which only splits preceding dots? function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N} tok = stream.lookahead[stream.lookahead_index] stream.lookahead_index += 1 - b = _next_byte(stream) - toklen = tok.next_byte - b + start_b = _next_byte(stream) + toklen = tok.next_byte - start_b + prev_b = start_b for (i, (nbyte, k, f)) in enumerate(split_spec) h = SyntaxHead(k, f) - b += nbyte < 0 ? (toklen + nbyte) : nbyte + actual_nbyte = nbyte < 0 ? (toklen + nbyte) : nbyte orig_k = k == K"." ? K"." : kind(tok) - push!(stream.tokens, SyntaxToken(h, orig_k, false, b)) + node = RawGreenNode(h, actual_nbyte, orig_k) + push!(stream.output, node) + prev_b += actual_nbyte + stream.next_byte += actual_nbyte end - @assert tok.next_byte == b + @assert tok.next_byte == prev_b stream.peek_count = 0 return position(stream) end -function _reset_node_head(x, k, f) - h = SyntaxHead(isnothing(k) ? kind(x) : k, - isnothing(f) ? flags(x) : f) -end - """ Reset kind or flags of an existing node in the output stream @@ -915,17 +1011,8 @@ in those cases. 
""" function reset_node!(stream::ParseStream, pos::ParseStreamPosition; kind=nothing, flags=nothing) - if token_is_last(stream, pos) - t = stream.tokens[pos.token_index] - stream.tokens[pos.token_index] = - SyntaxToken(_reset_node_head(t, kind, flags), - t.orig_kind, t.preceding_whitespace, t.next_byte) - else - r = stream.ranges[pos.range_index] - stream.ranges[pos.range_index] = - TaggedRange(_reset_node_head(r, kind, flags), - r.first_token, r.last_token) - end + node = stream.output[pos.node_index] + stream.output[pos.node_index] = reset_node(node, kind, flags) end """ @@ -937,45 +1024,57 @@ Hack alert! This is used only for managing the complicated rules related to dedenting triple quoted strings. """ function steal_token_bytes!(stream::ParseStream, pos::ParseStreamPosition, numbytes) - i = pos.token_index - t1 = stream.tokens[i] - t2 = stream.tokens[i+1] + i = pos.node_index + t1 = stream.output[i] + t2 = stream.output[i+1] + @assert is_terminal(t1) && is_terminal(t2) - t1_next_byte = t1.next_byte + numbytes - stream.tokens[i] = SyntaxToken(t1.head, t1.orig_kind, - t1.preceding_whitespace, t1_next_byte) + stream.output[i] = RawGreenNode(t1.head, t1.byte_span + numbytes, + t1.orig_kind) - t2_is_empty = t1_next_byte == t2.next_byte + t2_is_empty = t2.byte_span == numbytes head2 = t2_is_empty ? 
SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS) : t2.head - stream.tokens[i+1] = SyntaxToken(head2, t2.orig_kind, - t2.preceding_whitespace, t2.next_byte) + stream.output[i+1] = RawGreenNode(head2, t2.byte_span - numbytes, + t2.orig_kind) return t2_is_empty end # Get position of last item emitted into the output stream function Base.position(stream::ParseStream) - ParseStreamPosition(lastindex(stream.tokens), lastindex(stream.ranges)) + byte_idx = stream.next_byte + node_idx = length(stream.output) + + ParseStreamPosition(byte_idx, node_idx) end """ emit(stream, mark, kind, flags = EMPTY_FLAGS; error=nothing) -Emit a new text span into the output which covers source bytes from `mark` to +Emit a new non-terminal node into the output which covers source bytes from `mark` to the end of the most recent token which was `bump()`'ed. The starting `mark` -should be a previous return value of `position()`. +should be a previous return value of `position()`. The emitted node will have +its `node_span` set to the number of nodes emitted since `mark`. """ function emit(stream::ParseStream, mark::ParseStreamPosition, kind::Kind, flags::RawFlags = EMPTY_FLAGS; error=nothing) - first_token = mark.token_index + 1 - range = TaggedRange(SyntaxHead(kind, flags), first_token, length(stream.tokens)) + # Calculate byte span from mark position to current + mark_byte = mark.byte_index + current_byte = stream.next_byte + byte_span = current_byte - mark_byte + + # Calculate node span (number of children, exclusive of the node itself) + node_span = length(stream.output) - mark.node_index + + # Create non-terminal RawGreenNode + node = RawGreenNode(SyntaxHead(kind, flags), byte_span, node_span) + if !isnothing(error) - # The first child must be a leaf, otherwise ranges would be improperly - # nested. 
- fbyte = token_first_byte(stream, first_token) - lbyte = token_last_byte(stream, lastindex(stream.tokens)) - emit_diagnostic(stream, fbyte:lbyte, error=error) + emit_diagnostic(stream, mark_byte:current_byte-1, error=error) end - push!(stream.ranges, range) + + push!(stream.output, node) + # Note: emit() for non-terminals doesn't advance next_byte + # because it's a range over already-emitted tokens return position(stream) end @@ -1008,25 +1107,21 @@ function emit_diagnostic(stream::ParseStream; whitespace=false, kws...) end function emit_diagnostic(stream::ParseStream, mark::ParseStreamPosition; trim_whitespace=true, kws...) - i = mark.token_index - j = lastindex(stream.tokens) + # Find the byte range from mark to current position + start_byte = mark.byte_index + end_byte = stream.next_byte - 1 + if trim_whitespace - while i < j && is_whitespace(stream.tokens[j]) - j -= 1 - end - while i+1 < j && is_whitespace(stream.tokens[i+1]) - i += 1 - end + # TODO: Implement whitespace trimming for unified output + # This would require scanning the output array end - byterange = stream.tokens[i].next_byte:stream.tokens[j].next_byte-1 - emit_diagnostic(stream, byterange; kws...) + + emit_diagnostic(stream, start_byte:end_byte; kws...) end function emit_diagnostic(stream::ParseStream, mark::ParseStreamPosition, end_mark::ParseStreamPosition; kws...) - fbyte = stream.tokens[mark.token_index].next_byte - lbyte = stream.tokens[end_mark.token_index].next_byte-1 - emit_diagnostic(stream, fbyte:lbyte; kws...) + emit_diagnostic(stream, mark.byte_index:end_mark.byte_index-1; kws...) 
end function emit_diagnostic(diagnostics::AbstractVector{Diagnostic}, @@ -1039,15 +1134,21 @@ end function validate_tokens(stream::ParseStream) txtbuf = unsafe_textbuf(stream) - toks = stream.tokens charbuf = IOBuffer() - for i = 2:length(toks) - t = toks[i] - k = kind(t) - fbyte = toks[i-1].next_byte - nbyte = t.next_byte + + # Process terminal nodes in the output + fbyte = stream.output[1].byte_span+1 # Start after sentinel + for i = 2:length(stream.output) + node = stream.output[i] + if !is_terminal(node) || kind(node) == K"TOMBSTONE" + continue + end + + k = kind(node) + nbyte = fbyte + node.byte_span tokrange = fbyte:nbyte-1 error_kind = K"None" + if k in KSet"Integer BinInt OctInt HexInt" # The following shouldn't be able to error... # parse_int_literal @@ -1090,7 +1191,7 @@ function validate_tokens(stream::ParseStream) error="character literal contains multiple characters") end end - elseif k == K"String" && !has_flags(t, RAW_STRING_FLAG) + elseif k == K"String" && !has_flags(node, RAW_STRING_FLAG) had_error = unescape_julia_string(devnull, txtbuf, fbyte, nbyte, stream.diagnostics) if had_error @@ -1108,11 +1209,14 @@ function validate_tokens(stream::ParseStream) end emit_diagnostic(stream, tokrange, error=msg) end + if error_kind != K"None" - toks[i] = SyntaxToken(SyntaxHead(error_kind, EMPTY_FLAGS), - t.orig_kind, t.preceding_whitespace, - t.next_byte) + # Update the node with new error kind + stream.output[i] = RawGreenNode(SyntaxHead(error_kind, EMPTY_FLAGS), + node.byte_span, node.orig_kind) end + + fbyte = nbyte end sort!(stream.diagnostics, by=first_byte) end @@ -1121,89 +1225,6 @@ end # API for extracting results from ParseStream -""" - build_tree(make_node::Function, ::Type{StackEntry}, stream::ParseStream; kws...) - -Construct a tree from a ParseStream using depth-first traversal. 
`make_node` -must have the signature - - make_node(head::SyntaxHead, span::Integer, children) - -where `children` is either `nothing` for leaf nodes or an iterable of the -children of type `StackEntry` for internal nodes. `StackEntry` may be a node -type, but also may include other information required during building the tree. - -If the ParseStream has multiple nodes at the top level, `K"wrapper"` is used to -wrap them in a single node. - -The tree here is constructed depth-first in postorder. -""" -function build_tree(make_node::Function, ::Type{NodeType}, stream::ParseStream; - kws...) where NodeType - stack = Vector{NamedTuple{(:first_token,:node),Tuple{Int,NodeType}}}() - - tokens = stream.tokens - ranges = stream.ranges - i = firstindex(tokens) - j = firstindex(ranges) - while true - last_token = j <= lastindex(ranges) ? - ranges[j].last_token : lastindex(tokens) - # Process tokens to leaf nodes for all tokens used by the next internal node - while i <= last_token - t = tokens[i] - if kind(t) == K"TOMBSTONE" - i += 1 - continue # Ignore removed tokens - end - srcrange = (stream.tokens[i-1].next_byte: - stream.tokens[i].next_byte - 1) - h = head(t) - node = make_node(h, srcrange, nothing) - if !isnothing(node) - push!(stack, (first_token=i, node=node)) - end - i += 1 - end - if j > lastindex(ranges) - break - end - # Process internal nodes which end at the current position - while j <= lastindex(ranges) - r = ranges[j] - if r.last_token != last_token - break - end - if kind(r) == K"TOMBSTONE" - j += 1 - continue - end - # Collect children from the stack for this internal node - k = length(stack) + 1 - while k > 1 && r.first_token <= stack[k-1].first_token - k -= 1 - end - srcrange = (stream.tokens[r.first_token-1].next_byte: - stream.tokens[r.last_token].next_byte - 1) - children = (stack[n].node for n = k:length(stack)) - node = make_node(head(r), srcrange, children) - resize!(stack, k-1) - if !isnothing(node) - push!(stack, (first_token=r.first_token, 
node=node)) - end - j += 1 - end - end - if length(stack) == 1 - return only(stack).node - else - srcrange = (stream.tokens[1].next_byte: - stream.tokens[end].next_byte - 1) - children = (x.node for x in stack) - return make_node(SyntaxHead(K"wrapper", EMPTY_FLAGS), srcrange, children) - end -end - function sourcetext(stream::ParseStream; steal_textbuf=false) Base.depwarn("Use of `sourcetext(::ParseStream)` is deprecated. Use `SourceFile(stream)` instead", :sourcetext) root = stream.text_root @@ -1253,27 +1274,34 @@ Return the `Vector{UInt8}` text buffer being parsed by this `ParseStream`. """ unsafe_textbuf(stream) = stream.textbuf -first_byte(stream::ParseStream) = first(stream.tokens).next_byte # Use sentinel token -last_byte(stream::ParseStream) = _next_byte(stream)-1 +first_byte(stream::ParseStream) = first(stream.output).byte_span + 1 # After sentinel +last_byte(stream::ParseStream) = stream.next_byte - 1 any_error(stream::ParseStream) = any_error(stream.diagnostics) # Return last non-whitespace byte which was parsed function last_non_whitespace_byte(stream::ParseStream) - for i = length(stream.tokens):-1:1 - tok = stream.tokens[i] - if !(kind(tok) in KSet"Comment Whitespace NewlineWs ErrorEofMultiComment") - return tok.next_byte - 1 + byte_pos = stream.next_byte + for i = length(stream.output):-1:1 + node = stream.output[i] + if is_terminal(node) + if !(kind(node) in KSet"Comment Whitespace NewlineWs ErrorEofMultiComment") + return byte_pos - 1 + end + byte_pos -= node.byte_span end end return first_byte(stream) - 1 end function Base.empty!(stream::ParseStream) - t = last(stream.tokens) - empty!(stream.tokens) - # Restore sentinel token - push!(stream.tokens, SyntaxToken(SyntaxHead(K"TOMBSTONE",EMPTY_FLAGS), - K"TOMBSTONE", t.preceding_whitespace, - t.next_byte)) - empty!(stream.ranges) + # Keep only the sentinel + if !isempty(stream.output) && kind(stream.output[1]) == K"TOMBSTONE" + resize!(stream.output, 1) + else + empty!(stream.output) + # Restore 
sentinel node + push!(stream.output, RawGreenNode(SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS), 0, K"TOMBSTONE")) + end + # Reset next_byte to initial position + stream.next_byte = 1 end diff --git a/src/parser.jl b/src/parser.jl index d1a91478..d593fe0b 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -340,6 +340,8 @@ function bump_dotsplit(ps, flags=EMPTY_FLAGS; bump_trivia(ps) mark = position(ps) k = remap_kind != K"None" ? remap_kind : kind(t) + # Split the dotted operator into . and the operator + # First split emits the . token (1 byte) at position mark.node_index+1 pos = bump_split(ps, (1, K".", TRIVIA_FLAG), (-1, k, flags)) if emit_dot_node pos = emit(ps, mark, K".") diff --git a/src/parser_api.jl b/src/parser_api.jl index 7931ef31..a3e2162b 100644 --- a/src/parser_api.jl +++ b/src/parser_api.jl @@ -190,13 +190,15 @@ emitted as `K"Identifier"` (the default) or as `K"+"`. function tokenize(text; operators_as_identifiers=true) ps = ParseStream(text) parse!(ps, rule=:all) - ts = ps.tokens + ts = ps.output output_tokens = Token[] + byte_start::UInt32 = ps.output[1].byte_span + 1 for i = 2:length(ts) - if kind(ts[i]) == K"TOMBSTONE" + if kind(ts[i]) == K"TOMBSTONE" || is_non_terminal(ts[i]) continue end - r = ts[i-1].next_byte:ts[i].next_byte-1 + r = byte_start:(byte_start+ts[i].byte_span - 1) + byte_start = last(r) + 1 k = kind(ts[i]) if k == K"Identifier" && !operators_as_identifiers orig_k = ts[i].orig_kind diff --git a/src/syntax_tree.jl b/src/syntax_tree.jl index edc864e0..71b1be82 100644 --- a/src/syntax_tree.jl +++ b/src/syntax_tree.jl @@ -57,18 +57,28 @@ const AbstractSyntaxNode = TreeNode{<:AbstractSyntaxData} struct SyntaxData <: AbstractSyntaxData source::SourceFile raw::GreenNode{SyntaxHead} - position::Int + byte_end::UInt32 val::Any end +function Base.getproperty(data::SyntaxData, name::Symbol) + if name === :position + # Previous versions of JuliaSyntax had `position::Int`. + # Allow access for compatibility. 
It was renamed (with changed semantics) + # to `byte_end::UInt32` to match the rest of the code base, which identifies + # nodes by their last byte. + return Int(getfield(data, :byte_end) - getfield(data, :raw).span + UInt32(1)) + end + return getfield(data, name) +end Base.hash(data::SyntaxData, h::UInt) = - hash(data.source, hash(data.raw, hash(data.position, + hash(data.source, hash(data.raw, hash(data.byte_end, # Avoid dynamic dispatch: # This does not support custom `hash` implementation that may be defined for `typeof(data.val)`, # However, such custom user types should not generally appear in the AST. Core.invoke(hash, Tuple{Any,UInt}, data.val, h)))) function Base.:(==)(a::SyntaxData, b::SyntaxData) - a.source == b.source && a.raw == b.raw && a.position == b.position && a.val === b.val + a.source == b.source && a.raw == b.raw && a.byte_end == b.byte_end && a.val === b.val end """ @@ -80,41 +90,56 @@ text by calling one of the parser API functions such as [`parseall`](@ref) """ const SyntaxNode = TreeNode{SyntaxData} -function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}; - keep_parens=false, position::Integer=1) +function SyntaxNode(source::SourceFile, cursor::RedTreeCursor; + keep_parens=false) + # Build the full GreenNode tree once upfront (including trivia) + green = GreenNode(cursor.green) + GC.@preserve source begin raw_offset, txtbuf = _unsafe_wrap_substring(source.code) offset = raw_offset - source.byte_offset - _to_SyntaxNode(source, txtbuf, offset, raw, convert(Int, position), keep_parens) + _to_SyntaxNode(source, txtbuf, offset, cursor, green, keep_parens) end end +function SyntaxNode(source::SourceFile, cursor::RedTreeCursor, green::GreenNode{SyntaxHead}; + keep_parens=false) + GC.@preserve source begin + raw_offset, txtbuf = _unsafe_wrap_substring(source.code) + offset = raw_offset - source.byte_offset + _to_SyntaxNode(source, txtbuf, offset, cursor, green, keep_parens) + end +end + +should_include_node(child) = !is_trivia(child)
|| is_error(child) + function _to_SyntaxNode(source::SourceFile, txtbuf::Vector{UInt8}, offset::Int, - raw::GreenNode{SyntaxHead}, - position::Int, keep_parens::Bool) - if is_leaf(raw) + cursor::RedTreeCursor, green::GreenNode{SyntaxHead}, keep_parens::Bool) + if is_leaf(cursor) # Here we parse the values eagerly rather than representing them as # strings. Maybe this is good. Maybe not. - valrange = position:position + span(raw) - 1 - val = parse_julia_literal(txtbuf, head(raw), valrange .+ offset) - return SyntaxNode(nothing, nothing, SyntaxData(source, raw, position, val)) + valrange = byte_range(cursor) + val = parse_julia_literal(txtbuf, head(cursor), valrange .+ offset) + return SyntaxNode(nothing, nothing, SyntaxData(source, green, cursor.byte_end, val)) else cs = SyntaxNode[] - pos = position - for (i,rawchild) in enumerate(children(raw)) - # FIXME: Allowing trivia is_error nodes here corrupts the tree layout. - if !is_trivia(rawchild) || is_error(rawchild) - push!(cs, _to_SyntaxNode(source, txtbuf, offset, rawchild, pos, keep_parens)) + green_children = children(green) + + # We need to match up the filtered SyntaxNode children with the unfiltered GreenNode children + # Both cursor and green children need to be traversed in the same order + # Since cursor iterates in reverse, we need to match from the end of green_children + green_idx = green_children === nothing ? 
0 : length(green_children) + + for (i, child_cursor) in enumerate(reverse(cursor)) + if should_include_node(child_cursor) + pushfirst!(cs, _to_SyntaxNode(source, txtbuf, offset, child_cursor, green[end-i+1], keep_parens)) end - pos += Int(rawchild.span) end - if !keep_parens && kind(raw) == K"parens" && length(cs) == 1 - return cs[1] - end - if kind(raw) == K"wrapper" && length(cs) == 1 + + if !keep_parens && kind(cursor) == K"parens" && length(cs) == 1 return cs[1] end - node = SyntaxNode(nothing, cs, SyntaxData(source, raw, position, nothing)) + node = SyntaxNode(nothing, cs, SyntaxData(source, green, cursor.byte_end, nothing)) for c in cs c.parent = node end @@ -162,9 +187,12 @@ structure. """ head(node::AbstractSyntaxNode) = head(node.raw) -span(node::AbstractSyntaxNode) = span(node.raw) +span(node::AbstractSyntaxNode) = node.raw.span -byte_range(node::AbstractSyntaxNode) = node.position:(node.position + span(node) - 1) +byte_range(node::AbstractSyntaxNode) = (node.byte_end - span(node) + 1):node.byte_end + +first_byte(node::AbstractSyntaxNode) = first(byte_range(node)) +last_byte(node::AbstractSyntaxNode) = last(byte_range(node)) sourcefile(node::AbstractSyntaxNode) = node.source @@ -271,13 +299,45 @@ function Base.copy(node::TreeNode) end # shallow-copy the data -Base.copy(data::SyntaxData) = SyntaxData(data.source, data.raw, data.position, data.val) +Base.copy(data::SyntaxData) = SyntaxData(data.source, data.raw, data.byte_end, data.val) function build_tree(::Type{SyntaxNode}, stream::ParseStream; filename=nothing, first_line=1, keep_parens=false, kws...) - green_tree = build_tree(GreenNode, stream; kws...) source = SourceFile(stream, filename=filename, first_line=first_line) - SyntaxNode(source, green_tree, position=first_byte(stream), keep_parens=keep_parens) + cursor = RedTreeCursor(stream) + if has_toplevel_siblings(cursor) + # There are multiple toplevel nodes, e.g. because we're using this + # to test a partial parse. 
Wrap everything in K"wrapper" + + # First build the full green tree for all children (including trivia) + green_children = GreenNode{SyntaxHead}[] + for child in reverse_toplevel_siblings(cursor) + pushfirst!(green_children, GreenNode(child.green)) + end + + # Create a wrapper GreenNode with children + green = GreenNode(SyntaxHead(K"wrapper", NON_TERMINAL_FLAG), + stream.next_byte-1, green_children) + + # Now build SyntaxNodes, iterating through cursors and green nodes together + cs = SyntaxNode[] + for (i, child) in enumerate(reverse_toplevel_siblings(cursor)) + if should_include_node(child) + pushfirst!(cs, SyntaxNode(source, child, green[end-i+1], keep_parens=keep_parens)) + end + end + + length(cs) == 1 && return only(cs) + + node = SyntaxNode(nothing, cs, SyntaxData(source, green, + stream.next_byte-1, nothing)) + for c in cs + c.parent = node + end + return node + else + return SyntaxNode(source, cursor, keep_parens=keep_parens) + end end @deprecate haschildren(x) !is_leaf(x) false diff --git a/src/tokenize.jl b/src/tokenize.jl index 0ea9be19..761455dd 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -2,7 +2,7 @@ module Tokenize export tokenize, untokenize -using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str +using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @callsite_inline import ..JuliaSyntax: kind, is_literal, is_contextual_keyword, is_word_operator @@ -1303,14 +1303,14 @@ function lex_identifier(l::Lexer, c) @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1] break end - elseif Unicode.isgraphemebreak!(graphemestate, c, pc) + elseif @callsite_inline Unicode.isgraphemebreak!(graphemestate, c, pc) if (pc == '!' 
&& ppc == '=') || !is_identifier_char(pc) break end elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters # ZWJ/ZWNJ only within grapheme sequences, not at end graphemestate_peek[] = graphemestate[] - if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc) + if @callsite_inline Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc) break end end diff --git a/src/tree_cursors.jl b/src/tree_cursors.jl new file mode 100644 index 00000000..3f65b6ce --- /dev/null +++ b/src/tree_cursors.jl @@ -0,0 +1,166 @@ +using Base.Iterators: Reverse + +""" + prev_sibling_assumed(cursor::GreenTreeCursor)::Union{Nothing, GreenTreeCursor} + prev_sibling_assumed(cursor::RedTreeCursor)::Union{Nothing, RedTreeCursor} + +Gives the previous sibling of the current node, but makes the assumption that +there is one or that we are at the top level. +Without knowing the parent, we cannot otherwise know which the last sibling is, +unless we are at the top level in which case `nothing` is returned. +""" +function prev_sibling_assumed end + +""" + GreenTreeCursor + +Represents a cursor into a ParseStream output buffer that makes it easy to +work with the green tree representation.
+""" +struct GreenTreeCursor + parser_output::Vector{RawGreenNode} + position::UInt32 +end +GreenTreeCursor(stream::ParseStream) = GreenTreeCursor(stream.output, length(stream.output)) +this(node::GreenTreeCursor) = node.parser_output[node.position] + +const SENTINEL_INDEX = UInt32(1) +function prev_sibling_assumed(cursor::GreenTreeCursor) + next_idx = cursor.position - this(cursor).node_span - UInt32(1) + next_idx == SENTINEL_INDEX && return nothing + GreenTreeCursor(cursor.parser_output, next_idx) +end + +# Debug printing +function Base.show(io::IO, node::GreenTreeCursor) + print(io, Base.summary(this(node)), " @", node.position) +end + +# Reverse iterator interface +Base.reverse(node::GreenTreeCursor) = Base.Iterators.Reverse(node) +Base.IteratorSize(::Type{Reverse{GreenTreeCursor}}) = Base.SizeUnknown() +@inline function Base.iterate(node::Reverse{GreenTreeCursor}, + (next_idx, final)::NTuple{2, UInt32} = + (node.itr.position-UInt32(1), node.itr.position - this(node.itr).node_span - UInt32(1)))::Union{Nothing, Tuple{GreenTreeCursor, NTuple{2, UInt32}}} + node = node.itr + while true + next_idx == final && return nothing + next_node = GreenTreeCursor(node.parser_output, next_idx) + nrgn = this(next_node) + if getfield(nrgn, :head).kind == K"TOMBSTONE" + # TOMBSTONED nodes are counted as part of the size of the tree, but + # do not contribute either byte ranges or children. + next_idx -= UInt32(1) + continue + end + # Inlined prev_sibling_assumed + new_next_idx = next_idx - nrgn.node_span - UInt32(1) + return (next_node, (new_next_idx, final)) + end +end + +# Accessors / predicates +is_leaf(node::GreenTreeCursor) = !is_non_terminal(this(node)) +head(node::GreenTreeCursor) = this(node).head +treesize(node::GreenTreeCursor) = this(node).node_span +is_non_terminal(node::GreenTreeCursor) = is_non_terminal(this(node)) + +""" + span(node) + +Get the number of bytes this node covers in the source text. 
+""" +span(node::GreenTreeCursor) = this(node).byte_span + +""" + RedTreeCursor + +Wraps a `GreenTreeCursor` to keep track of the absolute position of the node +in the original source text. +""" +struct RedTreeCursor + green::GreenTreeCursor + # The last byte that is still part of the node + byte_end::UInt32 +end +RedTreeCursor(stream::ParseStream) = RedTreeCursor( + GreenTreeCursor(stream), stream.next_byte - UInt32(1)) + +function prev_sibling_assumed(cursor::RedTreeCursor) + prevgreen = prev_sibling_assumed(cursor.green) + if prevgreen === nothing + return nothing + end + return RedTreeCursor(prevgreen, cursor.byte_end - span(cursor)) +end + + +Base.reverse(node::RedTreeCursor) = Base.Iterators.Reverse(node) +Base.IteratorSize(::Type{Reverse{RedTreeCursor}}) = Base.SizeUnknown() +@inline function Base.iterate(node::Reverse{RedTreeCursor})::Union{Nothing, Tuple{RedTreeCursor, NTuple{3, UInt32}}} + r = iterate(Reverse(node.itr.green)) + return _iterate_red_cursor(r, node.itr.byte_end) +end + +@inline function Base.iterate(node::Reverse{RedTreeCursor}, state::NTuple{3, UInt32})::Union{Nothing, Tuple{RedTreeCursor, NTuple{3, UInt32}}} + r = iterate(Reverse(node.itr.green), Base.tail(state)) + return _iterate_red_cursor(r, first(state)) +end + +@inline function _iterate_red_cursor(r, byte_end) + r === nothing && return nothing + next_node, next_idx = r + return RedTreeCursor(next_node, byte_end), + (byte_end - span(next_node), next_idx...) 
+end + +is_leaf(node::RedTreeCursor) = is_leaf(node.green) +head(node::RedTreeCursor) = head(node.green) +span(node::RedTreeCursor) = span(node.green) +byte_range(node::RedTreeCursor) = (node.byte_end - span(node.green) + UInt32(1)):node.byte_end +treesize(node::RedTreeCursor) = treesize(node.green) +is_non_terminal(node::RedTreeCursor) = is_non_terminal(node.green) + +function Base.show(io::IO, node::RedTreeCursor) + print(io, node.green, " [", byte_range(node), "]") +end + +has_toplevel_siblings(cursor::GreenTreeCursor) = + treesize(cursor)+1 != length(cursor.parser_output)-1 +has_toplevel_siblings(cursor::RedTreeCursor) = + has_toplevel_siblings(cursor.green) +struct TopLevelSiblingIterator{C} + cursor::C +end + +function reverse_toplevel_siblings(cursor::RedTreeCursor) + @assert cursor.green.position == length(cursor.green.parser_output) + TopLevelSiblingIterator(cursor) +end + +function reverse_toplevel_siblings(cursor::GreenTreeCursor) + @assert cursor.position == length(cursor.parser_output) + TopLevelSiblingIterator(cursor) +end + +function Base.iterate(tsi::TopLevelSiblingIterator) + return (tsi.cursor, tsi.cursor) +end +function Base.iterate(cursor::TopLevelSiblingIterator{C}, last::C) where {C} + this = prev_sibling_assumed(last) + this === nothing && return nothing + return (this, this) +end + +# HACK: Force inlining of `filter` for our cursors to avoid significant perf +# degradation. +@inline function Base.iterate(f::Iterators.Filter{<:Any, Iterators.Reverse{T}}, state...) where {T<:Union{RedTreeCursor, GreenTreeCursor}} + y = iterate(f.itr, state...) 
+ while y !== nothing + if f.flt(y[1]) + return y + end + y = iterate(f.itr, y[2]) + end + nothing +end diff --git a/src/utils.jl b/src/utils.jl index 3f95c485..c21c251e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -14,6 +14,15 @@ if VERSION < v"1.5" import Base.peek end +@static if VERSION < v"1.8" + macro callsite_inline(call) + esc(call) + end +else + const var"@callsite_inline" = var"@inline" +end + + _unsafe_wrap_substring(s) = (s.offset, unsafe_wrap(Vector{UInt8}, s.string)) #-------------------------------------------------- diff --git a/test/expr.jl b/test/expr.jl index 200e8764..7651347c 100644 --- a/test/expr.jl +++ b/test/expr.jl @@ -379,7 +379,7 @@ Expr(:call, :f, Expr(:parameters, Expr(:kw, :b, 2))) @test parsestmt("f(a=1; b=2)") == Expr(:call, :f, Expr(:parameters, Expr(:kw, :b, 2)), Expr(:kw, :a, 1)) - @test parsestmt("f(a; b; c)") == + @test parsestmt("f(a; b; c)") == Expr(:call, :f, Expr(:parameters, Expr(:parameters, :c), :b), :a) @test parsestmt("+(a=1,)") == Expr(:call, :+, Expr(:kw, :a, 1)) @@ -389,11 +389,11 @@ # Operator calls: = is not :kw @test parsestmt("(x=1) != 2") == Expr(:call, :!=, Expr(:(=), :x, 1), 2) - @test parsestmt("+(a=1)") == + @test parsestmt("+(a=1)") == Expr(:call, :+, Expr(:(=), :a, 1)) - @test parsestmt("(a=1)'") == + @test parsestmt("(a=1)'") == Expr(Symbol("'"), Expr(:(=), :a, 1)) - @test parsestmt("(a=1)'ᵀ") == + @test parsestmt("(a=1)'ᵀ") == Expr(:call, Symbol("'ᵀ"), Expr(:(=), :a, 1)) # Dotcall @@ -611,8 +611,8 @@ Expr(:generator, :x, Expr(:filter, :z, Expr(:(=), :a, :as), Expr(:(=), :b, :bs))) @test parsestmt("(x for a in as, b in bs for c in cs, d in ds)") == - Expr(:flatten, - Expr(:generator, + Expr(:flatten, + Expr(:generator, Expr(:generator, :x, Expr(:(=), :c, :cs), Expr(:(=), :d, :ds)), Expr(:(=), :a, :as), Expr(:(=), :b, :bs))) @test parsestmt("(x for a in as for b in bs if z)") == @@ -782,7 +782,7 @@ @test parsestmt("global x ~ 1") == Expr(:global, Expr(:call, :~, :x, 1)) @test parsestmt("global x += 1") 
== Expr(:global, Expr(:+=, :x, 1)) - # Parsing of global/local with + # Parsing of global/local with @test parsestmt("global (x,y)") == Expr(:global, :x, :y) @test parsestmt("local (x,y)") == Expr(:local, :x, :y) end diff --git a/test/green_node.jl b/test/green_node.jl index 42d20f52..727c7178 100644 --- a/test/green_node.jl +++ b/test/green_node.jl @@ -3,7 +3,7 @@ @test span(t) == 6 @test !is_leaf(t) - @test head(t) == SyntaxHead(K"call", 0x0008) + @test head(t) == SyntaxHead(K"call", 0x0088) @test span.(children(t)) == [2,1,1,1,1] @test head.(children(t)) == [ SyntaxHead(K"Identifier", 0x0000) diff --git a/test/parse_stream.jl b/test/parse_stream.jl index f5148f27..0eca59b7 100644 --- a/test/parse_stream.jl +++ b/test/parse_stream.jl @@ -20,7 +20,6 @@ using JuliaSyntax: ParseStream, yy end """ - st = ParseStream(code) p1 = position(st) @@ -73,8 +72,6 @@ using JuliaSyntax: ParseStream, @test peek(st) == K"NewlineWs" bump(st, TRIVIA_FLAG) emit(st, p1, K"toplevel") - - @test build_tree(GreenNode, st) isa JuliaSyntax.GreenNode end @testset "ParseStream constructors" begin @@ -106,47 +103,48 @@ end end @testset "ParseStream tree traversal" begin - # NB: ParseStreamPosition.token_index includes an initial sentinel token so - # indices here are one more than "might be expected". + # NB: ParseStreamPosition.node_index includes an initial sentinel token so + # indices here are one more than "might be expected". Additionally, note that + # the byte index points to the first byte after the token. 
st = parse_sexpr("((a b) c)") child1_pos = first_child_position(st, position(st)) - @test child1_pos == ParseStreamPosition(7, 1) - @test first_child_position(st, child1_pos) == ParseStreamPosition(4, 0) - @test last_child_position(st, position(st)) == ParseStreamPosition(9, 0) - @test last_child_position(st, child1_pos) == ParseStreamPosition(6, 0) + @test child1_pos == ParseStreamPosition(7, 8) + @test first_child_position(st, child1_pos) == ParseStreamPosition(4, 4) + @test last_child_position(st, position(st)) == ParseStreamPosition(9, 10) + @test last_child_position(st, child1_pos) == ParseStreamPosition(6, 6) st = parse_sexpr("( (a b) c)") child1_pos = first_child_position(st, position(st)) - @test child1_pos == ParseStreamPosition(8, 1) - @test first_child_position(st, child1_pos) == ParseStreamPosition(5, 0) - @test last_child_position(st, position(st)) == ParseStreamPosition(10, 0) - @test last_child_position(st, child1_pos) == ParseStreamPosition(7, 0) + @test child1_pos == ParseStreamPosition(8, 9) + @test first_child_position(st, child1_pos) == ParseStreamPosition(5, 5) + @test last_child_position(st, position(st)) == ParseStreamPosition(10, 11) + @test last_child_position(st, child1_pos) == ParseStreamPosition(7, 7) st = parse_sexpr("(a (b c))") - @test first_child_position(st, position(st)) == ParseStreamPosition(3, 0) + @test first_child_position(st, position(st)) == ParseStreamPosition(3, 3) child2_pos = last_child_position(st, position(st)) - @test child2_pos == ParseStreamPosition(9, 1) - @test first_child_position(st, child2_pos) == ParseStreamPosition(6, 0) - @test last_child_position(st, child2_pos) == ParseStreamPosition(8, 0) + @test child2_pos == ParseStreamPosition(9, 10) + @test first_child_position(st, child2_pos) == ParseStreamPosition(6, 6) + @test last_child_position(st, child2_pos) == ParseStreamPosition(8, 8) st = parse_sexpr("( a (b c))") - @test first_child_position(st, position(st)) == ParseStreamPosition(4, 0) + @test 
first_child_position(st, position(st)) == ParseStreamPosition(4, 4) child2_pos = last_child_position(st, position(st)) - @test child2_pos == ParseStreamPosition(10, 1) - @test first_child_position(st, child2_pos) == ParseStreamPosition(7, 0) - @test last_child_position(st, child2_pos) == ParseStreamPosition(9, 0) + @test child2_pos == ParseStreamPosition(10, 11) + @test first_child_position(st, child2_pos) == ParseStreamPosition(7, 7) + @test last_child_position(st, child2_pos) == ParseStreamPosition(9, 9) st = parse_sexpr("a (b c)") - @test first_child_position(st, position(st)) == ParseStreamPosition(5, 0) - @test last_child_position(st, position(st)) == ParseStreamPosition(7, 0) + @test first_child_position(st, position(st)) == ParseStreamPosition(5, 5) + @test last_child_position(st, position(st)) == ParseStreamPosition(7, 7) st = parse_sexpr("(a) (b c)") - @test first_child_position(st, position(st)) == ParseStreamPosition(7, 0) - @test last_child_position(st, position(st)) == ParseStreamPosition(9, 0) + @test first_child_position(st, position(st)) == ParseStreamPosition(7, 8) + @test last_child_position(st, position(st)) == ParseStreamPosition(9, 10) st = parse_sexpr("(() ())") - @test first_child_position(st, position(st)) == ParseStreamPosition(4, 1) - @test last_child_position(st, position(st)) == ParseStreamPosition(7, 2) + @test first_child_position(st, position(st)) == ParseStreamPosition(4, 5) + @test last_child_position(st, position(st)) == ParseStreamPosition(7, 9) end @testset "SubString{GenericString} (issue #505)" begin diff --git a/test/parser.jl b/test/parser.jl index f208e24c..f0ff0f51 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -5,9 +5,7 @@ function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6", show_kws stream = ParseStream(code, version=v) production(ParseState(stream)) JuliaSyntax.validate_tokens(stream) - t = build_tree(GreenNode, stream) - source = SourceFile(code) - s = SyntaxNode(source, t, keep_parens=true) + s 
= build_tree(SyntaxNode, stream, keep_parens=true) return sprint(io->show(io, MIME("text/x.sexpression"), s; show_kws...)) end diff --git a/test/syntax_tree.jl b/test/syntax_tree.jl index 2fac0d6b..3e2361ca 100644 --- a/test/syntax_tree.jl +++ b/test/syntax_tree.jl @@ -28,7 +28,6 @@ "(call-i (call-i a::Identifier *::Identifier b::Identifier) +::Identifier c::Identifier)" @test sprint(highlight, t[1][3]) == "a*b + c\n# ╙" - @test sprint(highlight, t.source, t.raw, 1, 3) == "a*b + c\n# ╙" # Pass-through field access node = t[1][1] @@ -40,7 +39,6 @@ # Newline-terminated source t = parsestmt(SyntaxNode, "a*b + c\n") @test sprint(highlight, t[1][3]) == "a*b + c\n# ╙" - @test sprint(highlight, t.source, t.raw, 1, 3) == "a*b + c\n# ╙" # copy t = parsestmt(SyntaxNode, "a*b + c") @@ -58,8 +56,8 @@ # SyntaxNode with offsets t,_ = parsestmt(SyntaxNode, "begin a end\nbegin b end", 13) - @test t.position == 13 - @test t[1].position == 19 + @test first(byte_range(t)) == 13 + @test first(byte_range(t[1])) == 19 @test t[1].val == :b # Unicode character ranges diff --git a/test/test_utils.jl b/test/test_utils.jl index 7553bf1c..dae16cc0 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -18,6 +18,7 @@ using .JuliaSyntax: @K_str, # Nodes GreenNode, + RedTreeCursor, SyntaxNode, ErrorVal, # Node inspection @@ -131,7 +132,7 @@ function exprs_roughly_equal(fl_ex, ex) args = ex.head in (:block, :quote, :toplevel) ? filter(x->!(x isa LineNumberNode), ex.args) : ex.args - if (fl_ex.head == :block && ex.head == :tuple && + if (fl_ex.head == :block && ex.head == :tuple && length(fl_args) == 2 && length(args) == 2 && Meta.isexpr(args[1], :parameters, 1) && exprs_roughly_equal(fl_args[2], args[1].args[1]) &&