Commit 840fcca

RFC/WIP: Reorganize into largely self-contained pieces
As I mentioned in #560, and as contemplated in #536, I'd like to try re-using JuliaParser infrastructure to replace parsers I've written for some other languages. This commit takes the first step by moving various files into directories according to whether they are language-dependent or not. Right now there is still some coupling and, of course, there are no actual abstractions between these pieces; the idea would be to introduce those over time. For now, if we land this refactoring, the way to use it would be to copy the appropriate pieces (at least `core/`) into your downstream parser and then rewrite that parser against these APIs. I'm planning to do that with a parser or two to see whether I hit any big API issues and what it would take to actually make the re-use happen.
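
To make the intent concrete, here is a minimal sketch of what downstream re-use could look like once the `core/` pieces are copied into another parser package. This is an illustration only: the `MyLangSyntax` module and all `mylang/` paths are hypothetical placeholders, not part of this commit, and the exact set of `core/` files a downstream parser needs may differ.

    # Hypothetical downstream layout mirroring the core/ vs julia/ split
    # (all MyLang names are placeholders)
    module MyLangSyntax

    # Language-independent infrastructure copied from JuliaSyntax's core/
    include("core/diagnostics.jl")
    include("core/parse_stream.jl")   # ParseStream, RawGreenNode, flag machinery
    include("core/tree_cursors.jl")

    # Language-specific layer written against the core/ APIs, analogous to
    # julia/julia_parse_stream.jl and julia/parser.jl in this repository
    include("mylang/mylang_parse_stream.jl")   # MyLang token kinds and head flags
    include("mylang/parser.jl")                # the MyLang parser itself

    end
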
1 parent 55c316a commit 840fcca

File tree

4 files changed: +316 −315 lines changed

src/JuliaSyntax.jl

Lines changed: 2 additions & 1 deletion
@@ -90,12 +90,13 @@ include("core/diagnostics.jl")
 
 # Parsing
 include("core/parse_stream.jl")
+include("core/tree_cursors.jl")
+include("julia/julia_parse_stream.jl")
 include("julia/parser.jl")
 include("julia/parser_api.jl")
 include("julia/literal_parsing.jl")
 
 # Tree data structures
-include("core/tree_cursors.jl")
 include("porcelain/green_node.jl")
 include("porcelain/syntax_tree.jl")
 include("integration/expr.jl")

src/core/parse_stream.jl

Lines changed: 0 additions & 313 deletions
@@ -9,93 +9,11 @@ const EMPTY_FLAGS = RawFlags(0)
 # Set for tokens or ranges which are syntax trivia after parsing
 const TRIVIA_FLAG = RawFlags(1<<0)
 
-# Token flags - may be set for operator kinded tokens
-# Operator is dotted
-const DOTOP_FLAG = RawFlags(1<<1)
-# Operator has a suffix
-const SUFFIXED_FLAG = RawFlags(1<<2)
-
-# Set for K"call", K"dotcall" or any syntactic operator heads
-# Distinguish various syntaxes which are mapped to K"call"
-const PREFIX_CALL_FLAG = RawFlags(0<<3)
-const INFIX_FLAG = RawFlags(1<<3)
-const PREFIX_OP_FLAG = RawFlags(2<<3)
-const POSTFIX_OP_FLAG = RawFlags(3<<3)
-
-# The following flags are quite head-specific and may overlap
-
-"""
-Set when K"string" or K"cmdstring" was triple-delimited as with \"\"\" or ```
-"""
-const TRIPLE_STRING_FLAG = RawFlags(1<<5)
-
-"""
-Set when a K"string", K"cmdstring" or K"Identifier" needs raw string unescaping
-"""
-const RAW_STRING_FLAG = RawFlags(1<<6)
-
-"""
-Set for K"tuple", K"block" or K"macrocall" which are delimited by parentheses
-"""
-const PARENS_FLAG = RawFlags(1<<5)
-
-"""
-Set for various delimited constructs when they contains a trailing comma. For
-example, to distinguish `(a,b,)` vs `(a,b)`, and `f(a)` vs `f(a,)`. Kinds where
-this applies are: `tuple call dotcall macrocall vect curly braces <: >:`.
-"""
-const TRAILING_COMMA_FLAG = RawFlags(1<<6)
-
-"""
-Set for K"quote" for the short form `:x` as opposed to long form `quote x end`
-"""
-const COLON_QUOTE = RawFlags(1<<5)
-
-"""
-Set for K"toplevel" which is delimited by parentheses
-"""
-const TOPLEVEL_SEMICOLONS_FLAG = RawFlags(1<<5)
-
-"""
-Set for K"function" in short form definitions such as `f() = 1`
-"""
-const SHORT_FORM_FUNCTION_FLAG = RawFlags(1<<5)
-
-"""
-Set for K"struct" when mutable
-"""
-const MUTABLE_FLAG = RawFlags(1<<5)
-
-"""
-Set for K"module" when it's not bare (`module`, not `baremodule`)
-"""
-const BARE_MODULE_FLAG = RawFlags(1<<5)
-
 """
 Set for nodes that are non-terminals
 """
 const NON_TERMINAL_FLAG = RawFlags(1<<7)
 
-# Flags holding the dimension of an nrow or other UInt8 not held in the source
-# TODO: Given this is only used for nrow/ncat, we could actually use all the flags?
-const NUMERIC_FLAGS = RawFlags(RawFlags(0xff)<<8)
-
-function set_numeric_flags(n::Integer)
-    f = RawFlags((n << 8) & NUMERIC_FLAGS)
-    if numeric_flags(f) != n
-        error("Numeric flags unable to hold large integer $n")
-    end
-    f
-end
-
-function call_type_flags(f::RawFlags)
-    f & 0b11000
-end
-
-function numeric_flags(f::RawFlags)
-    Int((f >> 8) % UInt8)
-end
-
 function remove_flags(n::RawFlags, fs...)
     RawFlags(n & ~(RawFlags((|)(fs...))))
 end
@@ -138,47 +56,6 @@ function Base.summary(head::SyntaxHead)
     untokenize(head, unique=false, include_flag_suff=false)
 end
 
-function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true)
-    str = (is_error(kind(head)) ? untokenize(kind(head); unique=false) :
-           untokenize(kind(head); unique=unique))::String
-    if is_dotted(head)
-        str = "."*str
-    end
-    if include_flag_suff
-        # Ignore DOTOP_FLAG - it's represented above with . prefix
-        is_trivia(head) && (str = str*"-t")
-        is_infix_op_call(head) && (str = str*"-i")
-        is_prefix_op_call(head) && (str = str*"-pre")
-        is_postfix_op_call(head) && (str = str*"-post")
-
-        k = kind(head)
-        if k in KSet"string cmdstring Identifier"
-            has_flags(head, TRIPLE_STRING_FLAG) && (str = str*"-s")
-            has_flags(head, RAW_STRING_FLAG) && (str = str*"-r")
-        elseif k in KSet"tuple block macrocall"
-            has_flags(head, PARENS_FLAG) && (str = str*"-p")
-        elseif k == K"quote"
-            has_flags(head, COLON_QUOTE) && (str = str*"-:")
-        elseif k == K"toplevel"
-            has_flags(head, TOPLEVEL_SEMICOLONS_FLAG) && (str = str*"-;")
-        elseif k == K"function"
-            has_flags(head, SHORT_FORM_FUNCTION_FLAG) && (str = str*"-=")
-        elseif k == K"struct"
-            has_flags(head, MUTABLE_FLAG) && (str = str*"-mut")
-        elseif k == K"module"
-            has_flags(head, BARE_MODULE_FLAG) && (str = str*"-bare")
-        end
-        if k in KSet"tuple call dotcall macrocall vect curly braces <: >:" &&
-                has_flags(head, TRAILING_COMMA_FLAG)
-            str *= "-,"
-        end
-        is_suffixed(head) && (str = str*"-suf")
-        n = numeric_flags(head)
-        n != 0 && (str = str*"-"*string(n))
-    end
-    str
-end
-
 #-------------------------------------------------------------------------------
 # Generic interface for types `T` which have kind and flags. Either:
 # 1. Define kind(::T) and flags(::T), or
@@ -200,65 +77,6 @@ invisible to the parser (eg, whitespace) or implied by the structure of the AST
 """
 is_trivia(x) = has_flags(x, TRIVIA_FLAG)
 
-"""
-    is_prefix_call(x)
-
-Return true for normal prefix function call syntax such as the `f` call node
-parsed from `f(x)`.
-"""
-is_prefix_call(x) = call_type_flags(x) == PREFIX_CALL_FLAG
-
-"""
-    is_infix_op_call(x)
-
-Return true for infix operator calls such as the `+` call node parsed from
-`x + y`.
-"""
-is_infix_op_call(x) = call_type_flags(x) == INFIX_FLAG
-
-"""
-    is_prefix_op_call(x)
-
-Return true for prefix operator calls such as the `+` call node parsed from `+x`.
-"""
-is_prefix_op_call(x) = call_type_flags(x) == PREFIX_OP_FLAG
-
-"""
-    is_postfix_op_call(x)
-
-Return true for postfix operator calls such as the `'ᵀ` call node parsed from `x'ᵀ`.
-"""
-is_postfix_op_call(x) = call_type_flags(x) == POSTFIX_OP_FLAG
-
-"""
-    is_dotted(x)
-
-Return true for dotted syntax tokens
-"""
-is_dotted(x) = has_flags(x, DOTOP_FLAG)
-
-"""
-    is_suffixed(x)
-
-Return true for operators which have suffixes, such as `+₁`
-"""
-is_suffixed(x) = has_flags(x, SUFFIXED_FLAG)
-
-"""
-    is_decorated(x)
-
-Return true for operators which are decorated with a dot or suffix.
-"""
-is_decorated(x) = is_dotted(x) || is_suffixed(x)
-
-"""
-    numeric_flags(x)
-
-Return the number attached to a `SyntaxHead`. This is only for kinds `K"nrow"`
-and `K"ncat"`, for now.
-"""
-numeric_flags(x) = numeric_flags(flags(x))
-
 #-------------------------------------------------------------------------------
 """
 `SyntaxToken` is a token covering a contiguous byte range in the input text.
@@ -962,45 +780,6 @@ function bump_glue(stream::ParseStream, kind, flags)
     return position(stream)
 end
 
-"""
-    bump_split(stream, token_spec1, [token_spec2 ...])
-
-Bump the next token, splitting it into several pieces
-
-Tokens are defined by a number of `token_spec` of shape `(nbyte, kind, flags)`.
-If all `nbyte` are positive, the sum must equal the token length. If one
-`nbyte` is negative, that token is given `tok_len + nbyte` bytes and the sum of
-all `nbyte` must equal zero.
-
-This is a hack which helps resolves the occasional lexing ambiguity. For
-example
-* Whether .+ should be a single token or the composite (. +) which is used for
-  standalone operators.
-* Whether ... is splatting (most of the time) or three . tokens in import paths
-
-TODO: Are these the only cases? Can we replace this general utility with a
-simpler one which only splits preceding dots?
-"""
-function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N}
-    tok = stream.lookahead[stream.lookahead_index]
-    stream.lookahead_index += 1
-    start_b = _next_byte(stream)
-    toklen = tok.next_byte - start_b
-    prev_b = start_b
-    for (i, (nbyte, k, f)) in enumerate(split_spec)
-        h = SyntaxHead(k, f)
-        actual_nbyte = nbyte < 0 ? (toklen + nbyte) : nbyte
-        orig_k = k == K"." ? K"." : kind(tok)
-        node = RawGreenNode(h, actual_nbyte, orig_k)
-        push!(stream.output, node)
-        prev_b += actual_nbyte
-        stream.next_byte += actual_nbyte
-    end
-    @assert tok.next_byte == prev_b
-    stream.peek_count = 0
-    return position(stream)
-end
-
 """
 Reset kind or flags of an existing node in the output stream
 
@@ -1129,98 +908,6 @@ function emit_diagnostic(diagnostics::AbstractVector{Diagnostic},
     push!(diagnostics, Diagnostic(first(byterange), last(byterange); kws...))
 end
 
-#-------------------------------------------------------------------------------
-# ParseStream Post-processing
-
-function validate_tokens(stream::ParseStream)
-    txtbuf = unsafe_textbuf(stream)
-    charbuf = IOBuffer()
-
-    # Process terminal nodes in the output
-    fbyte = stream.output[1].byte_span+1 # Start after sentinel
-    for i = 2:length(stream.output)
-        node = stream.output[i]
-        if !is_terminal(node) || kind(node) == K"TOMBSTONE"
-            continue
-        end
-
-        k = kind(node)
-        nbyte = fbyte + node.byte_span
-        tokrange = fbyte:nbyte-1
-        error_kind = K"None"
-
-        if k in KSet"Integer BinInt OctInt HexInt"
-            # The following shouldn't be able to error...
-            # parse_int_literal
-            # parse_uint_literal
-        elseif k == K"Float" || k == K"Float32"
-            underflow0 = false
-            if k == K"Float"
-                x, code = parse_float_literal(Float64, txtbuf, fbyte, nbyte)
-                # jl_strtod_c can return "underflow" even for valid cases such
-                # as `5e-324` where the source is an exact representation of
-                # `x`. So only warn when underflowing to zero.
-                underflow0 = code === :underflow && x == 0
-            else
-                x, code = parse_float_literal(Float32, txtbuf, fbyte, nbyte)
-                underflow0 = code === :underflow && x == 0
-            end
-            if code === :ok
-                # pass
-            elseif code === :overflow
-                emit_diagnostic(stream, tokrange,
-                                error="overflow in floating point literal")
-                error_kind = K"ErrorNumericOverflow"
-            elseif underflow0
-                emit_diagnostic(stream, tokrange,
-                                warning="underflow to zero in floating point literal")
-            end
-        elseif k == K"Char"
-            @assert fbyte < nbyte # Already handled in the parser
-            truncate(charbuf, 0)
-            had_error = unescape_julia_string(charbuf, txtbuf, fbyte,
-                                              nbyte, stream.diagnostics)
-            if had_error
-                error_kind = K"ErrorInvalidEscapeSequence"
-            else
-                seek(charbuf,0)
-                read(charbuf, Char)
-                if !eof(charbuf)
-                    error_kind = K"ErrorOverLongCharacter"
-                    emit_diagnostic(stream, tokrange,
-                                    error="character literal contains multiple characters")
-                end
-            end
-        elseif k == K"String" && !has_flags(node, RAW_STRING_FLAG)
-            had_error = unescape_julia_string(devnull, txtbuf, fbyte,
-                                              nbyte, stream.diagnostics)
-            if had_error
-                error_kind = K"ErrorInvalidEscapeSequence"
-            end
-        elseif is_error(k) && k != K"error"
-            # Emit messages for non-generic token errors
-            tokstr = String(txtbuf[tokrange])
-            msg = if k in KSet"ErrorInvisibleChar ErrorUnknownCharacter ErrorIdentifierStart"
-                "$(_token_error_descriptions[k]) $(repr(tokstr[1]))"
-            elseif k in KSet"ErrorInvalidUTF8 ErrorBidiFormatting"
-                "$(_token_error_descriptions[k]) $(repr(tokstr))"
-            else
-                _token_error_descriptions[k]
-            end
-            emit_diagnostic(stream, tokrange, error=msg)
-        end
-
-        if error_kind != K"None"
-            # Update the node with new error kind
-            stream.output[i] = RawGreenNode(SyntaxHead(error_kind, EMPTY_FLAGS),
-                                            node.byte_span, node.orig_kind)
-        end
-
-        fbyte = nbyte
-    end
-    sort!(stream.diagnostics, by=first_byte)
-end
-
 # Tree construction from the list of text ranges held by ParseStream
 
 # API for extracting results from ParseStream
