
Commit 35dc3bc

Parse non-syntactic operator tokens as K"Identifier" kind (#523)
Most operators are semantically just normal identifiers after parsing, so they should get the kind `K"Identifier"`. For example, after this change the `+` token in `a + b` parses with kind `K"Identifier"`.

As an exception, standalone syntactic operators keep their kind: they can't sensibly be used as identifiers or interpolated into expressions in the normal way because they have their own syntactic forms. Keeping their kind also helps in `Expr` conversion, where dotted syntactic operators have their own rules for coalescing with the dot into a single symbol.

Also introduce a new keyword `operators_as_identifiers` in the `tokenize()` API to accommodate simple uses of that API which colour token strings by operator type, even when the operator is semantically in identifier position.
1 parent 4a7e846 commit 35dc3bc
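
For illustration, here is a small sketch of the intended behaviour (not part of the commit; it assumes the JuliaSyntax public names `parsestmt`, `SyntaxNode`, `kind`, and the internal `JuliaSyntax.children` accessor):

    using JuliaSyntax

    # After this change the infix operator parses as an ordinary identifier:
    tree = parsestmt(SyntaxNode, "a + b")
    kind(tree)                         # K"call"
    kind.(JuliaSyntax.children(tree))  # expected: K"Identifier" for `a`, `+` and `b`
                                       # (previously the `+` child had kind K"+")

    # Semantics are unchanged: `Expr` conversion still produces a normal call.
    parsestmt(Expr, "a + b")           # :(a + b)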

File tree

9 files changed: +119, -41 lines


src/expr.jl

Lines changed: 1 addition & 1 deletion
@@ -297,7 +297,7 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads,
             if !@isexpr(a2, :quote) && !(a2 isa QuoteNode)
                 args[2] = QuoteNode(a2)
             end
-        elseif length(args) == 1 && is_operator(childheads[1])
+        elseif length(args) == 1
             # Hack: Here we preserve the head of the operator to determine whether
             # we need to coalesce it with the dot into a single symbol later on.
             args[1] = (childheads[1], args[1])

src/kinds.jl

Lines changed: 9 additions & 0 deletions
@@ -1230,3 +1230,12 @@ function is_whitespace(x)
     k = kind(x)
     return k == K"Whitespace" || k == K"NewlineWs" || k == K"Comment"
 end
+
+function is_syntactic_operator(x)
+    k = kind(x)
+    # TODO: Do we need to disallow dotted and suffixed forms when this is used
+    # in the parser? The lexer itself usually disallows such tokens, so it's
+    # not clear whether we need to handle them. (Though note `.->` is a
+    # token...)
+    return k in KSet"&& || . ... ->" || is_syntactic_assignment(k)
+end
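
As a rough sketch of how this predicate separates the two groups (a hypothetical REPL session; `is_syntactic_operator` and `is_syntactic_assignment` are internal helpers, and `@K_str` is assumed to be in scope via `using JuliaSyntax`):

    using JuliaSyntax

    # Syntactic operators keep their own kind after parsing...
    JuliaSyntax.is_syntactic_operator(K"&&")  # true
    JuliaSyntax.is_syntactic_operator(K"->")  # true
    JuliaSyntax.is_syntactic_operator(K"+=")  # true (syntactic assignment)

    # ...while plain operators are remapped to K"Identifier".
    JuliaSyntax.is_syntactic_operator(K"+")   # false
    JuliaSyntax.is_syntactic_operator(K"|>")  # false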

src/parse_stream.jl

Lines changed: 2 additions & 1 deletion
@@ -890,7 +890,8 @@ function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N}
     for (i, (nbyte, k, f)) in enumerate(split_spec)
         h = SyntaxHead(k, f)
         b = (i == length(split_spec)) ? tok.next_byte : b + nbyte
-        push!(stream.tokens, SyntaxToken(h, kind(tok), false, b))
+        orig_k = k == K"." ? K"." : kind(tok)
+        push!(stream.tokens, SyntaxToken(h, orig_k, false, b))
     end
     stream.peek_count = 0
     return position(stream)

src/parser.jl

Lines changed: 25 additions & 27 deletions
@@ -382,22 +382,22 @@ function parse_LtoR(ps::ParseState, down, is_op)
     down(ps)
     while is_op(peek(ps))
         t = peek_token(ps)
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
        down(ps)
         emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
     end
 end

 # parse right-to-left binary operator
-# produces structures like (= a (= b (= c d)))
+# produces structures like (=> a (=> b (=> c d)))
 #
 # flisp: parse-RtoL
 function parse_RtoL(ps::ParseState, down, is_op, self)
     mark = position(ps)
     down(ps)
     t = peek_token(ps)
     if is_op(kind(t))
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
         self(ps)
         emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
     end
@@ -624,7 +624,7 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where {
         # a .~ b ==> (dotcall-i a ~ b)
         # [a ~ b c] ==> (hcat (call-i a ~ b) c)
         # [a~b] ==> (vect (call-i a ~ b))
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
         bump_trivia(ps)
         parse_assignment(ps, down)
         emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
@@ -759,7 +759,7 @@ function parse_arrow(ps::ParseState)
         # x <--> y ==> (call-i x <--> y)
         # x .--> y ==> (dotcall-i x --> y)
         # x -->₁ y ==> (call-i x -->₁ y)
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
         parse_arrow(ps)
         emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
     end
@@ -821,7 +821,7 @@ function parse_comparison(ps::ParseState, subtype_comparison=false)
     while (t = peek_token(ps); is_prec_comparison(t))
         n_comparisons += 1
         op_dotted = is_dotted(t)
-        op_pos = bump_dotsplit(ps, emit_dot_node=true)
+        op_pos = bump_dotsplit(ps, emit_dot_node=true, remap_kind=K"Identifier")
         parse_pipe_lt(ps)
     end
     if n_comparisons == 1
@@ -881,7 +881,7 @@ function parse_range(ps::ParseState)
        # a..b ==> (call-i a .. b)
         # a … b ==> (call-i a … b)
         # a .… b ==> (dotcall-i a … b)
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
         parse_invalid_ops(ps)
         emit(ps, mark, is_dotted(initial_tok) ? K"dotcall" : K"call", INFIX_FLAG)
     elseif initial_kind == K":" && ps.range_colon_enabled
@@ -904,17 +904,17 @@ function parse_range(ps::ParseState)
             # a :> b ==> (call-i a (error : >) b)
             bump_trivia(ps, skip_newlines=false)
             emark = position(ps)
-            bump(ps) # K":"
+            bump(ps, remap_kind=K"Identifier") # K":"
             ks = untokenize(peek(ps))
-            bump(ps) # K"<" or K">"
+            bump(ps, remap_kind=K"Identifier") # K"<" or K">"
             emit(ps, emark, K"error",
                  error="Invalid `:$ks` found, maybe replace with `$ks:`")
             parse_invalid_ops(ps)
             emit(ps, mark, K"call", INFIX_FLAG)
             break
         end
         n_colons += 1
-        bump(ps, n_colons == 1 ? EMPTY_FLAGS : TRIVIA_FLAG)
+        bump(ps, n_colons == 1 ? EMPTY_FLAGS : TRIVIA_FLAG; remap_kind=K"Identifier")
         had_newline = peek(ps) == K"NewlineWs"
         t = peek_token(ps)
         if is_closing_token(ps, kind(t))
@@ -1008,7 +1008,7 @@ function parse_with_chains(ps::ParseState, down, is_op, chain_ops)
            # [x+y + z] ==> (vect (call-i x + y z))
             break
         end
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
         down(ps)
         if kind(t) in chain_ops && !is_decorated(t)
             # a + b + c ==> (call-i a + b c)
@@ -1217,7 +1217,7 @@ function parse_unary(ps::ParseState)
         # unary negation
         # -2^x ==> (call-pre - (call-i 2 ^ x))
         # -2[1, 3] ==> (call-pre - (ref 2 1 3))
-        bump(ps)
+        bump(ps, remap_kind=K"Identifier")
         parse_factor(ps)
         emit(ps, mark, K"call", PREFIX_OP_FLAG)
     else
@@ -1256,7 +1256,7 @@ function parse_unary(ps::ParseState)
         #
         # (The flisp parser only considers commas before `;` and thus gets this
         # last case wrong)
-        op_pos = bump_dotsplit(ps, emit_dot_node=true)
+        op_pos = bump_dotsplit(ps, emit_dot_node=true, remap_kind=K"Identifier")

         space_before_paren = preceding_whitespace(t2)
         if space_before_paren
@@ -1303,7 +1303,7 @@ function parse_unary(ps::ParseState)
             if is_type_operator(op_t)
                 # <:(a,) ==> (<: a)
                 emit(ps, mark, op_k, opts.delim_flags)
-                reset_node!(ps, op_pos, flags=TRIVIA_FLAG)
+                reset_node!(ps, op_pos, flags=TRIVIA_FLAG, kind=op_k)
             else
                 emit(ps, mark, K"call", opts.delim_flags)
             end
@@ -1329,7 +1329,7 @@ function parse_unary(ps::ParseState)
             if is_type_operator(op_t)
                 # <:(a) ==> (<:-pre (parens a))
                 emit(ps, mark, op_k, PREFIX_OP_FLAG)
-                reset_node!(ps, op_pos, flags=TRIVIA_FLAG)
+                reset_node!(ps, op_pos, flags=TRIVIA_FLAG, kind=op_k)
             else
                 if is_dotted(op_t)
                     emit(ps, mark, K"dotcall", PREFIX_OP_FLAG)
@@ -1349,12 +1349,12 @@ function parse_unary(ps::ParseState)
         # -0x1 ==> (call-pre - 0x01)
         # - 2 ==> (call-pre - 2)
         # .-2 ==> (dotcall-pre - 2)
-        op_pos = bump_dotsplit(ps, EMPTY_FLAGS)
+        op_pos = bump_dotsplit(ps, EMPTY_FLAGS, remap_kind=K"Identifier")
     else
         # /x ==> (call-pre (error /) x)
         # +₁ x ==> (call-pre (error +₁) x)
         # .<: x ==> (dotcall-pre (error (. <:)) x)
-        bump_dotsplit(ps, EMPTY_FLAGS, emit_dot_node=true)
+        bump_dotsplit(ps, EMPTY_FLAGS, emit_dot_node=true, remap_kind=K"Identifier")
         op_pos = emit(ps, mark, K"error", error="not a unary operator")
     end
     parse_unary(ps)
@@ -1385,7 +1385,7 @@ end
 function parse_factor_with_initial_ex(ps::ParseState, mark)
     parse_decl_with_initial_ex(ps, mark)
     if (t = peek_token(ps); is_prec_power(kind(t)))
-        bump_dotsplit(ps)
+        bump_dotsplit(ps, remap_kind=K"Identifier")
         parse_factor_after(ps)
         emit(ps, mark, is_dotted(t) ? K"dotcall" : K"call", INFIX_FLAG)
     end
@@ -1687,11 +1687,12 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
                 macro_atname_range = (m, position(ps))
                 emit(ps, mark, K".")
             elseif k == K"'"
+                # f.' => f (error-t (. '))
+                bump_dotsplit(ps, remap_kind=K"Identifier")
                 # TODO: Reclaim dotted postfix operators :-)
-                # f.' => f (error-t ')
-                bump(ps)
-                emit(ps, emark, K"error", TRIVIA_FLAG,
+                emit(ps, emark, K"error",
                      error="the .' operator for transpose is discontinued")
+                emit(ps, mark, K"dotcall", POSTFIX_OP_FLAG)
             else
                 # Field/property syntax
                 # f.x.y ==> (. (. f x) y)
@@ -1703,7 +1704,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
         elseif k == K"'" && !preceding_whitespace(t)
             # f' ==> (call-post f ')
             # f'ᵀ ==> (call-post f 'ᵀ)
-            bump(ps)
+            bump(ps, remap_kind=K"Identifier")
             emit(ps, mark, K"call", POSTFIX_OP_FLAG)
         elseif k == K"{"
             # Type parameter curlies and macro calls
@@ -3554,11 +3555,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
         # + ==> +
         # .+ ==> (. +)
         # .= ==> (. =)
-        if is_dotted(peek_token(ps))
-            bump_dotsplit(ps, emit_dot_node=true)
-        else
-            bump(ps, remap_kind=K"Identifier")
-        end
+        bump_dotsplit(ps, emit_dot_node=true, remap_kind=
+            is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier")
         if check_identifiers && !is_valid_identifier(leading_kind)
             # += ==> (error +=)
             # ? ==> (error ?)

src/parser_api.jl

Lines changed: 16 additions & 3 deletions
@@ -174,15 +174,20 @@ Token() = Token(SyntaxHead(K"None", EMPTY_FLAGS), 0:0)
 head(t::Token) = t.head

 """
-    tokenize(text)
+    tokenize(text; operators_as_identifiers=true)

 Returns the tokenized UTF-8 encoded `text` as a vector of `Token`s. The
 text for the token can be retrieved by using `untokenize()`. The full text can be
 reconstructed with, for example, `join(untokenize.(tokenize(text), text))`.

 This interface works on UTF-8 encoded string or buffer data only.
+
+The keyword `operators_as_identifiers` specifies whether operators in
+identifier-position should have `K"Identifier"` as their kind, or be emitted as
+more specific operator kinds. For example, whether the `+` in `a + b` should be
+emitted as `K"Identifier"` (the default) or as `K"+"`.
 """
-function tokenize(text)
+function tokenize(text; operators_as_identifiers=true)
     ps = ParseStream(text)
     parse!(ps, rule=:all)
     ts = ps.tokens
@@ -192,7 +197,15 @@ function tokenize(text)
             continue
         end
         r = ts[i-1].next_byte:ts[i].next_byte-1
-        push!(output_tokens, Token(head(ts[i]), r))
+        k = kind(ts[i])
+        if k == K"Identifier" && !operators_as_identifiers
+            orig_k = ts[i].orig_kind
+            if is_operator(orig_k) && !is_word_operator(orig_k)
+                k = orig_k
+            end
+        end
+        f = flags(ts[i])
+        push!(output_tokens, Token(SyntaxHead(k,f), r))
     end
     output_tokens
 end
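
As a usage sketch of the new keyword (not from the commit), a simple token-colouring loop can ask for operator-specific kinds while normal parsing keeps identifier semantics:

    using JuliaSyntax

    str = "a .+ b"
    # Default: the `+` token has kind K"Identifier", like any other name.
    [kind(t) => untokenize(t, str) for t in tokenize(str)]

    # For syntax highlighting, the specific operator kind is often more useful:
    for t in tokenize(str; operators_as_identifiers=false)
        println(rpad(repr(untokenize(t, str)), 8), " => ", kind(t))
    end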

test/green_node.jl

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
     @test head.(children(t)) == [
         SyntaxHead(K"Identifier", 0x0000)
         SyntaxHead(K"Whitespace", 0x0001)
-        SyntaxHead(K"+", 0x0000)
+        SyntaxHead(K"Identifier", 0x0000)
         SyntaxHead(K"Whitespace", 0x0001)
         SyntaxHead(K"Identifier", 0x0000)
     ]

test/parser.jl

Lines changed: 42 additions & 4 deletions
@@ -1,14 +1,14 @@
 """
 Parse string to SyntaxNode tree and show as an sexpression
 """
-function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6")
+function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6", show_kws...)
     stream = ParseStream(code, version=v)
     production(ParseState(stream))
     JuliaSyntax.validate_tokens(stream)
     t = build_tree(GreenNode, stream)
     source = SourceFile(code)
     s = SyntaxNode(source, t, keep_parens=true)
-    return sprint(show, MIME("text/x.sexpression"), s)
+    return sprint(io->show(io, MIME("text/x.sexpression"), s; show_kws...))
 end

 function test_parse(production, input, output)
@@ -29,7 +29,7 @@ function test_parse(inout::Pair)
     test_parse(JuliaSyntax.parse_toplevel, inout...)
 end

-const PARSE_ERROR = r"\(error-t "
+PARSE_ERROR = r"\(error-t "

 with_version(v::VersionNumber, (i,o)::Pair) = ((;v=v), i) => o

@@ -436,7 +436,7 @@ tests = [
     "A.@x a" => "(macrocall (. A @x) a)"
     "@A.B.@x a" => "(macrocall (. (. A B) (error-t) @x) a)"
     # .' discontinued
-    "f.'" => "(wrapper f (error-t '))"
+    "f.'" => "(dotcall-post f (error '))"
     # Field/property syntax
     "f.x.y" => "(. (. f x) y)"
     "x .y" => "(. x (error-t) y)"
@@ -1112,6 +1112,44 @@ parsestmt_test_specs = [
     end
 end

+parsestmt_with_kind_tests = [
+    # Most operators are semantically just normal identifiers after parsing so
+    # get the Kind K"Identifier"
+    "+" => "+::Identifier"
+    "a + b" => "(call-i a::Identifier +::Identifier b::Identifier)"
+    "a .+ b" => "(dotcall-i a::Identifier +::Identifier b::Identifier)"
+    "a |> b" => "(call-i a::Identifier |>::Identifier b::Identifier)"
+    "a => b" => "(call-i a::Identifier =>::Identifier b::Identifier)"
+    "a → b" => "(call-i a::Identifier →::Identifier b::Identifier)"
+    "a < b < c" => "(comparison a::Identifier <::Identifier b::Identifier <::Identifier c::Identifier)"
+    "a .<: b" => "(dotcall-i a::Identifier <:::Identifier b::Identifier)"
+    "a .. b" => "(call-i a::Identifier ..::Identifier b::Identifier)"
+    "a : b" => "(call-i a::Identifier :::Identifier b::Identifier)"
+    "-2^x" => "(call-pre -::Identifier (call-i 2::Integer ^::Identifier x::Identifier))"
+    "-(2)" => "(call-pre -::Identifier (parens 2::Integer))"
+    "<:(a,)" => "(<:-, a::Identifier)"
+    "- 2" => "(call-pre -::Identifier 2::Integer)"
+    "/x" => "(call-pre (error /::Identifier) x::Identifier)"
+    "a^b" => "(call-i a::Identifier ^::Identifier b::Identifier)"
+    "f.'" => "(dotcall-post f::Identifier (error '::Identifier))"
+    "f'" => "(call-post f::Identifier '::Identifier)"
+    # Standalone syntactic ops which keep their kind - they can't really be
+    # used in a sane way as identifiers or interpolated into expressions
+    # because they have their own syntactic forms.
+    ":(::)" => "(quote-: (parens ::::::))"
+    ":(\$)" => "(quote-: (parens \$::\$))"
+    ":(<:)" => "(quote-: (parens <:::<:))"
+    ":(&&)" => "(quote-: (parens &&::&&))"
+    ":(=)" => "(quote-: (parens =::=))"
+]
+
+@testset "parser `Kind` remapping" begin
+    @testset "$(repr(input))" for (input, output) in parsestmt_with_kind_tests
+        input = ((show_kind=true,), input)
+        test_parse(JuliaSyntax.parse_stmts, input, output)
+    end
+end
+
 @testset "Trivia attachment" begin
     # TODO: Need to expand this greatly to cover as many forms as possible!
     @test show_green_tree("f(a;b)") == """

test/parser_api.jl

Lines changed: 20 additions & 1 deletion
@@ -170,14 +170,25 @@ end
 end
 end

-tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
+tokensplit(str; kws...) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str; kws...)]

 @testset "tokenize() API" begin
     # tokenize() is eager
     @test tokenize("aba") isa Vector{JuliaSyntax.Token}

     # . is a separate token from + in `.+`
     @test tokensplit("a .+ β") == [
+        K"Identifier" => "a",
+        K"Whitespace" => " ",
+        K"." => ".",
+        K"Identifier" => "+",
+        K"Whitespace" => " ",
+        K"Identifier" => "β",
+    ]
+
+    # + is kind K"+" when operators in identifier position are emitted as
+    # operator kinds.
+    @test tokensplit("a .+ β"; operators_as_identifiers=false) == [
         K"Identifier" => "a",
         K"Whitespace" => " ",
         K"." => ".",
@@ -194,6 +205,14 @@ tokensplit(str) = [kind(tok) => untokenize(tok, str) for tok in tokenize(str)]
         K"Whitespace" => " ",
         K"Integer" => "1",
     ]
+    # Including word operators
+    @test tokensplit("where = 1"; operators_as_identifiers=false) == [
+        K"Identifier" => "where",
+        K"Whitespace" => " ",
+        K"=" => "=",
+        K"Whitespace" => " ",
+        K"Integer" => "1",
+    ]

     # A predicate based on flags()
     @test JuliaSyntax.is_suffixed(tokenize("+₁")[1])
