
Commit 4132d75

Split char delimiters early and emit K"char" node (#121)
Here we split off the char delimiters in the tokenizer rather than re-parsing them later during value conversion, and add a K"char" internal node to cover the delimiters and the literal char content in the green tree. This allows us to remove another special-case token error kind (ErrorEofChar) and makes the representation of chars in the tree similar to that of strings.
1 parent 7b23532 commit 4132d75
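
As a quick orientation (a hedged sketch based on the tests added below, not part of the diff itself): a char literal is now three tokens — an opening `'`, the `Char` payload, and a closing `'` — which the parser wraps in a `K"char"` node, while `Expr` conversion still yields a plain `Char`. The unqualified `tokenize`, `kind`, `@K_str` and `parse(Expr, ...)` calls assume the same imports the test suite uses.

```julia
# Illustrative only; mirrors the new cases in test/tokenize.jl and test/expr.jl.
kind.(collect(tokenize("'a'"))) == [K"'", K"Char", K"'", K"EndMarker"]  # delimiters split early
parse(Expr, "'a'") == 'a'           # Expr conversion unwraps the K"char" node
parse(Expr, "'\\xce\\xb1'") == 'α'  # escapes still resolved during value conversion
```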

File tree: 10 files changed (+105, -61 lines)

README.md

Lines changed: 3 additions & 0 deletions
@@ -741,6 +741,9 @@ parsing `key=val` pairs inside parentheses.
 :([(x, y) for $(Expr(:filter, :(y < x), :(x = 1:10), :(y = 1:10)))])
 ```

+* The character `'` may be written without escaping as `'''` rather than
+  requiring the form `'\''`.
+
 # Comparisons to other packages

 ### Official Julia compiler
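
To make the README note concrete, a hedged example (assuming `parse(Expr, ...)` as used in test/expr.jl; the diff itself only asserts the `'''` token round-trip, not the value):

```julia
# Both spellings should evaluate to the single-quote character.
parse(Expr, "'''") == '\''    # unescaped spelling enabled by the early delimiter split
parse(Expr, "'\\''") == '\''  # traditional escaped spelling
```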

src/expr.jl

Lines changed: 3 additions & 0 deletions
@@ -239,6 +239,9 @@ function _to_expr(node::SyntaxNode; iteration_spec=false, need_linenodes=true,
     elseif headsym == :do
         @check length(args) == 3
         return Expr(:do, args[1], Expr(:->, args[2], args[3]))
+    elseif headsym == :char
+        @check length(args) == 1
+        return args[1]
     end
     return Expr(headsym, args...)
 end
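
In other words, the new `:char` branch unwraps the node's single child, so the `Expr` form carries the literal `Char` value directly rather than an `Expr(:char, ...)` wrapper. A hedged illustration, mirroring test/expr.jl:

```julia
# Downstream consumers see the same Expr shape as with the reference parser:
parse(Expr, "'α'") == 'α'   # a Char value, not Expr(:char, 'α')
```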

src/hooks.jl

Lines changed: 2 additions & 3 deletions
@@ -42,9 +42,6 @@ function _incomplete_tag(n::SyntaxNode)
     k1 = kind(cs[1])
     if k1 == K"ErrorEofMultiComment"
         return :comment
-    elseif k1 == K"ErrorEofChar"
-        # TODO: Make this case into an internal node
-        return :char
     end
     for cc in cs
         if kind(cc) == K"error"
@@ -57,6 +54,8 @@ function _incomplete_tag(n::SyntaxNode)
                 return :string
             elseif kp == K"cmdstring"
                 return :cmd
+            elseif kp == K"char"
+                return :char
             elseif kp in KSet"block quote let try"
                 return :block
             elseif kp in KSet"for while function if"

src/kinds.jl

Lines changed: 1 addition & 2 deletions
@@ -15,7 +15,6 @@ const _kind_names =
     "BEGIN_ERRORS"
         # Tokenization errors
         "ErrorEofMultiComment"
-        "ErrorEofChar"
         "ErrorInvalidNumericConstant"
         "ErrorInvalidOperator"
         "ErrorInvalidInterpolationTerminator"
@@ -874,6 +873,7 @@ const _kind_names =
        "inert"      # QuoteNode; not quasiquote
        "string"     # A string interior node (possibly containing interpolations)
        "cmdstring"  # A cmd string node (containing delimiters plus string)
+       "char"       # A char string node (containing delims + char data)
        "macrocall"
        "parameters" # the list after ; in f(; a=1)
        "toplevel"
@@ -1004,7 +1004,6 @@ const _nonunique_kind_names = Set([
    K"Identifier"

    K"ErrorEofMultiComment"
-   K"ErrorEofChar"
    K"ErrorInvalidNumericConstant"
    K"ErrorInvalidOperator"
    K"ErrorInvalidInterpolationTerminator"

src/parser.jl

Lines changed: 29 additions & 4 deletions
@@ -1561,7 +1561,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
                 emit(ps, mark, K".")
                 this_iter_valid_macroname = true
             end
-        elseif k == K"'"
+        elseif k == K"'" && !preceding_whitespace(t)
             if !is_suffixed(t)
                 # f' ==> (' f)
                 bump(ps, TRIVIA_FLAG)
@@ -3148,7 +3148,7 @@ function parse_string(ps::ParseState, raw::Bool)
     else
         # Missing delimiter recovery
         # "str ==> (string "str" (error-t))
-        bump_invisible(ps, K"error", TRIVIA_FLAG, error="Unterminated string literal")
+        bump_invisible(ps, K"error", TRIVIA_FLAG, error="unterminated string literal")
     end
     # String interpolations
     # "$x$y$z" ==> (string x y z)
@@ -3197,7 +3197,32 @@ function parse_atom(ps::ParseState, check_identifiers=true)
     mark = position(ps)
     leading_kind = peek(ps)
     # todo: Reorder to put most likely tokens first?
-    if leading_kind == K":"
+    if leading_kind == K"'"
+        # char literal
+        bump(ps, TRIVIA_FLAG)
+        k = peek(ps)
+        if k == K"Char"
+            bump(ps)
+            if peek(ps) == K"'"
+                # 'a' ==> (char 'a')
+                # 'α' ==> (char 'α')
+                # '\xce\xb1' ==> (char 'α')
+                bump(ps, TRIVIA_FLAG)
+            else
+                # 'a ==> (char 'a' (error-t))
+                bump_invisible(ps, K"error", TRIVIA_FLAG,
+                               error="unterminated character literal")
+            end
+        elseif k == K"'"
+            # '' ==> (char (error))
+            bump_invisible(ps, K"error", error="empty character literal")
+        else
+            # ' ==> (char (error))
+            @check k == K"EndMarker"
+            bump_invisible(ps, K"error", error="unterminated character literal")
+        end
+        emit(ps, mark, K"char")
+    elseif leading_kind == K":"
         # symbol/expression quote
         # :foo ==> (quote foo)
         t = peek_token(ps, 2)
@@ -3275,7 +3300,7 @@ function parse_atom(ps::ParseState, check_identifiers=true)
             bump(ps, TRIVIA_FLAG)
         else
             bump_invisible(ps, K"error", TRIVIA_FLAG,
-                           error="Unterminated string literal")
+                           error="unterminated string literal")
         end
         t = peek_token(ps)
         k = kind(t)
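
The comments inside the new `parse_atom` branch double as test cases; collected as input => expected S-expression pairs (same notation as the cases added to test/parser.jl), they read:

```julia
# Tree shapes produced by the new char branch of parse_atom
# (input => expected S-expression, as asserted in test/parser.jl):
char_atom_cases = [
    "'a'"          => "(char 'a')",
    "'α'"          => "(char 'α')",
    "'\\xce\\xb1'" => "(char 'α')",            # raw byte escapes form a single Char
    "'a"           => "(char 'a' (error-t))",  # missing closing quote, recovered with error trivia
    "''"           => "(char (error))",        # empty character literal
    "'"            => "(char (error))",        # bare quote at end of input
]
```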

src/syntax_tree.jl

Lines changed: 2 additions & 2 deletions
@@ -37,10 +37,10 @@ function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, position::In
             false
         elseif k == K"Char"
             v, err, _ = unescape_julia_string(val_str, false, false)
-            if err
+            if err || length(v) != 1
                 ErrorVal()
             else
-                v[2]
+                only(v)
             end
         elseif k == K"Identifier"
             if has_flags(head(raw), RAW_STRING_FLAG)
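
Why `only(v)` plus the length check: the `K"Char"` token now covers just the content between the delimiters, so after unescaping exactly one character is expected; anything else becomes an `ErrorVal`. A minimal standalone sketch of that logic, using `Base.unescape_string` in place of the package's `unescape_julia_string` and `nothing` in place of `ErrorVal()`:

```julia
# Standalone approximation of the value conversion for K"Char" tokens; not the
# package implementation.
function char_value(val_str::AbstractString)
    v = unescape_string(val_str)                 # stand-in for unescape_julia_string
    return length(v) == 1 ? only(v) : nothing    # package returns ErrorVal() here
end

char_value("a")           # 'a'
char_value("\\xce\\xb1")  # 'α' (two raw bytes forming one UTF-8 character)
char_value("ab")          # nothing — more than one character is an error
```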

src/tokenize.jl

Lines changed: 26 additions & 38 deletions
@@ -15,7 +15,6 @@ include("tokenize_utils.jl")
 # Error kind => description
 TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
     K"ErrorEofMultiComment" => "unterminated multi-line comment #= ... =#",
-    K"ErrorEofChar" => "unterminated character literal",
     K"ErrorInvalidNumericConstant" => "invalid numeric constant",
     K"ErrorInvalidOperator" => "invalid operator",
     K"ErrorInvalidInterpolationTerminator" => "interpolated variable ends with invalid character; use `\$(...)` instead",
@@ -263,9 +262,9 @@ end

Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
"""
-function emit(l::Lexer, kind::Kind)
+function emit(l::Lexer, kind::Kind, maybe_op=true)
     suffix = false
-    if optakessuffix(kind)
+    if optakessuffix(kind) && maybe_op
         while isopsuffix(peekchar(l))
             readchar(l)
             suffix = true
@@ -448,6 +447,11 @@ function lex_string_chunk(l)
         end
         return emit(l, K"Whitespace")
     elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
+        if state.delim == '\'' && l.last_token == K"'" && dpeekchar(l)[2] == '\''
+            # Handle '''
+            readchar(l)
+            return emit(l, K"Char")
+        end
         # Terminate string
         pop!(l.string_states)
         readchar(l)
@@ -456,7 +460,8 @@ function lex_string_chunk(l)
             return emit(l, state.delim == '"' ?
                         K"\"\"\"" : K"```")
         else
-            return emit(l, state.delim == '"' ? K"\"" : K"`")
+            return emit(l, state.delim == '"' ? K"\"" :
+                        state.delim == '`' ? K"`" : K"'", false)
         end
     end
     # Read a chunk of string characters
@@ -516,7 +521,8 @@ function lex_string_chunk(l)
             end
         end
     end
-    return emit(l, state.delim == '"' ? K"String" : K"CmdString")
+    return emit(l, state.delim == '"' ? K"String" :
+                state.delim == '`' ? K"CmdString" : K"Char")
 end

 # Lex whitespace, a whitespace char `c` has been consumed
@@ -859,41 +865,23 @@ function lex_digit(l::Lexer, kind)
     return emit(l, kind)
 end

-function lex_prime(l, doemit = true)
-    if l.last_token == K"Identifier" ||
-        is_contextual_keyword(l.last_token) ||
-        is_word_operator(l.last_token) ||
-        l.last_token == K"." ||
-        l.last_token == K")" ||
-        l.last_token == K"]" ||
-        l.last_token == K"}" ||
-        l.last_token == K"'" ||
-        l.last_token == K"end" || is_literal(l.last_token)
+function lex_prime(l)
+    if l.last_token == K"Identifier" ||
+       is_contextual_keyword(l.last_token) ||
+       is_word_operator(l.last_token) ||
+       l.last_token == K"." ||
+       l.last_token == K")" ||
+       l.last_token == K"]" ||
+       l.last_token == K"}" ||
+       l.last_token == K"'" ||
+       l.last_token == K"end" ||
+       is_literal(l.last_token)
+        # FIXME ^ This doesn't cover all cases - probably needs involvement
+        # from the parser state.
         return emit(l, K"'")
     else
-        if accept(l, '\'')
-            if accept(l, '\'')
-                return doemit ? emit(l, K"Char") : EMPTY_TOKEN
-            else
-                # Empty char literal
-                # Arguably this should be an error here, but we generally
-                # look at the contents of the char literal in the parser,
-                # so we defer erroring until there.
-                return doemit ? emit(l, K"Char") : EMPTY_TOKEN
-            end
-        end
-        while true
-            c = readchar(l)
-            if c == EOF_CHAR
-                return doemit ? emit_error(l, K"ErrorEofChar") : EMPTY_TOKEN
-            elseif c == '\\'
-                if readchar(l) == EOF_CHAR
-                    return doemit ? emit_error(l, K"ErrorEofChar") : EMPTY_TOKEN
-                end
-            elseif c == '\''
-                return doemit ? emit(l, K"Char") : EMPTY_TOKEN
-            end
-        end
+        push!(l.string_states, StringState(false, true, '\'', 0))
+        return emit(l, K"'", false)
     end
 end
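
The net effect on the token stream, in hedged form (these mirror the assertions added to test/tokenize.jl; `tokenize`, `kind` and `untokenize` are assumed to be in scope as in that test file):

```julia
# A char literal is now delimiters plus payload rather than one opaque K"Char" token:
kind.(collect(tokenize("'a'"))) == [K"'", K"Char", K"'", K"EndMarker"]
# ''' lexes as the quote character between delimiters (handled in lex_string_chunk):
kind.(collect(tokenize("'''"))) == [K"'", K"Char", K"'", K"EndMarker"]
# An empty literal has no Char token at all; the parser reports the error:
kind.(collect(tokenize("''"))) == [K"'", K"'", K"EndMarker"]
# As adjoint, ' is still an operator and may take suffix characters:
untokenize.(collect(tokenize("a'ᵀ")), "a'ᵀ") == ["a", "'ᵀ", ""]
```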

test/expr.jl

Lines changed: 8 additions & 0 deletions
@@ -157,6 +157,14 @@
              Expr(:string, "a\n", :x, "\nb\nc")
     end

+    @testset "Char conversions" begin
+        @test parse(Expr, "'a'") == 'a'
+        @test parse(Expr, "'α'") == 'α'
+        @test parse(Expr, "'\\xce\\xb1'") == 'α'
+        # FIXME
+        # @test_throws ParseError parse(Expr, "'abcde'")
+    end
+
     @testset "do block conversion" begin
         @test parse(Expr, "f(x) do y\n body end") ==
             Expr(:do, Expr(:call, :f, :x),

test/parser.jl

Lines changed: 8 additions & 0 deletions
@@ -567,6 +567,14 @@ tests = [
         "(1:2)" => "(call-i 1 : 2)"
     ],
     JuliaSyntax.parse_atom => [
+        # char literal
+        "'a'" => "(char 'a')"
+        "'α'" => "(char 'α')"
+        "'\\xce\\xb1'" => "(char 'α')"
+        "'a" => "(char 'a' (error-t))"
+        "''" => "(char (error))"
+        "'" => "(char (error))"
         # symbol/expression quote
         ":foo" => "(quote foo)"
         # Literal colons
         ":)" => ":"

test/tokenize.jl

Lines changed: 23 additions & 12 deletions
@@ -109,7 +109,7 @@ end # testset
            K"NewlineWs",K"[",K"Integer",K"*",K"Integer",K",",K"Integer",
            K";",K"Integer",K",",K"Integer",K"]",

-           K"NewlineWs",K"\"",K"String",K"\"",K";",K"Whitespace",K"Char",
+           K"NewlineWs",K"\"",K"String",K"\"",K";",K"Whitespace",K"'",K"Char",K"'",

            K"NewlineWs",K"(",K"Identifier",K"&&",K"Identifier",K")",K"||",
            K"(",K"Identifier",K"||",K"Identifier",K")",
@@ -130,7 +130,7 @@ end # testset

            K"NewlineWs",K"{",K"}",

-           K"NewlineWs",K"ErrorEofChar",K"EndMarker"]
+           K"NewlineWs",K"'",K"Char",K"EndMarker"]

    for (i, n) in enumerate(tokenize(str))
        @test kind(n) == kinds[i]
@@ -190,6 +190,8 @@ function test_roundtrip(str, kind, val)
    @test untokenize(t, str) == val
 end

+roundtrip(str) = join(untokenize.(collect(tokenize(str)), str))
+
 @testset "tokenizing juxtaposed numbers and dotted operators/identifiers" begin
    test_roundtrip("1234 .+1", K"Integer", "1234")
    test_roundtrip("1234.0+1", K"Float", "1234.0")
@@ -228,22 +230,32 @@ end
        D = ImageMagick.load(fn)
        """
    tokens = collect(tokenize(str))
-    @test string(untokenize(tokens[16], str))==string(untokenize(tokens[17], str))=="'"
+    @test string(untokenize(tokens[16], str)) == string(untokenize(tokens[17], str))=="'"
+
+    @test roundtrip("'a'") == "'a'"
+    @test kind.(collect(tokenize("'a'"))) == [K"'", K"Char", K"'", K"EndMarker"]
+
+    # ' is not an operator here, so doesn't consume the suffix ᵀ
+    @test roundtrip("'ᵀ'") == "'ᵀ'"
+    @test kind.(collect(tokenize("'₁'"))) == [K"'", K"Char", K"'", K"EndMarker"]
+
+    @test roundtrip("''") == "''"
+    @test kind.(collect(tokenize("''"))) == [K"'", K"'", K"EndMarker"]
+
+    @test roundtrip("'''") == "'''"
+    @test kind.(collect(tokenize("'''"))) == [K"'", K"Char", K"'", K"EndMarker"]

-    test_roundtrip("'a'", K"Char", "'a'")
-    test_roundtrip("''", K"Char", "''")
-    test_roundtrip("'''", K"Char", "'''")
-    test_roundtrip("''''", K"Char", "'''")
+    @test roundtrip("''''") == "''''"
+    @test kind.(collect(tokenize("''''"))) == [K"'", K"Char", K"'", K"'", K"EndMarker"]

-    @test tok("''''", 1).kind == K"Char"
-    @test tok("''''", 2).kind == K"'"
    @test tok("()'", 3).kind == K"'"
    @test tok("{}'", 3).kind == K"'"
    @test tok("[]'", 3).kind == K"'"
    @test tok("outer'", 2).kind == K"'"
    @test tok("mutable'", 2).kind == K"'"
    @test tok("as'", 2).kind == K"'"
    @test tok("isa'", 2).kind == K"'"
+    @test untokenize.(collect(tokenize("a'ᵀ")), "a'ᵀ") == ["a", "'ᵀ", ""]
 end

 @testset "keywords" begin
@@ -293,9 +305,8 @@ end
 end

 @testset "errors" begin
-    @test tok("#= #= =#", 1).kind == K"ErrorEofMultiComment"
-    @test tok("'dsadsa", 1).kind == K"ErrorEofChar"
-    @test tok("aa **", 3).kind == K"ErrorInvalidOperator"
+    @test tok("#= #= =#", 1).kind == K"ErrorEofMultiComment"
+    @test tok("aa **", 3).kind == K"ErrorInvalidOperator"
 end

 @testset "xor_eq" begin
