
Commit 4132d75

Split char delimiters early and emit K"char" node (#121)
Here we split off the char delimiters in the tokenizer rather than re-parsing them later during value conversion, and add a K"char" internal node to cover the delimiters and the literal char content in the green tree. This allows us to remove another special-case token error kind (ErrorEofChar) and makes the representation of chars in the tree similar to that of strings.
1 parent 7b23532 commit 4132d75
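
As a quick orientation (a hedged sketch based on the tests added below, not part of the diff itself): a char literal is now three tokens — an opening `'`, the `Char` payload, and a closing `'` — which the parser wraps in a `K"char"` node, while `Expr` conversion still yields a plain `Char`. The unqualified `tokenize`, `kind`, `@K_str` and `parse(Expr, ...)` calls assume the same imports the test suite uses.

```julia
# Illustrative only; mirrors the new cases in test/tokenize.jl and test/expr.jl.
kind.(collect(tokenize("'a'"))) == [K"'", K"Char", K"'", K"EndMarker"]  # delimiters split early
parse(Expr, "'a'") == 'a'           # Expr conversion unwraps the K"char" node
parse(Expr, "'\\xce\\xb1'") == 'α'  # escapes still resolved during value conversion
```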

File tree: 10 files changed (+105, -61 lines)

README.md

Lines changed: 3 additions & 0 deletions
@@ -741,6 +741,9 @@ parsing `key=val` pairs inside parentheses.
 :([(x, y) for $(Expr(:filter, :(y < x), :(x = 1:10), :(y = 1:10)))])
 ```

+* The character `'` may be written without escaping as `'''` rather than
+  requiring the form `'\''`.
+
 # Comparisons to other packages

 ### Official Julia compiler
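
To make the README note concrete, a hedged example (assuming `parse(Expr, ...)` as used in test/expr.jl; the diff itself only asserts the `'''` token round-trip, not the value):

```julia
# Both spellings should evaluate to the single-quote character.
parse(Expr, "'''") == '\''    # unescaped spelling enabled by the early delimiter split
parse(Expr, "'\\''") == '\''  # traditional escaped spelling
```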

src/expr.jl

Lines changed: 3 additions & 0 deletions
@@ -239,6 +239,9 @@ function _to_expr(node::SyntaxNode; iteration_spec=false, need_linenodes=true,
     elseif headsym == :do
         @check length(args) == 3
         return Expr(:do, args[1], Expr(:->, args[2], args[3]))
+    elseif headsym == :char
+        @check length(args) == 1
+        return args[1]
     end
     return Expr(headsym, args...)
 end
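
In other words, the new `:char` branch unwraps the node's single child, so the `Expr` form carries the literal `Char` value directly rather than an `Expr(:char, ...)` wrapper. A hedged illustration, mirroring test/expr.jl:

```julia
# Downstream consumers see the same Expr shape as with the reference parser:
parse(Expr, "'α'") == 'α'   # a Char value, not Expr(:char, 'α')
```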

src/hooks.jl

Lines changed: 2 additions & 3 deletions
@@ -42,9 +42,6 @@ function _incomplete_tag(n::SyntaxNode)
     k1 = kind(cs[1])
     if k1 == K"ErrorEofMultiComment"
         return :comment
-    elseif k1 == K"ErrorEofChar"
-        # TODO: Make this case into an internal node
-        return :char
     end
     for cc in cs
         if kind(cc) == K"error"
@@ -57,6 +54,8 @@ function _incomplete_tag(n::SyntaxNode)
                 return :string
             elseif kp == K"cmdstring"
                 return :cmd
+            elseif kp == K"char"
+                return :char
             elseif kp in KSet"block quote let try"
                 return :block
             elseif kp in KSet"for while function if"

src/kinds.jl

Lines changed: 1 addition & 2 deletions
@@ -15,7 +15,6 @@ const _kind_names =
     "BEGIN_ERRORS"
         # Tokenization errors
         "ErrorEofMultiComment"
-        "ErrorEofChar"
         "ErrorInvalidNumericConstant"
         "ErrorInvalidOperator"
         "ErrorInvalidInterpolationTerminator"
@@ -874,6 +873,7 @@ const _kind_names =
        "inert"      # QuoteNode; not quasiquote
        "string"     # A string interior node (possibly containing interpolations)
        "cmdstring"  # A cmd string node (containing delimiters plus string)
+       "char"       # A char string node (containing delims + char data)
        "macrocall"
        "parameters" # the list after ; in f(; a=1)
        "toplevel"
@@ -1004,7 +1004,6 @@ const _nonunique_kind_names = Set([
    K"Identifier"

    K"ErrorEofMultiComment"
-   K"ErrorEofChar"
    K"ErrorInvalidNumericConstant"
    K"ErrorInvalidOperator"
    K"ErrorInvalidInterpolationTerminator"

src/parser.jl

Lines changed: 29 additions & 4 deletions
@@ -1561,7 +1561,7 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false)
                 emit(ps, mark, K".")
                 this_iter_valid_macroname = true
             end
-        elseif k == K"'"
+        elseif k == K"'" && !preceding_whitespace(t)
             if !is_suffixed(t)
                 # f' ==> (' f)
                 bump(ps, TRIVIA_FLAG)
@@ -3148,7 +3148,7 @@ function parse_string(ps::ParseState, raw::Bool)
     else
         # Missing delimiter recovery
         # "str ==> (string "str" (error-t))
-        bump_invisible(ps, K"error", TRIVIA_FLAG, error="Unterminated string literal")
+        bump_invisible(ps, K"error", TRIVIA_FLAG, error="unterminated string literal")
     end
     # String interpolations
     # "$x$y$z" ==> (string x y z)
@@ -3197,7 +3197,32 @@ function parse_atom(ps::ParseState, check_identifiers=true)
     mark = position(ps)
     leading_kind = peek(ps)
     # todo: Reorder to put most likely tokens first?
-    if leading_kind == K":"
+    if leading_kind == K"'"
+        # char literal
+        bump(ps, TRIVIA_FLAG)
+        k = peek(ps)
+        if k == K"Char"
+            bump(ps)
+            if peek(ps) == K"'"
+                # 'a' ==> (char 'a')
+                # 'α' ==> (char 'α')
+                # '\xce\xb1' ==> (char 'α')
+                bump(ps, TRIVIA_FLAG)
+            else
+                # 'a ==> (char 'a' (error-t))
+                bump_invisible(ps, K"error", TRIVIA_FLAG,
+                               error="unterminated character literal")
+            end
+        elseif k == K"'"
+            # '' ==> (char (error))
+            bump_invisible(ps, K"error", error="empty character literal")
+        else
+            # ' ==> (char (error))
+            @check k == K"EndMarker"
+            bump_invisible(ps, K"error", error="unterminated character literal")
+        end
+        emit(ps, mark, K"char")
+    elseif leading_kind == K":"
         # symbol/expression quote
         # :foo ==> (quote foo)
         t = peek_token(ps, 2)
@@ -3275,7 +3300,7 @@ function parse_atom(ps::ParseState, check_identifiers=true)
             bump(ps, TRIVIA_FLAG)
         else
             bump_invisible(ps, K"error", TRIVIA_FLAG,
-                           error="Unterminated string literal")
+                           error="unterminated string literal")
         end
         t = peek_token(ps)
         k = kind(t)
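
The comments inside the new `parse_atom` branch double as test cases; collected as input => expected S-expression pairs (same notation as the cases added to test/parser.jl), they read:

```julia
# Tree shapes produced by the new char branch of parse_atom
# (input => expected S-expression, as asserted in test/parser.jl):
char_atom_cases = [
    "'a'"          => "(char 'a')",
    "'α'"          => "(char 'α')",
    "'\\xce\\xb1'" => "(char 'α')",            # raw byte escapes form a single Char
    "'a"           => "(char 'a' (error-t))",  # missing closing quote, recovered with error trivia
    "''"           => "(char (error))",        # empty character literal
    "'"            => "(char (error))",        # bare quote at end of input
]
```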

src/syntax_tree.jl

Lines changed: 2 additions & 2 deletions
@@ -37,10 +37,10 @@ function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, position::In
             false
         elseif k == K"Char"
             v, err, _ = unescape_julia_string(val_str, false, false)
-            if err
+            if err || length(v) != 1
                 ErrorVal()
             else
-                v[2]
+                only(v)
             end
         elseif k == K"Identifier"
             if has_flags(head(raw), RAW_STRING_FLAG)
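
Why `only(v)` plus the length check: the `K"Char"` token now covers just the content between the delimiters, so after unescaping exactly one character is expected; anything else becomes an `ErrorVal`. A minimal standalone sketch of that logic, using `Base.unescape_string` in place of the package's `unescape_julia_string` and `nothing` in place of `ErrorVal()`:

```julia
# Standalone approximation of the value conversion for K"Char" tokens; not the
# package implementation.
function char_value(val_str::AbstractString)
    v = unescape_string(val_str)                 # stand-in for unescape_julia_string
    return length(v) == 1 ? only(v) : nothing    # package returns ErrorVal() here
end

char_value("a")           # 'a'
char_value("\\xce\\xb1")  # 'α' (two raw bytes forming one UTF-8 character)
char_value("ab")          # nothing — more than one character is an error
```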

src/tokenize.jl

Lines changed: 26 additions & 38 deletions
@@ -15,7 +15,6 @@ include("tokenize_utils.jl")
 # Error kind => description
 TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
     K"ErrorEofMultiComment" => "unterminated multi-line comment #= ... =#",
-    K"ErrorEofChar" => "unterminated character literal",
     K"ErrorInvalidNumericConstant" => "invalid numeric constant",
     K"ErrorInvalidOperator" => "invalid operator",
     K"ErrorInvalidInterpolationTerminator" => "interpolated variable ends with invalid character; use `\$(...)` instead",
@@ -263,9 +262,9 @@ end

Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
"""
-function emit(l::Lexer, kind::Kind)
+function emit(l::Lexer, kind::Kind, maybe_op=true)
     suffix = false
-    if optakessuffix(kind)
+    if optakessuffix(kind) && maybe_op
         while isopsuffix(peekchar(l))
             readchar(l)
             suffix = true
@@ -448,6 +447,11 @@ function lex_string_chunk(l)
         end
         return emit(l, K"Whitespace")
     elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
+        if state.delim == '\'' && l.last_token == K"'" && dpeekchar(l)[2] == '\''
+            # Handle '''
+            readchar(l)
+            return emit(l, K"Char")
+        end
         # Terminate string
         pop!(l.string_states)
         readchar(l)
@@ -456,7 +460,8 @@ function lex_string_chunk(l)
             return emit(l, state.delim == '"' ?
                         K"\"\"\"" : K"```")
         else
-            return emit(l, state.delim == '"' ? K"\"" : K"`")
+            return emit(l, state.delim == '"' ? K"\"" :
+                        state.delim == '`' ? K"`" : K"'", false)
         end
     end
     # Read a chunk of string characters
@@ -516,7 +521,8 @@ function lex_string_chunk(l)
             end
         end
     end
-    return emit(l, state.delim == '"' ? K"String" : K"CmdString")
+    return emit(l, state.delim == '"' ? K"String" :
+                state.delim == '`' ? K"CmdString" : K"Char")
 end

 # Lex whitespace, a whitespace char `c` has been consumed
@@ -859,41 +865,23 @@ function lex_digit(l::Lexer, kind)
     return emit(l, kind)
 end

-function lex_prime(l, doemit = true)
-    if l.last_token == K"Identifier" ||
-        is_contextual_keyword(l.last_token) ||
-        is_word_operator(l.last_token) ||
-        l.last_token == K"." ||
-        l.last_token == K")" ||
-        l.last_token == K"]" ||
-        l.last_token == K"}" ||
-        l.last_token == K"'" ||
-        l.last_token == K"end" || is_literal(l.last_token)
+function lex_prime(l)
+    if l.last_token == K"Identifier" ||
+       is_contextual_keyword(l.last_token) ||
+       is_word_operator(l.last_token) ||
+       l.last_token == K"." ||
+       l.last_token == K")" ||
+       l.last_token == K"]" ||
+       l.last_token == K"}" ||
+       l.last_token == K"'" ||
+       l.last_token == K"end" ||
+       is_literal(l.last_token)
+        # FIXME ^ This doesn't cover all cases - probably needs involvement
+        # from the parser state.
         return emit(l, K"'")
     else
-        if accept(l, '\'')
-            if accept(l, '\'')
-                return doemit ? emit(l, K"Char") : EMPTY_TOKEN
-            else
-                # Empty char literal
-                # Arguably this should be an error here, but we generally
-                # look at the contents of the char literal in the parser,
-                # so we defer erroring until there.
-                return doemit ? emit(l, K"Char") : EMPTY_TOKEN
-            end
-        end
-        while true
-            c = readchar(l)
-            if c == EOF_CHAR
-                return doemit ? emit_error(l, K"ErrorEofChar") : EMPTY_TOKEN
-            elseif c == '\\'
-                if readchar(l) == EOF_CHAR
-                    return doemit ? emit_error(l, K"ErrorEofChar") : EMPTY_TOKEN
-                end
-            elseif c == '\''
-                return doemit ? emit(l, K"Char") : EMPTY_TOKEN
-            end
-        end
+        push!(l.string_states, StringState(false, true, '\'', 0))
+        return emit(l, K"'", false)
     end
 end
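
The net effect on the token stream, in hedged form (these mirror the assertions added to test/tokenize.jl; `tokenize`, `kind` and `untokenize` are assumed to be in scope as in that test file):

```julia
# A char literal is now delimiters plus payload rather than one opaque K"Char" token:
kind.(collect(tokenize("'a'"))) == [K"'", K"Char", K"'", K"EndMarker"]
# ''' lexes as the quote character between delimiters (handled in lex_string_chunk):
kind.(collect(tokenize("'''"))) == [K"'", K"Char", K"'", K"EndMarker"]
# An empty literal has no Char token at all; the parser reports the error:
kind.(collect(tokenize("''"))) == [K"'", K"'", K"EndMarker"]
# As adjoint, ' is still an operator and may take suffix characters:
untokenize.(collect(tokenize("a'ᵀ")), "a'ᵀ") == ["a", "'ᵀ", ""]
```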

test/expr.jl

Lines changed: 8 additions & 0 deletions
@@ -157,6 +157,14 @@
              Expr(:string, "a\n", :x, "\nb\nc")
     end

+    @testset "Char conversions" begin
+        @test parse(Expr, "'a'") == 'a'
+        @test parse(Expr, "'α'") == 'α'
+        @test parse(Expr, "'\\xce\\xb1'") == 'α'
+        # FIXME
+        # @test_throws ParseError parse(Expr, "'abcde'")
+    end
+
     @testset "do block conversion" begin
         @test parse(Expr, "f(x) do y\n body end") ==
             Expr(:do, Expr(:call, :f, :x),

test/parser.jl

Lines changed: 8 additions & 0 deletions
@@ -567,6 +567,14 @@ tests = [
         "(1:2)" => "(call-i 1 : 2)"
     ],
     JuliaSyntax.parse_atom => [
+        # char literal
+        "'a'" => "(char 'a')"
+        "'α'" => "(char 'α')"
+        "'\\xce\\xb1'" => "(char 'α')"
+        "'a" => "(char 'a' (error-t))"
+        "''" => "(char (error))"
+        "'" => "(char (error))"
         # symbol/expression quote
         ":foo" => "(quote foo)"
         # Literal colons
         ":)" => ":"

test/tokenize.jl

Lines changed: 23 additions & 12 deletions
@@ -109,7 +109,7 @@ end # testset
            K"NewlineWs",K"[",K"Integer",K"*",K"Integer",K",",K"Integer",
            K";",K"Integer",K",",K"Integer",K"]",

-           K"NewlineWs",K"\"",K"String",K"\"",K";",K"Whitespace",K"Char",
+           K"NewlineWs",K"\"",K"String",K"\"",K";",K"Whitespace",K"'",K"Char",K"'",

            K"NewlineWs",K"(",K"Identifier",K"&&",K"Identifier",K")",K"||",
            K"(",K"Identifier",K"||",K"Identifier",K")",
@@ -130,7 +130,7 @@ end # testset

            K"NewlineWs",K"{",K"}",

-           K"NewlineWs",K"ErrorEofChar",K"EndMarker"]
+           K"NewlineWs",K"'",K"Char",K"EndMarker"]

    for (i, n) in enumerate(tokenize(str))
        @test kind(n) == kinds[i]
@@ -190,6 +190,8 @@ function test_roundtrip(str, kind, val)
    @test untokenize(t, str) == val
 end

+roundtrip(str) = join(untokenize.(collect(tokenize(str)), str))
+
 @testset "tokenizing juxtaposed numbers and dotted operators/identifiers" begin
    test_roundtrip("1234 .+1", K"Integer", "1234")
    test_roundtrip("1234.0+1", K"Float", "1234.0")
@@ -228,22 +230,32 @@ end
        D = ImageMagick.load(fn)
        """
    tokens = collect(tokenize(str))
-    @test string(untokenize(tokens[16], str))==string(untokenize(tokens[17], str))=="'"
+    @test string(untokenize(tokens[16], str)) == string(untokenize(tokens[17], str))=="'"
+
+    @test roundtrip("'a'") == "'a'"
+    @test kind.(collect(tokenize("'a'"))) == [K"'", K"Char", K"'", K"EndMarker"]
+
+    # ' is not an operator here, so doesn't consume the suffix ᵀ
+    @test roundtrip("'ᵀ'") == "'ᵀ'"
+    @test kind.(collect(tokenize("'₁'"))) == [K"'", K"Char", K"'", K"EndMarker"]
+
+    @test roundtrip("''") == "''"
+    @test kind.(collect(tokenize("''"))) == [K"'", K"'", K"EndMarker"]
+
+    @test roundtrip("'''") == "'''"
+    @test kind.(collect(tokenize("'''"))) == [K"'", K"Char", K"'", K"EndMarker"]

-    test_roundtrip("'a'", K"Char", "'a'")
-    test_roundtrip("''", K"Char", "''")
-    test_roundtrip("'''", K"Char", "'''")
-    test_roundtrip("''''", K"Char", "'''")
+    @test roundtrip("''''") == "''''"
+    @test kind.(collect(tokenize("''''"))) == [K"'", K"Char", K"'", K"'", K"EndMarker"]

-    @test tok("''''", 1).kind == K"Char"
-    @test tok("''''", 2).kind == K"'"
    @test tok("()'", 3).kind == K"'"
    @test tok("{}'", 3).kind == K"'"
    @test tok("[]'", 3).kind == K"'"
    @test tok("outer'", 2).kind == K"'"
    @test tok("mutable'", 2).kind == K"'"
    @test tok("as'", 2).kind == K"'"
    @test tok("isa'", 2).kind == K"'"
+    @test untokenize.(collect(tokenize("a'ᵀ")), "a'ᵀ") == ["a", "'ᵀ", ""]
 end

 @testset "keywords" begin
@@ -293,9 +305,8 @@ end
 end

 @testset "errors" begin
-    @test tok("#= #= =#", 1).kind == K"ErrorEofMultiComment"
-    @test tok("'dsadsa", 1).kind == K"ErrorEofChar"
-    @test tok("aa **", 3).kind == K"ErrorInvalidOperator"
+    @test tok("#= #= =#", 1).kind == K"ErrorEofMultiComment"
+    @test tok("aa **", 3).kind == K"ErrorInvalidOperator"
 end

 @testset "xor_eq" begin
