fix normalization and string parsing (again)

pfitzseb · pfitzseb · commit c510357f2441 · 2022-05-13T21:01:48.000+02:00
diff --git a/src/components/strings.jl b/src/components/strings.jl
@@ -129,6 +129,7 @@ function parse_string_or_cmd(ps::ParseState, prefixed=false)
                 elseif !isempty(str)
                     push!(ret, ex)
                 end
+                !iscmd && _rm_escaped_newlines(ex)
                 istrip && adjust_lcp(ex)
                 startbytes = 0
                 op = EXPR(:OPERATOR, 1, 1, "\$")
@@ -227,6 +228,7 @@ function parse_string_or_cmd(ps::ParseState, prefixed=false)
                 # only mark non-interpolated triple u_strings
                 ex = EXPR(length(ret) == 0 ? :TRIPLESTRING : :STRING, lspan + ps.nt.startbyte - ps.t.endbyte - 1 + startbytes, lspan + startbytes, str)
                 # find lcp for escaped string
+                !iscmd && _rm_escaped_newlines(ex)
                 adjust_lcp(ex, true)
                 # we only want to drop the leading new line if it's a literal newline, not if it's `\n`
                 if startswith(str, "\\n")
@@ -288,6 +290,7 @@ function parse_string_or_cmd(ps::ParseState, prefixed=false)
         ret = unwrapped
     end
     if !iscmd && prefixed == false
+        _rm_escaped_newlines(ret)
         _unescape_string_expr(ret)
     end
     update_span!(ret)
@@ -297,14 +300,24 @@ end
 
 function _unescape_string_expr(expr)
     if headof(expr) === :STRING || headof(expr) === :TRIPLESTRING
-        expr.val = _unescape_string(replace(valof(expr), r"(?<!\\)((?:\\\\)*)\\\n[\s\n]*" => s"\1"))
+        expr.val = _unescape_string(valof(expr)) # backslash-newline removal now lives in _rm_escaped_newlines, which parse_string_or_cmd calls just before this
     else
         for a in expr
             _unescape_string_expr(a)
         end
     end
 end
 
+function _rm_escaped_newlines(expr) # remove backslash-newline line continuations from every string leaf of `expr`, mutating `val` in place
+    if headof(expr) === :STRING || headof(expr) === :TRIPLESTRING # string leaf: rewrite its value
+        expr.val = replace(valof(expr), r"(?<!\\)((?:\\\\)*)\\\n[\s\n]*" => s"\1") # keep the preceding run of backslash pairs (\1); drop the unescaped backslash, the newline and any whitespace after it
+    else
+        for a in expr # any other node: recurse into its children
+            _rm_escaped_newlines(a)
+        end
+    end
+end
+
 function adjustspan(x::EXPR)
     x.fullspan = x.span
     return x
diff --git a/src/conversion.jl b/src/conversion.jl
@@ -2,6 +2,9 @@
 function julia_normalization_map(c::Int32, x::Ptr{Nothing})::Int32
     return c == 0x00B5 ? 0x03BC : # micro sign -> greek small letter mu
            c == 0x025B ? 0x03B5 : # latin small letter open e -> greek small letter
+           c == 0x00B7 ? 0x22C5 : # middle dot -> dot operator
+           c == 0x0387 ? 0x22C5 : # greek ano teleia -> dot operator
+           c == 0x2212 ? 0x002D : # minus sign -> hyphen-minus
            c
 end
 
@@ -155,7 +158,7 @@ function to_codeobject(x::EXPR)
             return Symbol(lowercase(string(headof(x))))
         end
     elseif isoperator(x)
-        return Symbol(valof(x))
+        return Symbol(normalize_julia_identifier(valof(x)))
     elseif ispunctuation(x)
         if headof(x) === :DOT
             if x.args === nothing
@@ -207,13 +210,11 @@ function to_codeobject(x::EXPR)
         # Special conversion needed - the initial text section is treated as empty for the represented string following lowest-common-prefix adjustments, but exists in the source.
         Expr(:string, to_codeobject.(x.args[2:end])...)
     elseif x.args === nothing
-        # this is mostly useful for ncat
+        # for ncat/nrow etc
         int = tryparse(Int, String(x.head))
-        if int === nothing
-            Expr(Symbol(lowercase(String(x.head))))
-        else
-            int
-        end
+        int !== nothing && return int
+
+        Expr(Symbol(lowercase(String(x.head))))
     elseif x.head === :errortoken
         Expr(:error)
     else
diff --git a/src/precompile.jl b/src/precompile.jl
@@ -21,10 +21,9 @@ function _precompile()
     precompile(EXPR, (Symbol, Vector{EXPR}, Int, Int))
     precompile(EXPR, (Symbol, Vector{EXPR}))
 
-    
+
     precompile(INSTANCE, (ParseState,))
 
-    precompile(tostr, (IOBuffer,))
     precompile(str_value, (EXPR,))
 
     precompile(CSTParser.parse_expression, (ParseState,))
@@ -109,4 +108,4 @@ function _precompile()
     precompile(Tokenize.Lexers.lex_star, (Tokenize.Lexers.Lexer{IOBuffer,Tokens.RawToken},))
     precompile(Tokenize.Lexers.lex_whitespace, (Tokenize.Lexers.Lexer{IOBuffer,Tokens.RawToken},))
     precompile(Tokenize.Lexers.lex_xor, (Tokenize.Lexers.Lexer{IOBuffer,Tokens.RawToken},))
-end
+end
diff --git a/test/parser.jl b/test/parser.jl
@@ -738,9 +738,20 @@ end
             @test """throw(ArgumentError("invalid \$(m == 2 ? "hex (\\\\x)" :
             "unicode (\\\$u)") escape sequence"))""" |> test_expr
             @test "\"a\\\\\\\\\\\nb\"" |> test_expr
-            for c in 0:12
+            for c in 0:20
                 @test test_expr(string("\"a", '\\'^c, "\nb\""))
+                @test test_expr(string("\"\"\"a", '\\'^c, "\nb\"\"\""))
             end
+            for c in 0:20
+                @test test_expr(string("`a", '\\'^c, "\nb`"))
+                @test test_expr(string("```a", '\\'^c, "\nb```"))
+            end
+
+            @test "\"\"\"\n    a\\\n  b\"\"\"" |> test_expr
+            @test "\"\"\"\n        a\\\n  b\"\"\"" |> test_expr
+            @test "\"\"\"\na\\\n  b\"\"\"" |> test_expr
+            @test "\"\"\"\na\\\nb\"\"\"" |> test_expr
+            @test "\"\"\"\n   a\\\n       b\"\"\"" |> test_expr
         end
     end
 
@@ -1473,6 +1484,13 @@ end
         end
     end
 
+    if VERSION > v"1.7-"
+        @testset "normalized unicode ops" begin
+            @test "(·) == (·) == (⋅) == 5" |> test_expr
+            @test "(−) == (-) == 6" |> test_expr
+        end
+    end
+
     @testset "pair tuple" begin
         @test test_expr("a => b")
         @test test_expr("a => b, c, d")