Skip to content

Commit 05c594b

Browse files
authored
handle ZWJ and emoji sequences, don't break identifiers within graphemes (#372)
* handle ZWJ and emoji sequences
* forbid ZWNJ at end
* fix tests on Julia < 1.5
* ascii fast path
* fix for earlier Julia versions
* Update test/tokenize.jl
1 parent 48ddfc6 commit 05c594b

File tree

3 files changed

+25
-5
lines changed

3 files changed

+25
-5
lines changed

src/tokenize.jl

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1284,13 +1284,31 @@ function lex_backtick(l::Lexer)
12841284
end
12851285

12861286
const MAX_KW_LENGTH = 10
1287+
const ascii_is_identifier_char = Bool[is_identifier_char(Char(b)) for b=0x00:0x7f]
12871288
function lex_identifier(l::Lexer, c)
12881289
h = simple_hash(c, UInt64(0))
12891290
n = 1
1291+
ascii = isascii(c)
1292+
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
1293+
graphemestate_peek = Ref(zero(Int32))
12901294
while true
12911295
pc, ppc = dpeekchar(l)
1292-
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1293-
break
1296+
ascii = ascii && isascii(pc)
1297+
if ascii # fast path
1298+
pc_byte = pc % UInt8
1299+
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
1300+
break
1301+
end
1302+
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
1303+
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1304+
break
1305+
end
1306+
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
1307+
# ZWJ/ZWNJ only within grapheme sequences, not at end
1308+
graphemestate_peek[] = graphemestate[]
1309+
if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
1310+
break
1311+
end
12941312
end
12951313
c = readchar(l)
12961314
h = simple_hash(c, h)

test/diagnostics.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ function diagnostic(str; only_first=false, allow_multiple=false, rule=:all, vers
77
if !only_first
88
@test length(stream.diagnostics) == 1
99
end
10-
return stream.diagnostics[1]
10+
return isempty(stream.diagnostics) ? nothing : stream.diagnostics[1]
1111
end
1212
end
1313

test/tokenize.jl

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,12 +44,14 @@ end
4444
end # testset
4545

4646
@testset "tokenize unicode" begin
47-
str = "𝘋 =2β"
47+
# FIXME: rm VERSION check once we implement our own is_identifier_char
48+
emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
49+
str = "𝘋 =2"*emoji
4850
for s in [str, IOBuffer(str)]
4951
l = tokenize(s)
5052
kinds = [K"Identifier", K"Whitespace", K"=",
5153
K"Integer", K"Identifier", K"EndMarker"]
52-
token_strs = ["𝘋", " ", "=", "2", "β", ""]
54+
token_strs = ["𝘋", " ", "=", "2", emoji, ""]
5355
for (i, n) in enumerate(l)
5456
@test kind(n) == kinds[i]
5557
@test untokenize(n, str) == token_strs[i]

0 commit comments

Comments (0)