Skip to content

Commit 05c594b

Browse files
authored
handle ZWJ and emoji sequences, don't break identifiers within graphemes (#372)
* handle ZWJ and emoji sequences
* forbid ZWNJ at end
* fix tests on Julia < 1.5
* ascii fast path
* fix for earlier Julia versions
* Update test/tokenize.jl
1 parent 48ddfc6 commit 05c594b

File tree

3 files changed

+25
-5
lines changed

3 files changed

+25
-5
lines changed

src/tokenize.jl

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1284,13 +1284,31 @@ function lex_backtick(l::Lexer)
12841284
end
12851285

12861286
const MAX_KW_LENGTH = 10
1287+
const ascii_is_identifier_char = Bool[is_identifier_char(Char(b)) for b=0x00:0x7f]
12871288
function lex_identifier(l::Lexer, c)
12881289
h = simple_hash(c, UInt64(0))
12891290
n = 1
1291+
ascii = isascii(c)
1292+
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
1293+
graphemestate_peek = Ref(zero(Int32))
12901294
while true
12911295
pc, ppc = dpeekchar(l)
1292-
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1293-
break
1296+
ascii = ascii && isascii(pc)
1297+
if ascii # fast path
1298+
pc_byte = pc % UInt8
1299+
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
1300+
break
1301+
end
1302+
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
1303+
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1304+
break
1305+
end
1306+
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
1307+
# ZWJ/ZWNJ only within grapheme sequences, not at end
1308+
graphemestate_peek[] = graphemestate[]
1309+
if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
1310+
break
1311+
end
12941312
end
12951313
c = readchar(l)
12961314
h = simple_hash(c, h)

test/diagnostics.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ function diagnostic(str; only_first=false, allow_multiple=false, rule=:all, vers
77
if !only_first
88
@test length(stream.diagnostics) == 1
99
end
10-
return stream.diagnostics[1]
10+
return isempty(stream.diagnostics) ? nothing : stream.diagnostics[1]
1111
end
1212
end
1313

test/tokenize.jl

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,12 +44,14 @@ end
4444
end # testset
4545

4646
@testset "tokenize unicode" begin
47-
str = "𝘋 =2β"
47+
# FIXME: rm VERSION check once we implement our own is_identifier_char
48+
emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
49+
str = "𝘋 =2"*emoji
4850
for s in [str, IOBuffer(str)]
4951
l = tokenize(s)
5052
kinds = [K"Identifier", K"Whitespace", K"=",
5153
K"Integer", K"Identifier", K"EndMarker"]
52-
token_strs = ["𝘋", " ", "=", "2", "β", ""]
54+
token_strs = ["𝘋", " ", "=", "2", emoji, ""]
5355
for (i, n) in enumerate(l)
5456
@test kind(n) == kinds[i]
5557
@test untokenize(n, str) == token_strs[i]

0 commit comments

Comments (0)