@@ -7,7 +7,169 @@ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
77import ..JuliaSyntax: kind,
88 is_literal, is_error, is_contextual_keyword, is_word_operator
99
10- include("tokenize_utils.jl")
10+ #-------------------------------------------------------------------------------
11+ # Character-based predicates for tokenization
12+ import Base.Unicode
13+
# Sentinel `Char` used to signal end of input (see `readchar`). `typemax(Char)`
# is not a valid Unicode character, so it cannot be confused with real text.
const EOF_CHAR = typemax(Char)
15+
# Return `true` when `Base.is_id_char` accepts `c`. The EOF sentinel and
# invalid characters are rejected up front and never reach `Base.is_id_char`.
function is_identifier_char(c::Char)
    return c != EOF_CHAR && Base.isvalid(c) && Base.is_id_char(c)
end
21+
# Return `true` when `Base.is_id_start_char` accepts `c`. The EOF sentinel and
# invalid characters are rejected up front and never reach that check.
function is_identifier_start_char(c::Char)
    return c != EOF_CHAR && Base.isvalid(c) && Base.is_id_start_char(c)
end
27+
# Characters that are never allowed inside a valid non-operator identifier.
# Invalid characters are always reported as "never an identifier char".
function is_never_id_char(ch::Char)
    Base.isvalid(ch) || return true
    code = UInt32(ch)
    category = Unicode.category_code(ch)

    # Spaces and control characters.
    if Unicode.UTF8PROC_CATEGORY_ZS <= category <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end

    # ASCII and Latin1 non-connector punctuation.
    if code < 0xff &&
       Unicode.UTF8PROC_CATEGORY_PD <= category <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end

    # The backtick is excluded explicitly.
    code == UInt32('`') && return true

    # Mathematical brackets.
    0x27e6 <= code <= 0x27ef && return true
    # Angle, corner, and lenticular brackets.
    0x3008 <= code <= 0x3011 && return true
    # Tortoise shell, square, and more lenticular brackets.
    0x3014 <= code <= 0x301b && return true
    # Fullwidth parentheses.
    (code == 0xff08 || code == 0xff09) && return true
    # Fullwidth square brackets.
    return code == 0xff3b || code == 0xff3d
end
55+
# Read the next `Char` from `io`; once the stream is at end-of-file, return
# the `EOF_CHAR` sentinel instead of reading.
function readchar(io::IO)
    if eof(io)
        return EOF_CHAR
    end
    return read(io, Char)
end
57+
# Unicode operator characters which the tokenizer normalizes into the kind of
# an equivalent operator. See also normalize_identifier()
const _ops_with_unicode_aliases = [
    # \minus '−' is normalized into K"-"
    '−' => K"-",
    # Lookalikes which are normalized into K"⋅";
    # see https://github.com/JuliaLang/julia/pull/25157
    '\u00b7' => K"⋅",  # '·' Middle Dot
    '\u0387' => K"⋅",  # '·' Greek Ano Teleia
]
68+
# Collect every `Kind` whose raw `UInt16` value lies between K"BEGIN_OPS" and
# K"END_OPS" (inclusive), minus the kinds listed in `excluded`.
function _nondot_symbolic_operator_kinds()
    excluded = [
        K"ErrorInvalidOperator",
        K"Error**",
        K"...",
        K".",
        K"where",
        K"isa",
        K"in",
        K".'",
    ]
    first_code = reinterpret(UInt16, K"BEGIN_OPS")
    last_code = reinterpret(UInt16, K"END_OPS")
    all_ops = [reinterpret(Kind, code) for code in first_code:last_code]
    return setdiff(all_ops, excluded)
end
82+
# Build an expression which tests whether the code point held in the variable
# named `varname` belongs to the set of characters `firstchars`.
#
# The characters are deduplicated, converted to `UInt32` and sorted; each run
# of consecutive code points becomes a single range test `lo <= var <= hi`,
# and an isolated code point becomes `var == code`. The tests are joined with
# `||`, nested to the right.
#
# Arguments:
# * `varname`    - expression (normally a `Symbol`) spliced in as the value
#                  under test
# * `firstchars` - collection of characters making up the set
#
# Returns an `Expr`, or the literal `false` when `firstchars` is empty.
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    # An empty set matches nothing. Without this guard the `foldr` below
    # would throw, because it cannot reduce an empty collection.
    isempty(codes) && return false
    terms = Expr[]
    i = firstindex(codes)
    while i <= lastindex(codes)
        # Extend `j` to the end of the run of consecutive code points
        # starting at `i`.
        j = i
        while j < lastindex(codes) && codes[j + 1] == codes[j] + 1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j + 1
    end
    return foldr((lhs, rhs) -> :($lhs || $rhs), terms)
end
101+
# Return `true` when `c` is the first character of the string form of one of
# the kinds from `_nondot_symbolic_operator_kinds()`, or is one of the
# characters listed in `_ops_with_unicode_aliases`. The EOF sentinel and
# invalid characters are never operator start characters.
#
# The membership test is generated once, when this definition is evaluated,
# and spliced into the function body.
@eval function is_operator_start_char(c)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    u = UInt32(c)
    return $(let
        op_chars = first.(string.(_nondot_symbolic_operator_kinds()))
        alias_chars = first.(_ops_with_unicode_aliases)
        _char_in_set_expr(:u, vcat(op_chars, alias_chars))
    end)
end
111+
# Return `true` when `c` can start an operator that may be prefixed with a
# dot `.`: any operator start character other than `?`, `$`, `:` and `'`.
function is_dottable_operator_start_char(c)
    c in ('?', '$', ':', '\'') && return false
    return is_operator_start_char(c)
end
116+
# Return `true` when `c` is accepted as an operator suffix character: either
# its Unicode category is one of the mark categories (Mn, Mc, Me) or it is one
# of the explicitly listed characters below. The EOF sentinel, invalid
# characters and code points below 0xa1 are rejected.
@eval function isopsuffix(c::Char)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    u = UInt32(c)
    0xa1 <= u <= 0x10ffff || return false
    cat = Base.Unicode.category_code(u)
    mark_categories = (
        Base.Unicode.UTF8PROC_CATEGORY_MN,
        Base.Unicode.UTF8PROC_CATEGORY_MC,
        Base.Unicode.UTF8PROC_CATEGORY_ME,
    )
    cat in mark_categories && return true
    # Additional allowed cases; the test is generated when this definition is
    # evaluated.
    return $(_char_in_set_expr(:u,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end
134+
# Return `true` when kind `k` lies in the operator range
# K"BEGIN_OPS"..K"END_OPS" and is not one of the exclusions below: the
# assignment range, the K"¬"..K"∜" range, and an explicit list of kinds.
function optakessuffix(k)
    K"BEGIN_OPS" <= k <= K"END_OPS" || return false
    K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" && return false
    K"¬" <= k <= K"∜" && return false
    return !(k in (
        K"...", K"?", K"<:", K">:", K"&&", K"||", K"in", K"isa", K"≔", K"⩴",
        K":", K"..", K"$", K"::", K"where", K".", K"!", K".'", K"->",
    ))
end
161+
# Lookup table from a single non-ASCII operator character to its `Kind`.
# It holds every kind from `_nondot_symbolic_operator_kinds()` whose string
# form is exactly one non-ASCII character, followed by the entries of
# `_ops_with_unicode_aliases` (which overwrite any earlier entry for the same
# character).
const _unicode_ops = let
    table = Dict{Char, Kind}()
    for k in _nondot_symbolic_operator_kinds()
        s = string(k)
        if length(s) == 1 && !isascii(s[1])
            table[first(s)] = k
        end
    end
    for (ch, k) in _ops_with_unicode_aliases
        table[ch] = k
    end
    table
end
11173
12174#-------------------------------------------------------------------------------
13175# Tokens
0 commit comments