@@ -7,7 +7,169 @@ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
7
7
import ..JuliaSyntax: kind,
8
8
is_literal, is_error, is_contextual_keyword, is_word_operator
9
9
10
- include (" tokenize_utils.jl" )
10
+ #-------------------------------------------------------------------------------
11
+ # Character-based predicates for tokenization
12
+ import Base.Unicode
13
+
14
# Sentinel character standing in for "end of input": `readchar` returns it when
# the stream is exhausted, and the character predicates reject it up front.
const EOF_CHAR = typemax(Char)
15
+
16
# Return `true` when `c` is accepted by `Base.is_id_char`.
# The end-of-input sentinel and invalid characters are never identifier
# characters; they are rejected before `Base.is_id_char` is consulted.
function is_identifier_char(c::Char)
    return c != EOF_CHAR && Base.isvalid(c) && Base.is_id_char(c)
end
21
+
22
# Return `true` when `c` is accepted by `Base.is_id_start_char`.
# The end-of-input sentinel and invalid characters are rejected before
# `Base.is_id_start_char` is consulted.
function is_identifier_start_char(c::Char)
    return c != EOF_CHAR && Base.isvalid(c) && Base.is_id_start_char(c)
end
27
+
28
# Chars that we will never allow to be part of a valid non-operator identifier.
# Invalid characters are always reported as "never an identifier char".
function is_never_id_char(ch::Char)
    Base.isvalid(ch) || return true
    category = Unicode.category_code(ch)
    code = UInt32(ch)

    # Spaces and control characters (category codes ZS through CS).
    if Unicode.UTF8PROC_CATEGORY_ZS <= category <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end

    # ASCII and Latin1 non-connector punctuation (category codes PD through PO).
    if code < 0xff &&
       Unicode.UTF8PROC_CATEGORY_PD <= category <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end

    # Backtick, plus the bracket-like code points listed explicitly below.
    return code == UInt32('`') ||
           0x27e6 <= code <= 0x27ef ||            # mathematical brackets
           0x3008 <= code <= 0x3011 ||            # angle, corner, and lenticular brackets
           0x3014 <= code <= 0x301b ||            # tortoise shell, square, and more lenticular brackets
           code == 0xff08 || code == 0xff09 ||    # fullwidth parens
           code == 0xff3b || code == 0xff3d       # fullwidth square brackets
end
55
+
56
# Read the next `Char` from `io`, or return `EOF_CHAR` when `io` is at
# end-of-file.
function readchar(io::IO)
    if eof(io)
        return EOF_CHAR
    end
    return read(io, Char)
end
57
+
58
# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds. See also normalize_identifier()
const _ops_with_unicode_aliases = [
    # \minus '−' is normalized into K"-"
    '−' => K"-",
    # Lookalikes which are normalized into K"⋅", see
    # https://github.com/JuliaLang/julia/pull/25157
    '\u00b7' => K"⋅",  # '·' Middle Dot
    '\u0387' => K"⋅",  # '·' Greek Ano Teleia
]
68
+
69
# Collect every `Kind` whose numeric value lies between K"BEGIN_OPS" and
# K"END_OPS" (inclusive), minus the error kinds, the dot-based kinds and the
# word operators listed in `excluded`.
function _nondot_symbolic_operator_kinds()
    first_code = reinterpret(UInt16, K"BEGIN_OPS")
    last_code = reinterpret(UInt16, K"END_OPS")
    all_op_kinds = [reinterpret(Kind, code) for code in first_code:last_code]
    excluded = [
        K"ErrorInvalidOperator",
        K"Error**",
        K"...",
        K".",
        K"where",
        K"isa",
        K"in",
        K".'",
    ]
    return setdiff(all_op_kinds, excluded)
end
82
+
83
# Build an expression which tests whether the integer code point held in the
# variable named `varname` belongs to the set of characters `firstchars`.
#
# The code points are deduplicated and sorted, then runs of consecutive values
# are collapsed into range checks (`lo <= var <= hi`); isolated values become
# equality checks (`var == code`). The individual checks are joined with `||`.
#
# Arguments:
#   varname    - symbol (or expression) naming the code point variable
#   firstchars - collection of values convertible to `UInt32`; may be empty
#
# Returns an `Expr` suitable for interpolation into generated code, or the
# literal `false` when `firstchars` is empty (nothing is a member of the empty
# set, and folding an empty list of terms would otherwise throw).
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    terms = Expr[]
    i = 1
    while i <= length(codes)
        # Extend `j` to the end of the run of consecutive code points
        # starting at index `i`.
        j = i
        while j < length(codes) && codes[j+1] == codes[j]+1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j+1
    end
    isempty(terms) && return false
    return foldr((t1,t2)->:($t1 || $t2), terms)
end
101
+
102
# Return `true` if `c` is the first character of the string form of one of the
# kinds from `_nondot_symbolic_operator_kinds()`, or is one of the characters
# in `_ops_with_unicode_aliases`. The membership test is generated once, when
# this definition is evaluated, by `_char_in_set_expr`.
@eval function is_operator_start_char(c)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    codepoint = UInt32(c)
    return $(_char_in_set_expr(:codepoint,
        [first.(string.(_nondot_symbolic_operator_kinds()));
         first.(_ops_with_unicode_aliases)]))
end
111
+
112
# Checks whether a Char is an operator which can be prefixed with a dot `.`
# The characters `?`, `$`, `:` and `'` are excluded; anything else is dottable
# exactly when it is an operator start character.
function is_dottable_operator_start_char(c)
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end
116
+
117
# Return `true` if `c` is accepted as an operator suffix character by the
# checks below: either its Unicode category code is MN, MC or ME, or it is one
# of the explicitly listed characters. The end-of-input sentinel, invalid
# characters and code points below 0xa1 are always rejected.
@eval function isopsuffix(c::Char)
    (c != EOF_CHAR && Base.isvalid(c)) || return false
    codepoint = UInt32(c)
    0xa1 <= codepoint <= 0x10ffff || return false
    category = Base.Unicode.category_code(codepoint)
    is_mark = category == Base.Unicode.UTF8PROC_CATEGORY_MN ||
              category == Base.Unicode.UTF8PROC_CATEGORY_MC ||
              category == Base.Unicode.UTF8PROC_CATEGORY_ME
    is_mark && return true
    # Additional allowed cases
    return $(_char_in_set_expr(:codepoint,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end
134
+
135
# Return `true` if `k` lies in the K"BEGIN_OPS"..K"END_OPS" range and is not
# one of the exclusions below.
# NOTE(review): presumably this decides whether an operator of kind `k` may be
# followed by suffix characters (see `isopsuffix`) — confirm against callers.
function optakessuffix(k)
    # Anything outside the operator range never qualifies.
    K"BEGIN_OPS" <= k <= K"END_OPS" || return false

    # Whole ranges of kinds which are excluded.
    if K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" || K"¬" <= k <= K"∜"
        return false
    end

    # Individually excluded kinds.
    is_excluded =
        k == K"..."   ||
        k == K"?"     ||
        k == K"<:"    ||
        k == K">:"    ||
        k == K"&&"    ||
        k == K"||"    ||
        k == K"in"    ||
        k == K"isa"   ||
        k == K"≔"     ||
        k == K"⩴"     ||
        k == K":"     ||
        k == K".."    ||
        k == K"$"     ||
        k == K"::"    ||
        k == K"where" ||
        k == K"."     ||
        k == K"!"     ||
        k == K".'"    ||
        k == K"->"
    return !is_excluded
end
161
+
162
# Lookup table from a single non-ASCII character to its operator `Kind`.
# It holds every kind from `_nondot_symbolic_operator_kinds()` whose string
# form is exactly one non-ASCII character, followed by the entries of
# `_ops_with_unicode_aliases` (inserted last, so they win on a key collision).
const _unicode_ops = let
    single_char_ops = Pair{Char, Kind}[]
    for kind in _nondot_symbolic_operator_kinds()
        name = string(kind)
        if length(name) == 1 && !isascii(name[1])
            push!(single_char_ops, first(name) => kind)
        end
    end

    table = Dict{Char, Kind}(single_char_ops)
    for alias in _ops_with_unicode_aliases
        push!(table, alias)
    end
    table
end
11
173
12
174
#-------------------------------------------------------------------------------
13
175
# Tokens
0 commit comments