Skip to content

Commit 3994395

Browse files
committed
Relocate code from tokenize_utils.jl to tokenize.jl
Now that this code is much cleaner and shorter it makes sense to have the whole lexer in one file.
1 parent b39008a commit 3994395

File tree

2 files changed

+163
-162
lines changed

src/tokenize.jl

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,169 @@ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
77
import ..JuliaSyntax: kind,
88
is_literal, is_error, is_contextual_keyword, is_word_operator
99

10-
include("tokenize_utils.jl")
10+
#-------------------------------------------------------------------------------
11+
# Character-based predicates for tokenization
12+
import Base.Unicode
13+
14+
# Sentinel character returned when input is exhausted. `typemax(Char)` is not
# a valid Unicode scalar, so it can never collide with real input.
const EOF_CHAR = typemax(Char)

# Return true if `c` may appear in the interior of an identifier.
function is_identifier_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_char(c)
end

# Return true if `c` may appear as the first character of an identifier.
function is_identifier_start_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_start_char(c)
end
27+
28+
# Chars that we will never allow to be part of a valid non-operator identifier
function is_never_id_char(ch::Char)
    Base.isvalid(ch) || return true
    cat = Unicode.category_code(ch)
    c = UInt32(ch)

    # Spaces, separators and control characters
    is_space_or_control =
        Unicode.UTF8PROC_CATEGORY_ZS <= cat <= Unicode.UTF8PROC_CATEGORY_CS

    # ASCII and Latin1 non-connector punctuation
    is_latin1_punctuation =
        c < 0xff &&
        Unicode.UTF8PROC_CATEGORY_PD <= cat <= Unicode.UTF8PROC_CATEGORY_PO

    # Bracket-like characters which must remain delimiters
    is_bracket_like =
        (0x27e6 <= c <= 0x27ef) ||            # mathematical brackets
        (0x3008 <= c <= 0x3011) ||            # angle, corner, and lenticular brackets
        (0x3014 <= c <= 0x301b) ||            # tortoise shell, square, and more lenticular brackets
        c == 0xff08 || c == 0xff09 ||         # fullwidth parens
        c == 0xff3b || c == 0xff3d            # fullwidth square brackets

    return is_space_or_control ||
           is_latin1_punctuation ||
           c == UInt32('`') ||
           is_bracket_like
end
55+
56+
# Read the next `Char` from `io`, returning `EOF_CHAR` when no input remains.
function readchar(io::IO)
    if eof(io)
        return EOF_CHAR
    end
    return read(io, Char)
end
57+
58+
# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds. See also normalize_identifier()
const _ops_with_unicode_aliases = [
    '−'      => K"-",   # \minus is normalized into K"-"
    # Lookalikes which are normalized into K"⋅"
    # https://github.com/JuliaLang/julia/pull/25157
    '\u00b7' => K"⋅",   # '·' Middle Dot
    '\u0387' => K"⋅",   # '·' Greek Ano Teleia
]
68+
69+
# All operator kinds which are purely symbolic, i.e. excluding error kinds,
# word operators and the dot-related special cases.
function _nondot_symbolic_operator_kinds()
    excluded = [
        K"ErrorInvalidOperator",
        K"Error**",
        K"...",
        K".",
        K"where",
        K"isa",
        K"in",
        K".'",
    ]
    op_codes = reinterpret(UInt16, K"BEGIN_OPS"):reinterpret(UInt16, K"END_OPS")
    return setdiff(reinterpret.(Kind, op_codes), excluded)
end
82+
83+
# Build an expression which tests whether the `UInt32` value bound to
# `varname` is one of the code points in `firstchars`. Runs of consecutive
# code points are collapsed into range comparisons so the generated
# expression stays compact.
#
# Returns an `Expr` (or the literal `false` for an empty set) suitable for
# interpolation into an `@eval`'d function body.
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    terms = []
    i = 1
    while i <= length(codes)
        # Extend `j` to the end of the run of consecutive code points at `i`
        j = i
        while j < length(codes) && codes[j+1] == codes[j]+1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j+1
    end
    # An empty set matches nothing. (Previously `foldr` over zero terms threw
    # a MethodError; return a constant-false expression instead.)
    isempty(terms) && return false
    foldr((t1,t2)->:($t1 || $t2), terms)
end
101+
102+
# Return true if `c` can start a symbolic operator token. The character set
# is baked in at load time from the operator kind names plus unicode aliases.
@eval function is_operator_start_char(c)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    cp = UInt32(c)
    return $(_char_in_set_expr(:cp,
        append!(first.(string.(_nondot_symbolic_operator_kinds())),
                first.(_ops_with_unicode_aliases))))
end
111+
112+
# Checks whether a Char is an operator which can be prefixed with a dot `.`
function is_dottable_operator_start_char(c)
    # `?`, `$`, `:` and `'` are operator-starting chars that never take a dot
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end
116+
117+
# Return true if `c` may act as a suffix character on an operator (e.g. `+ₐ`).
@eval function isopsuffix(c::Char)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    cp = UInt32(c)
    (cp < 0xa1 || cp > 0x10ffff) && return false
    cat = Base.Unicode.category_code(cp)
    # Combining marks (Mn, Mc, Me) are always allowed as operator suffixes
    if cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
       cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
       cat == Base.Unicode.UTF8PROC_CATEGORY_ME
        return true
    end
    # Additional allowed cases
    return $(_char_in_set_expr(:cp,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end
134+
135+
# Return true if operator kind `k` may be followed by suffix characters
# (see `isopsuffix`).
function optakessuffix(k)
    # Only operators can take suffixes at all
    (K"BEGIN_OPS" <= k <= K"END_OPS") || return false
    # Assignment operators and the ¬..∜ range never take suffixes
    if K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" || K"¬" <= k <= K"∜"
        return false
    end
    # Remaining individually-excluded operator kinds
    no_suffix_kinds = (
        K"...", K"?", K"<:", K">:", K"&&", K"||", K"in", K"isa",
        K"≔", K"⩴", K":", K"..", K"$", K"::", K"where", K".",
        K"!", K".'", K"->",
    )
    return !(k in no_suffix_kinds)
end
161+
162+
# Map from single non-ascii operator characters to their kinds, including the
# unicode alias characters from `_ops_with_unicode_aliases`.
const _unicode_ops = let
    kinds = _nondot_symbolic_operator_kinds()
    table = Dict{Char, Kind}()
    for k in kinds
        s = string(k)
        # Only single-character, non-ascii operator names belong here
        if length(s) == 1 && !isascii(s[1])
            table[first(s)] = k
        end
    end
    for (c, k) in _ops_with_unicode_aliases
        table[c] = k
    end
    table
end
11173

12174
#-------------------------------------------------------------------------------
13175
# Tokens

src/tokenize_utils.jl

Lines changed: 0 additions & 161 deletions
This file was deleted.

0 commit comments

Comments
 (0)