@@ -2,12 +2,174 @@ module Tokenize
22
33export tokenize, untokenize, Tokens
44
5- using ..JuliaSyntax: Kind, @K_str
5+ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
66
77import ..JuliaSyntax: kind,
88 is_literal, is_error, is_contextual_keyword, is_word_operator
99
10- include("tokenize_utils.jl")
10+ #-------------------------------------------------------------------------------
11+ # Character-based predicates for tokenization
12+ import Base.Unicode
13+
# Sentinel character used to signal end of input. `typemax(Char)` is not a
# valid Unicode character, so it cannot collide with real source text.
const EOF_CHAR = typemax(Char)

# Return `true` when `c` may appear inside an identifier.
#
# The end-of-input sentinel and invalid characters are rejected before
# consulting `Base.is_id_char`.
function is_identifier_char(c::Char)
    return c != EOF_CHAR && Base.isvalid(c) && Base.is_id_char(c)
end
21+
# Return `true` when `c` may begin an identifier.
#
# The end-of-input sentinel and invalid characters are rejected before
# consulting `Base.is_id_start_char`.
function is_identifier_start_char(c::Char)
    return c != EOF_CHAR && Base.isvalid(c) && Base.is_id_start_char(c)
end
27+
# Chars that we will never allow to be part of a valid non-operator identifier
#
# Invalid characters are always rejected (the function returns `true`). For
# valid characters the decision is based on the Unicode general category and a
# handful of explicit code point ranges.
function is_never_id_char(ch::Char)
    Base.isvalid(ch) || return true
    code = UInt32(ch)
    category = Unicode.category_code(ch)

    # Spaces and control characters
    if Unicode.UTF8PROC_CATEGORY_ZS <= category <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end

    # ASCII and Latin1 non-connector punctuation
    if code < 0xff &&
       Unicode.UTF8PROC_CATEGORY_PD <= category <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end

    # Backtick
    code == UInt32('`') && return true

    # Mathematical brackets
    (0x27e6 <= code <= 0x27ef) && return true
    # Angle, corner, and lenticular brackets
    (0x3008 <= code <= 0x3011) && return true
    # Tortoise shell, square, and more lenticular brackets
    (0x3014 <= code <= 0x301b) && return true
    # Fullwidth parens
    (code == 0xff08 || code == 0xff09) && return true
    # Fullwidth square brackets
    return code == 0xff3b || code == 0xff3d
end
55+
# Read the next `Char` from `io`, or return `EOF_CHAR` when `io` is at end of
# file.
function readchar(io::IO)
    eof(io) && return EOF_CHAR
    return read(io, Char)
end
57+
# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds. See also normalize_identifier()
#
# Each entry maps an alias character to the kind it is normalized into.
const _ops_with_unicode_aliases = Pair{Char,Kind}[
    # \minus '−' is normalized into K"-"
    '−' => K"-",
    # Lookalikes which are normalized into K"⋅"
    # https://github.com/JuliaLang/julia/pull/25157
    '\u00b7' => K"⋅",  # '·' Middle Dot
    '\u0387' => K"⋅",  # '·' Greek Ano Teleia
]
68+
# Return every kind in the range K"BEGIN_OPS" to K"END_OPS" (inclusive) except
# the kinds listed in `excluded` below.
function _nondot_symbolic_operator_kinds()
    first_code = reinterpret(UInt16, K"BEGIN_OPS")
    last_code = reinterpret(UInt16, K"END_OPS")
    all_ops = [reinterpret(Kind, code) for code in first_code:last_code]
    excluded = [
        K"ErrorInvalidOperator",
        K"Error**",
        K"...",
        K".",
        K"where",
        K"isa",
        K"in",
        K".'",
    ]
    return setdiff(all_ops, excluded)
end
82+
# Build an expression which tests whether the code point bound to `varname`
# is a member of the character set `firstchars`.
#
# Arguments:
# * `varname`    - symbol (or expression) which will hold a `UInt32` code point
#                  where the generated expression is evaluated.
# * `firstchars` - iterable of characters making up the set; duplicates and
#                  ordering are irrelevant.
#
# Returns an `Expr` consisting of `||`-chained tests. Runs of consecutive code
# points are collapsed into a single range comparison `lo <= var <= hi`, and
# isolated code points become `var == code`. For an empty set the literal
# `false` is returned, so that the result can always be spliced into code.
function _char_in_set_expr(varname, firstchars)
    # Deduplicate after conversion so that equal code points always merge.
    codes = unique!(sort!(UInt32[UInt32(c) for c in firstchars]))
    # Nothing is a member of the empty set; `foldr` below cannot reduce an
    # empty collection.
    isempty(codes) && return false

    terms = Expr[]
    lo = 1
    while lo <= length(codes)
        # Extend `hi` to the end of the run of consecutive code points
        hi = lo
        while hi < length(codes) && codes[hi + 1] == codes[hi] + 1
            hi += 1
        end
        if lo == hi
            push!(terms, :($varname == $(codes[lo])))
        else
            push!(terms, :($(codes[lo]) <= $varname <= $(codes[hi])))
        end
        lo = hi + 1
    end
    return foldr((t1, t2) -> :($t1 || $t2), terms)
end
101+
# Return `true` when `c` is the first character of one of the operators in
# `_nondot_symbolic_operator_kinds()` or one of the alias characters in
# `_ops_with_unicode_aliases`.
#
# The membership test is generated once, at definition time, by
# `_char_in_set_expr` and spliced into the function body via `@eval`.
@eval function is_operator_start_char(c)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    u = UInt32(c)
    return $(_char_in_set_expr(:u,
        append!(first.(string.(_nondot_symbolic_operator_kinds())),
                first.(_ops_with_unicode_aliases))))
end
111+
# Checks whether a Char is an operator which can be prefixed with a dot `.`
#
# The characters `?`, `$`, `:` and `'` are excluded; anything else is dottable
# exactly when it is an operator start character.
function is_dottable_operator_start_char(c)
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end
116+
# Return `true` when `c` is allowed as an operator suffix character.
#
# Accepted characters are those at or above U+00A1 which are either in one of
# the Unicode mark categories (Mn, Mc, Me) or in the explicit list of
# additional characters below. The explicit list is turned into a membership
# test by `_char_in_set_expr` and spliced in via `@eval`.
@eval function isopsuffix(c::Char)
    (c == EOF_CHAR || !Base.isvalid(c)) && return false
    u = UInt32(c)
    (0xa1 <= u <= 0x10ffff) || return false

    cat = Base.Unicode.category_code(u)
    is_mark = cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
              cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
              cat == Base.Unicode.UTF8PROC_CATEGORY_ME
    is_mark && return true

    # Additional allowed cases
    return $(_char_in_set_expr(:u,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end
134+
# Return `true` when the operator kind `k` accepts operator suffix characters.
#
# Only kinds within K"BEGIN_OPS" to K"END_OPS" qualify. Of those, the
# assignment range, the K"¬" to K"∜" range and the individually listed kinds
# are excluded.
function optakessuffix(k)
    K"BEGIN_OPS" <= k <= K"END_OPS" || return false
    K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" && return false
    K"¬" <= k <= K"∜" && return false
    return !(k in (
        K"...",
        K"?",
        K"<:",
        K">:",
        K"&&",
        K"||",
        K"in",
        K"isa",
        K"≔",
        K"⩴",
        K":",
        K"..",
        K"$",
        K"::",
        K"where",
        K".",
        K"!",
        K".'",
        K"->",
    ))
end
161+
# Lookup table from a single non-ASCII operator character to its kind.
#
# Populated from every kind in `_nondot_symbolic_operator_kinds()` whose
# string form is exactly one non-ASCII character, followed by the entries of
# `_ops_with_unicode_aliases`.
const _unicode_ops = let
    table = Dict{Char, Kind}()
    for k in _nondot_symbolic_operator_kinds()
        s = string(k)
        if length(s) == 1 && !isascii(s[1])
            table[first(s)] = k
        end
    end
    # Aliases are inserted last, as in the original construction order
    for (ch, k) in _ops_with_unicode_aliases
        table[ch] = k
    end
    table
end
11173
12174#-------------------------------------------------------------------------------
13175# Tokens
@@ -370,7 +532,7 @@ function _next_token(l::Lexer, c)
370532 return lex_identifier(l, c)
371533 elseif isdigit(c)
372534 return lex_digit(l, K"Integer")
373- elseif (k = get(UNICODE_OPS , c, K"error")) != K"error"
535+ elseif (k = get(_unicode_ops , c, K"error")) != K"error"
374536 return emit(l, k)
375537 else
376538 emit_error(l, K"ErrorUnknownCharacter")
@@ -416,6 +578,7 @@ function lex_string_chunk(l)
416578 !(pc == EOF_CHAR || is_operator_start_char(pc) || is_never_id_char(pc))
417579 # Only allow certain characters after interpolated vars
418580 # https://github.com/JuliaLang/julia/pull/25234
581+ readchar(l)
419582 return emit_error(l, K"ErrorInvalidInterpolationTerminator")
420583 end
421584 if pc == EOF_CHAR
@@ -771,7 +934,7 @@ function lex_digit(l::Lexer, kind)
771934 # If we enter the function with kind == K"Float" then a '.' has been parsed.
772935 readchar(l)
773936 return emit_error(l, K"ErrorInvalidNumericConstant")
774- elseif is_operator_start_char (ppc) && ppc !== ':'
937+ elseif is_dottable_operator_start_char (ppc)
775938 readchar(l)
776939 return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
777940 end
@@ -787,14 +950,14 @@ function lex_digit(l::Lexer, kind)
787950 accept(l, "+-−")
788951 if accept_batch(l, isdigit)
789952 pc,ppc = dpeekchar(l)
790- if pc === '.' && !dotop2 (ppc)
953+ if pc === '.' && !is_dottable_operator_start_char (ppc)
791954 readchar(l)
792955 return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
793956 end
794957 else
795958 return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
796959 end
797- elseif pc == '.' && ppc != '.' && !is_operator_start_char (ppc)
960+ elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char (ppc)
798961 readchar(l)
799962 return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
800963 elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
@@ -808,7 +971,7 @@ function lex_digit(l::Lexer, kind)
808971 accept(l, "+-−")
809972 if accept_batch(l, isdigit)
810973 pc,ppc = dpeekchar(l)
811- if pc === '.' && !dotop2 (ppc)
974+ if pc === '.' && !is_dottable_operator_start_char (ppc)
812975 accept(l, '.')
813976 return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
814977 end
@@ -948,7 +1111,7 @@ function lex_dot(l::Lexer)
9481111 if accept(l, '.')
9491112 return emit(l, K"...")
9501113 else
951- if dotop2 (peekchar(l))
1114+ if is_dottable_operator_start_char (peekchar(l))
9521115 readchar(l)
9531116 return emit_error(l, K"ErrorInvalidOperator")
9541117 else
@@ -959,10 +1122,7 @@ function lex_dot(l::Lexer)
9591122 return lex_digit(l, K"Float")
9601123 else
9611124 pc, dpc = dpeekchar(l)
962- if dotop1(pc)
963- l.dotop = true
964- return _next_token(l, readchar(l))
965- elseif pc =='+'
1125+ if pc == '+'
9661126 l.dotop = true
9671127 readchar(l)
9681128 return lex_plus(l)
@@ -1040,6 +1200,9 @@ function lex_dot(l::Lexer)
10401200 l.dotop = true
10411201 readchar(l)
10421202 return lex_equal(l)
1203+ elseif is_dottable_operator_start_char(pc)
1204+ l.dotop = true
1205+ return _next_token(l, readchar(l))
10431206 end
10441207 return emit(l, K".")
10451208 end
0 commit comments