@@ -2,12 +2,174 @@ module Tokenize
2
2
3
3
export tokenize, untokenize, Tokens
4
4
5
- using .. JuliaSyntax: Kind, @K_str
5
+ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
6
6
7
7
import ..JuliaSyntax: kind,
8
8
is_literal, is_error, is_contextual_keyword, is_word_operator
9
9
10
- include (" tokenize_utils.jl" )
10
+ #-------------------------------------------------------------------------------
11
+ # Character-based predicates for tokenization
12
+ import Base.Unicode
13
+
14
# Sentinel character returned by `readchar` when the input is exhausted.
const EOF_CHAR = typemax(Char)

"""
    is_identifier_char(c::Char)

Return `true` if `c` may appear inside an identifier (any position after
the first character). `EOF_CHAR` and invalid code points are rejected.
"""
function is_identifier_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_char(c)
end

"""
    is_identifier_start_char(c::Char)

Return `true` if `c` may appear as the first character of an identifier.
`EOF_CHAR` and invalid code points are rejected.
"""
function is_identifier_start_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_start_char(c)
end
27
+
28
"""
    is_never_id_char(ch::Char)

Return `true` for chars that we will never allow to be part of a valid
non-operator identifier: invalid code points, whitespace/control
characters, most ASCII and Latin-1 punctuation, backtick, and a number
of Unicode bracket forms.
"""
function is_never_id_char(ch::Char)
    # Invalid code points can never be identifier characters.
    Base.isvalid(ch) || return true
    cat = Unicode.category_code(ch)
    u = UInt32(ch)
    # Spaces, separators and control characters (categories Zs..Cs).
    if Unicode.UTF8PROC_CATEGORY_ZS <= cat <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end
    # ASCII and Latin1 non-connector punctuation (categories Pd..Po).
    if u < 0xff && Unicode.UTF8PROC_CATEGORY_PD <= cat <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end
    # Backtick.
    u == UInt32('`') && return true
    # Mathematical brackets.
    0x27e6 <= u <= 0x27ef && return true
    # Angle, corner, and lenticular brackets.
    0x3008 <= u <= 0x3011 && return true
    # Tortoise shell, square, and more lenticular brackets.
    0x3014 <= u <= 0x301b && return true
    # Fullwidth parens.
    (u == 0xff08 || u == 0xff09) && return true
    # Fullwidth square brackets.
    return u == 0xff3b || u == 0xff3d
end
55
+
56
# Read one `Char` from `io`, yielding `EOF_CHAR` when the stream is exhausted.
function readchar(io::IO)
    eof(io) && return EOF_CHAR
    return read(io, Char)
end
57
+
58
# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds. See also normalize_identifier().
# Each entry is a `Char => Kind` pair consumed when building `_unicode_ops`
# and `is_operator_start_char`.
const _ops_with_unicode_aliases = [
    # \minus '−' is normalized into K"-"
    '−' => K"-"
    # Lookalikes which are normalized into K"⋅"
    # https://github.com/JuliaLang/julia/pull/25157
    '\u00b7' => K"⋅" # '·' Middle Dot
    '\u0387' => K"⋅" # '·' Greek Ano Teleia
]
68
+
69
"""
    _nondot_symbolic_operator_kinds()

Return a `Vector{Kind}` containing every operator kind in the
`K"BEGIN_OPS"`..`K"END_OPS"` range except the error kinds, the word
operators (`where`, `isa`, `in`) and the dot-related kinds which the
tokenizer handles specially.
"""
function _nondot_symbolic_operator_kinds()
    excluded = [
        K"ErrorInvalidOperator"
        K"Error**"
        K"..."
        K"."
        K"where"
        K"isa"
        K"in"
        K".'"
    ]
    op_codes = reinterpret(UInt16, K"BEGIN_OPS"):reinterpret(UInt16, K"END_OPS")
    return [k for k in reinterpret.(Kind, op_codes) if !(k in excluded)]
end
82
+
83
"""
    _char_in_set_expr(varname, firstchars)

Build a boolean `Expr` testing whether the `UInt32` code point bound to
the symbol `varname` belongs to the character set `firstchars`.
Duplicates are ignored and maximal runs of consecutive code points are
collapsed into a single range comparison, so the generated expression
stays compact. Returns the literal `false` when `firstchars` is empty
(a char can never be a member of the empty set).
"""
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    # Guard: `foldr` below would throw on an empty term list.
    isempty(codes) && return :(false)
    # Typed accumulator — avoids an untyped `Vector{Any}`.
    terms = Expr[]
    i = 1
    while i <= length(codes)
        # Extend `j` over a maximal run of consecutive code points.
        j = i
        while j < length(codes) && codes[j+1] == codes[j]+1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j+1
    end
    # Combine the per-run tests with short-circuiting `||`.
    return foldr((t1,t2)->:($t1 || $t2), terms)
end
101
+
102
# True if `c` can be the first character of an operator token. The set of
# valid first characters is baked in at definition time (via `@eval` +
# `_char_in_set_expr`) from the string forms of all non-dot symbolic
# operator kinds plus the unicode aliases.
@eval function is_operator_start_char(c)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    u = UInt32(c)
    return $(_char_in_set_expr(:u,
        append!(first.(string.(_nondot_symbolic_operator_kinds())),
                first.(_ops_with_unicode_aliases))))
end
111
+
112
# Checks whether a Char is an operator which can be prefixed with a dot `.`.
# `?`, `$`, `:` and `'` may start operators but are never dottable.
function is_dottable_operator_start_char(c)
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end
116
+
117
# True if `c` may appear as a suffix character of an operator. Accepts
# combining marks (Unicode categories Mn/Mc/Me) plus an explicit list of
# primes, superscripts and subscripts; the membership test for that list
# is baked in at definition time via `@eval` + `_char_in_set_expr`.
@eval function isopsuffix(c::Char)
    c == EOF_CHAR && return false
    Base.isvalid(c) || return false
    u = UInt32(c)
    # Fast reject: below U+00A1 or outside the valid Unicode range there
    # are no suffix characters.
    if (u < 0xa1 || u > 0x10ffff)
        return false
    end
    cat = Base.Unicode.category_code(u)
    # Combining marks (nonspacing, spacing, enclosing).
    if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
        cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
        cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
        return true
    end
    # Additional allowed cases
    return $(_char_in_set_expr(:u,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end
134
+
135
"""
    optakessuffix(k)

Return `true` if tokens of operator kind `k` may carry suffix
characters (see `isopsuffix`). Only symbolic operator kinds take
suffixes, and the listed exception kinds never do.
"""
function optakessuffix(k)
    # Only operator kinds can take suffixes at all.
    (K"BEGIN_OPS" <= k <= K"END_OPS") || return false
    # Assignment operators never take suffixes.
    (K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS") && return false
    # Neither do kinds in the K"¬"..K"∜" range.
    (K"¬" <= k <= K"∜") && return false
    # Remaining individually excluded kinds.
    return !(k == K"..." || k == K"?"  || k == K"<:" || k == K">:" ||
             k == K"&&"  || k == K"||" || k == K"in" || k == K"isa" ||
             k == K"≔"   || k == K"⩴"  || k == K":"  || k == K".." ||
             k == K"$"   || k == K"::" || k == K"where" || k == K"." ||
             k == K"!"   || k == K".'" || k == K"->")
end
161
+
162
# Map from the single non-ASCII character spelling of an operator to its
# Kind, plus the normalized unicode aliases. Used by the lexer to
# recognize one-character unicode operator tokens.
const _unicode_ops = let
    ks = _nondot_symbolic_operator_kinds()
    ss = string.(ks)

    # Only kinds whose spelling is exactly one non-ASCII character.
    ops = Dict{Char, Kind}([first(s)=>k for (k,s) in zip(ks,ss)
                            if length(s) == 1 && !isascii(s[1])])
    # Aliases such as \minus map to the same kinds as their canonical chars.
    for ck in _ops_with_unicode_aliases
        push!(ops, ck)
    end
    ops
end
11
173
12
174
#-------------------------------------------------------------------------------
13
175
# Tokens
@@ -370,7 +532,7 @@ function _next_token(l::Lexer, c)
370
532
return lex_identifier(l, c)
371
533
elseif isdigit(c)
372
534
return lex_digit(l, K"Integer")
373
- elseif (k = get (UNICODE_OPS , c, K " error" )) != K " error"
535
+ elseif (k = get(_unicode_ops , c, K"error")) != K"error"
374
536
return emit(l, k)
375
537
else
376
538
emit_error(l, K"ErrorUnknownCharacter")
@@ -416,6 +578,7 @@ function lex_string_chunk(l)
416
578
!(pc == EOF_CHAR || is_operator_start_char(pc) || is_never_id_char(pc))
417
579
# Only allow certain characters after interpolated vars
418
580
# https://github.com/JuliaLang/julia/pull/25234
581
+ readchar(l)
419
582
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
420
583
end
421
584
if pc == EOF_CHAR
@@ -771,7 +934,7 @@ function lex_digit(l::Lexer, kind)
771
934
# If we enter the function with kind == K"Float" then a '.' has been parsed.
772
935
readchar(l)
773
936
return emit_error(l, K"ErrorInvalidNumericConstant")
774
- elseif is_operator_start_char (ppc) && ppc != = ' : '
937
+ elseif is_dottable_operator_start_char (ppc)
775
938
readchar(l)
776
939
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
777
940
end
@@ -787,14 +950,14 @@ function lex_digit(l::Lexer, kind)
787
950
accept(l, "+-−")
788
951
if accept_batch(l, isdigit)
789
952
pc,ppc = dpeekchar(l)
790
- if pc === ' .' && ! dotop2 (ppc)
953
+ if pc === '.' && !is_dottable_operator_start_char (ppc)
791
954
readchar(l)
792
955
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
793
956
end
794
957
else
795
958
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
796
959
end
797
- elseif pc == ' .' && ppc != ' .' && ! is_operator_start_char (ppc)
960
+ elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char (ppc)
798
961
readchar(l)
799
962
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
800
963
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
@@ -808,7 +971,7 @@ function lex_digit(l::Lexer, kind)
808
971
accept(l, "+-−")
809
972
if accept_batch(l, isdigit)
810
973
pc,ppc = dpeekchar(l)
811
- if pc === ' .' && ! dotop2 (ppc)
974
+ if pc === '.' && !is_dottable_operator_start_char (ppc)
812
975
accept(l, '.')
813
976
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
814
977
end
@@ -948,7 +1111,7 @@ function lex_dot(l::Lexer)
948
1111
if accept(l, '.')
949
1112
return emit(l, K"...")
950
1113
else
951
- if dotop2 (peekchar (l))
1114
+ if is_dottable_operator_start_char (peekchar(l))
952
1115
readchar(l)
953
1116
return emit_error(l, K"ErrorInvalidOperator")
954
1117
else
@@ -959,10 +1122,7 @@ function lex_dot(l::Lexer)
959
1122
return lex_digit(l, K"Float")
960
1123
else
961
1124
pc, dpc = dpeekchar(l)
962
- if dotop1 (pc)
963
- l. dotop = true
964
- return _next_token (l, readchar (l))
965
- elseif pc == ' +'
1125
+ if pc == '+'
966
1126
l.dotop = true
967
1127
readchar(l)
968
1128
return lex_plus(l)
@@ -1040,6 +1200,9 @@ function lex_dot(l::Lexer)
1040
1200
l.dotop = true
1041
1201
readchar(l)
1042
1202
return lex_equal(l)
1203
+ elseif is_dottable_operator_start_char(pc)
1204
+ l.dotop = true
1205
+ return _next_token(l, readchar(l))
1043
1206
end
1044
1207
return emit(l, K".")
1045
1208
end
0 commit comments