Skip to content

Commit d79b9e9

Browse files
authored
Merge pull request #272 from JuliaLang/c42f/operator-predicate-rewrite
Cleanup and fix operator predicates
2 parents 8f8ba0d + cddc197 commit d79b9e9

File tree

5 files changed

+279
-1006
lines changed

5 files changed

+279
-1006
lines changed

src/kinds.jl

Lines changed: 47 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,25 @@ primitive type Kind 16 end
934934
# the K_str macro to self-name these kinds with their literal representation,
935935
# rather than needing to invent a new name for each.
936936

937-
let kind_int_type = :UInt16,
937+
let kind_int_type = :UInt16
938+
# Preprocess _kind_names to conflate category markers with the first/last
939+
# in the category.
940+
kindstr_to_int = Dict{String,UInt16}()
941+
i = 1
942+
while i <= length(_kind_names)
943+
kn = _kind_names[i]
944+
kind_int = i-1
945+
if startswith(kn, "BEGIN_")
946+
deleteat!(_kind_names, i)
947+
elseif startswith(kn, "END_")
948+
kind_int = i-2
949+
deleteat!(_kind_names, i)
950+
else
951+
i += 1
952+
end
953+
push!(kindstr_to_int, kn=>kind_int)
954+
end
955+
938956
max_kind_int = length(_kind_names)-1
939957

940958
@eval begin
@@ -945,9 +963,9 @@ let kind_int_type = :UInt16,
945963
return Base.bitcast(Kind, convert($kind_int_type, x))
946964
end
947965

948-
Base.convert(::Type{String}, k::Kind) = _kind_names[1 + Base.bitcast($kind_int_type, k)]
966+
Base.convert(::Type{String}, k::Kind) = _kind_names[1 + reinterpret($kind_int_type, k)]
949967

950-
let kindstr_to_int = Dict(s=>i-1 for (i,s) in enumerate(_kind_names))
968+
let kindstr_to_int=$kindstr_to_int
951969
function Base.convert(::Type{Kind}, s::AbstractString)
952970
i = get(kindstr_to_int, s) do
953971
error("unknown Kind name $(repr(s))")
@@ -1078,12 +1096,12 @@ const _token_error_descriptions = Dict{Kind, String}(
10781096

10791097
#-------------------------------------------------------------------------------
10801098
# Predicates
1081-
is_contextual_keyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" < k < K"END_CONTEXTUAL_KEYWORDS"
1082-
is_error(k::Kind) = K"BEGIN_ERRORS" < k < K"END_ERRORS" || k == K"ErrorInvalidOperator" || k == K"Error**"
1083-
is_keyword(k::Kind) = K"BEGIN_KEYWORDS" < k < K"END_KEYWORDS"
1084-
is_block_continuation_keyword(k::Kind) = K"BEGIN_BLOCK_CONTINUATION_KEYWORDS" < k < K"END_BLOCK_CONTINUATION_KEYWORDS"
1085-
is_literal(k::Kind) = K"BEGIN_LITERAL" < k < K"END_LITERAL"
1086-
is_operator(k::Kind) = K"BEGIN_OPS" < k < K"END_OPS"
1099+
is_contextual_keyword(k::Kind) = K"BEGIN_CONTEXTUAL_KEYWORDS" <= k <= K"END_CONTEXTUAL_KEYWORDS"
1100+
is_error(k::Kind) = K"BEGIN_ERRORS" <= k <= K"END_ERRORS" || k == K"ErrorInvalidOperator" || k == K"Error**"
1101+
is_keyword(k::Kind) = K"BEGIN_KEYWORDS" <= k <= K"END_KEYWORDS"
1102+
is_block_continuation_keyword(k::Kind) = K"BEGIN_BLOCK_CONTINUATION_KEYWORDS" <= k <= K"END_BLOCK_CONTINUATION_KEYWORDS"
1103+
is_literal(k::Kind) = K"BEGIN_LITERAL" <= k <= K"END_LITERAL"
1104+
is_operator(k::Kind) = K"BEGIN_OPS" <= k <= K"END_OPS"
10871105
is_word_operator(k::Kind) = (k == K"in" || k == K"isa" || k == K"where")
10881106

10891107
is_contextual_keyword(k) = is_contextual_keyword(kind(k))
@@ -1097,28 +1115,28 @@ is_word_operator(k) = is_word_operator(kind(k))
10971115
# Predicates for operator precedence
10981116
# FIXME: Review how precedence depends on dottedness, eg
10991117
# https://github.com/JuliaLang/julia/pull/36725
1100-
is_prec_assignment(x) = K"BEGIN_ASSIGNMENTS" < kind(x) < K"END_ASSIGNMENTS"
1101-
is_prec_pair(x) = K"BEGIN_PAIRARROW" < kind(x) < K"END_PAIRARROW"
1102-
is_prec_conditional(x) = K"BEGIN_CONDITIONAL" < kind(x) < K"END_CONDITIONAL"
1103-
is_prec_arrow(x) = K"BEGIN_ARROW" < kind(x) < K"END_ARROW"
1104-
is_prec_lazy_or(x) = K"BEGIN_LAZYOR" < kind(x) < K"END_LAZYOR"
1105-
is_prec_lazy_and(x) = K"BEGIN_LAZYAND" < kind(x) < K"END_LAZYAND"
1106-
is_prec_comparison(x) = K"BEGIN_COMPARISON" < kind(x) < K"END_COMPARISON"
1107-
is_prec_pipe(x) = K"BEGIN_PIPE" < kind(x) < K"END_PIPE"
1108-
is_prec_colon(x) = K"BEGIN_COLON" < kind(x) < K"END_COLON"
1109-
is_prec_plus(x) = K"BEGIN_PLUS" < kind(x) < K"END_PLUS"
1110-
is_prec_bitshift(x) = K"BEGIN_BITSHIFTS" < kind(x) < K"END_BITSHIFTS"
1111-
is_prec_times(x) = K"BEGIN_TIMES" < kind(x) < K"END_TIMES"
1112-
is_prec_rational(x) = K"BEGIN_RATIONAL" < kind(x) < K"END_RATIONAL"
1113-
is_prec_power(x) = K"BEGIN_POWER" < kind(x) < K"END_POWER"
1114-
is_prec_decl(x) = K"BEGIN_DECL" < kind(x) < K"END_DECL"
1115-
is_prec_where(x) = K"BEGIN_WHERE" < kind(x) < K"END_WHERE"
1116-
is_prec_dot(x) = K"BEGIN_DOT" < kind(x) < K"END_DOT"
1117-
is_prec_unicode_ops(x) = K"BEGIN_UNICODE_OPS" < kind(x) < K"END_UNICODE_OPS"
1118+
is_prec_assignment(x) = K"BEGIN_ASSIGNMENTS" <= kind(x) <= K"END_ASSIGNMENTS"
1119+
is_prec_pair(x) = K"BEGIN_PAIRARROW" <= kind(x) <= K"END_PAIRARROW"
1120+
is_prec_conditional(x) = K"BEGIN_CONDITIONAL" <= kind(x) <= K"END_CONDITIONAL"
1121+
is_prec_arrow(x) = K"BEGIN_ARROW" <= kind(x) <= K"END_ARROW"
1122+
is_prec_lazy_or(x) = K"BEGIN_LAZYOR" <= kind(x) <= K"END_LAZYOR"
1123+
is_prec_lazy_and(x) = K"BEGIN_LAZYAND" <= kind(x) <= K"END_LAZYAND"
1124+
is_prec_comparison(x) = K"BEGIN_COMPARISON" <= kind(x) <= K"END_COMPARISON"
1125+
is_prec_pipe(x) = K"BEGIN_PIPE" <= kind(x) <= K"END_PIPE"
1126+
is_prec_colon(x) = K"BEGIN_COLON" <= kind(x) <= K"END_COLON"
1127+
is_prec_plus(x) = K"BEGIN_PLUS" <= kind(x) <= K"END_PLUS"
1128+
is_prec_bitshift(x) = K"BEGIN_BITSHIFTS" <= kind(x) <= K"END_BITSHIFTS"
1129+
is_prec_times(x) = K"BEGIN_TIMES" <= kind(x) <= K"END_TIMES"
1130+
is_prec_rational(x) = K"BEGIN_RATIONAL" <= kind(x) <= K"END_RATIONAL"
1131+
is_prec_power(x) = K"BEGIN_POWER" <= kind(x) <= K"END_POWER"
1132+
is_prec_decl(x) = K"BEGIN_DECL" <= kind(x) <= K"END_DECL"
1133+
is_prec_where(x) = K"BEGIN_WHERE" <= kind(x) <= K"END_WHERE"
1134+
is_prec_dot(x) = K"BEGIN_DOT" <= kind(x) <= K"END_DOT"
1135+
is_prec_unicode_ops(x) = K"BEGIN_UNICODE_OPS" <= kind(x) <= K"END_UNICODE_OPS"
11181136
is_prec_pipe_lt(x) = kind(x) == K"<|"
11191137
is_prec_pipe_gt(x) = kind(x) == K"|>"
1120-
is_syntax_kind(x) = K"BEGIN_SYNTAX_KINDS" < kind(x) < K"END_SYNTAX_KINDS"
1121-
is_macro_name(x) = K"BEGIN_MACRO_NAMES" < kind(x) < K"END_MACRO_NAMES"
1138+
is_syntax_kind(x) = K"BEGIN_SYNTAX_KINDS"<= kind(x) <= K"END_SYNTAX_KINDS"
1139+
is_macro_name(x) = K"BEGIN_MACRO_NAMES" <= kind(x) <= K"END_MACRO_NAMES"
11221140

11231141
function is_number(x)
11241142
kind(x) in (K"Integer", K"BinInt", K"HexInt", K"OctInt", K"Float", K"Float32")

src/literal_parsing.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -329,11 +329,11 @@ end
329329

330330
# static wrapper around user callback function
331331
function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
332-
(codepoint == 0x025B ? 0x03B5 :
333-
codepoint == 0x00B5 ? 0x03BC :
334-
codepoint == 0x00B7 ? 0x22C5 :
335-
codepoint == 0x0387 ? 0x22C5 :
336-
codepoint == 0x2212 ? 0x002D :
332+
(codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε'
333+
codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ'
334+
codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅'
335+
codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅'
336+
codepoint == 0x2212 ? 0x002D : # '−' => '-'
337337
codepoint)
338338
end
339339

src/tokenize.jl

Lines changed: 175 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,174 @@ module Tokenize
22

33
export tokenize, untokenize, Tokens
44

5-
using ..JuliaSyntax: Kind, @K_str
5+
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
66

77
import ..JuliaSyntax: kind,
88
is_literal, is_error, is_contextual_keyword, is_word_operator
99

10-
include("tokenize_utils.jl")
10+
#-------------------------------------------------------------------------------
11+
# Character-based predicates for tokenization
12+
import Base.Unicode
13+
14+
# Sentinel `Char` which `readchar(io)` returns once the stream is at end-of-file.
const EOF_CHAR = typemax(Char)
15+
16+
# Return `true` when `c` is neither `EOF_CHAR` nor an invalid character and
# `Base.is_id_char(c)` holds.
function is_identifier_char(c::Char)
    if c == EOF_CHAR || !Base.isvalid(c)
        return false
    end
    return Base.is_id_char(c)
end
21+
22+
function is_identifier_start_char(c::Char)
23+
c == EOF_CHAR && return false
24+
Base.isvalid(c) || return false
25+
return Base.is_id_start_char(c)
26+
end
27+
28+
# Characters which are never allowed to be part of a valid non-operator
# identifier. Invalid characters are always reported as such.
function is_never_id_char(ch::Char)
    Base.isvalid(ch) || return true
    cat = Unicode.category_code(ch)
    c = UInt32(ch)

    # spaces and control characters
    if Unicode.UTF8PROC_CATEGORY_ZS <= cat <= Unicode.UTF8PROC_CATEGORY_CS
        return true
    end
    # ASCII and Latin1 non-connector punctuation
    if c < 0xff && Unicode.UTF8PROC_CATEGORY_PD <= cat <= Unicode.UTF8PROC_CATEGORY_PO
        return true
    end
    c == UInt32('`') && return true

    # mathematical brackets
    0x27e6 <= c <= 0x27ef && return true
    # angle, corner, and lenticular brackets
    0x3008 <= c <= 0x3011 && return true
    # tortoise shell, square, and more lenticular brackets
    0x3014 <= c <= 0x301b && return true
    # fullwidth parens
    (c == 0xff08 || c == 0xff09) && return true
    # fullwidth square brackets
    return c == 0xff3b || c == 0xff3d
end
56+
# Read the next `Char` from `io`, or return `EOF_CHAR` when `io` is at end-of-file.
function readchar(io::IO)
    if eof(io)
        return EOF_CHAR
    end
    return read(io, Char)
end
57+
58+
# Unicode operator characters which the tokenizer normalizes into the kind of
# an equivalent operator. See also normalize_identifier()
const _ops_with_unicode_aliases = Pair{Char,Kind}[
    # \minus '−' is normalized into K"-"
    '−' => K"-",
    # Lookalikes which are normalized into K"⋅"
    # https://github.com/JuliaLang/julia/pull/25157
    '\u00b7' => K"⋅",  # '·' Middle Dot
    '\u0387' => K"⋅",  # '·' Greek Ano Teleia
]
68+
69+
# Return every kind from K"BEGIN_OPS" through K"END_OPS" apart from a fixed
# exclusion list: the operator error kinds, `...`, `.`, `.'` and the word
# operators `where`, `isa` and `in`.
function _nondot_symbolic_operator_kinds()
    first_op = reinterpret(UInt16, K"BEGIN_OPS")
    last_op = reinterpret(UInt16, K"END_OPS")
    all_ops = reinterpret.(Kind, first_op:last_op)
    excluded = [
        K"ErrorInvalidOperator",
        K"Error**",
        K"...",
        K".",
        K"where",
        K"isa",
        K"in",
        K".'",
    ]
    return setdiff(all_ops, excluded)
end
82+
83+
# Build an expression which tests whether the value bound to `varname` is the
# code point of one of the characters in `firstchars`.
#
# Arguments:
# * `varname`    - symbol (or expression) naming the `UInt32` code point to test
# * `firstchars` - collection of characters; duplicates are ignored
#
# Returns an `Expr` made of `varname == code` and `lo <= varname <= hi` terms
# joined with `||`, where each run of consecutive code points is collapsed into
# a single range check. For an empty `firstchars` the literal `false` is
# returned, so the result can always be interpolated into generated code.
function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    # `foldr` without `init` throws on an empty collection, and membership in
    # the empty set is trivially false.
    isempty(codes) && return false
    terms = Expr[]
    i = 1
    while i <= length(codes)
        # Extend `j` to the end of the run of consecutive code points at `i`
        j = i
        while j < length(codes) && codes[j+1] == codes[j] + 1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j + 1
    end
    return foldr((t1, t2) -> :($t1 || $t2), terms)
end
101+
102+
# Return `true` when `c` is the first character of the string form of one of
# the kinds from `_nondot_symbolic_operator_kinds()`, or one of the characters
# listed in `_ops_with_unicode_aliases`. `EOF_CHAR` and invalid characters give
# `false`. The membership test is generated with `_char_in_set_expr` when this
# definition is evaluated.
@eval function is_operator_start_char(c)
    (c != EOF_CHAR && Base.isvalid(c)) || return false
    u = UInt32(c)
    return $(_char_in_set_expr(
        :u,
        append!(
            first.(string.(_nondot_symbolic_operator_kinds())),
            first.(_ops_with_unicode_aliases),
        ),
    ))
end
111+
112+
# Return `true` when `c` starts an operator which can be prefixed with a dot
# `.`: any operator start character other than `?`, `$`, `:` and `'`.
function is_dottable_operator_start_char(c)
    if c == '?' || c == '$' || c == ':' || c == '\''
        return false
    end
    return is_operator_start_char(c)
end
116+
117+
# Return `true` when `c` is a valid character with code point in
# 0xa1:0x10ffff which either has Unicode category MN, MC or ME, or is one of
# the additional characters listed explicitly below. `EOF_CHAR` and invalid
# characters give `false`.
@eval function isopsuffix(c::Char)
    (c != EOF_CHAR && Base.isvalid(c)) || return false
    u = UInt32(c)
    0xa1 <= u <= 0x10ffff || return false
    cat = Base.Unicode.category_code(u)
    is_mark = cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
              cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
              cat == Base.Unicode.UTF8PROC_CATEGORY_ME
    is_mark && return true
    # Additional allowed cases
    return $(_char_in_set_expr(
        :u,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ"),
    ))
end
134+
135+
# Return `true` for kinds from K"BEGIN_OPS" through K"END_OPS" which are not
# excluded below.
# NOTE(review): presumably these are the operators which may be followed by
# the suffix characters accepted by `isopsuffix` - confirm against the lexer.
function optakessuffix(k)
    K"BEGIN_OPS" <= k <= K"END_OPS" || return false
    # Whole ranges of kinds which are excluded
    if K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" || K"¬" <= k <= K"∜"
        return false
    end
    # Individual kinds which are excluded
    excluded = (
        K"...", K"?", K"<:", K">:", K"&&", K"||", K"in", K"isa", K"≔", K"⩴",
        K":", K"..", K"$", K"::", K"where", K".", K"!", K".'", K"->",
    )
    return !(k in excluded)
end
161+
162+
# Map from non-ASCII operator characters to their kind: every kind from
# `_nondot_symbolic_operator_kinds()` whose string form is a single non-ASCII
# character, followed by the entries of `_ops_with_unicode_aliases`.
const _unicode_ops = let
    ops = Dict{Char, Kind}()
    for k in _nondot_symbolic_operator_kinds()
        s = string(k)
        if length(s) == 1 && !isascii(s[1])
            ops[first(s)] = k
        end
    end
    for (c, k) in _ops_with_unicode_aliases
        ops[c] = k
    end
    ops
end
11173

12174
#-------------------------------------------------------------------------------
13175
# Tokens
@@ -370,7 +532,7 @@ function _next_token(l::Lexer, c)
370532
return lex_identifier(l, c)
371533
elseif isdigit(c)
372534
return lex_digit(l, K"Integer")
373-
elseif (k = get(UNICODE_OPS, c, K"error")) != K"error"
535+
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
374536
return emit(l, k)
375537
else
376538
emit_error(l, K"ErrorUnknownCharacter")
@@ -416,6 +578,7 @@ function lex_string_chunk(l)
416578
!(pc == EOF_CHAR || is_operator_start_char(pc) || is_never_id_char(pc))
417579
# Only allow certain characters after interpolated vars
418580
# https://github.com/JuliaLang/julia/pull/25234
581+
readchar(l)
419582
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
420583
end
421584
if pc == EOF_CHAR
@@ -771,7 +934,7 @@ function lex_digit(l::Lexer, kind)
771934
# If we enter the function with kind == K"Float" then a '.' has been parsed.
772935
readchar(l)
773936
return emit_error(l, K"ErrorInvalidNumericConstant")
774-
elseif is_operator_start_char(ppc) && ppc !== ':'
937+
elseif is_dottable_operator_start_char(ppc)
775938
readchar(l)
776939
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
777940
end
@@ -787,14 +950,14 @@ function lex_digit(l::Lexer, kind)
787950
accept(l, "+-−")
788951
if accept_batch(l, isdigit)
789952
pc,ppc = dpeekchar(l)
790-
if pc === '.' && !dotop2(ppc)
953+
if pc === '.' && !is_dottable_operator_start_char(ppc)
791954
readchar(l)
792955
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
793956
end
794957
else
795958
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
796959
end
797-
elseif pc == '.' && ppc != '.' && !is_operator_start_char(ppc)
960+
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
798961
readchar(l)
799962
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
800963
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
@@ -808,7 +971,7 @@ function lex_digit(l::Lexer, kind)
808971
accept(l, "+-−")
809972
if accept_batch(l, isdigit)
810973
pc,ppc = dpeekchar(l)
811-
if pc === '.' && !dotop2(ppc)
974+
if pc === '.' && !is_dottable_operator_start_char(ppc)
812975
accept(l, '.')
813976
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
814977
end
@@ -948,7 +1111,7 @@ function lex_dot(l::Lexer)
9481111
if accept(l, '.')
9491112
return emit(l, K"...")
9501113
else
951-
if dotop2(peekchar(l))
1114+
if is_dottable_operator_start_char(peekchar(l))
9521115
readchar(l)
9531116
return emit_error(l, K"ErrorInvalidOperator")
9541117
else
@@ -959,10 +1122,7 @@ function lex_dot(l::Lexer)
9591122
return lex_digit(l, K"Float")
9601123
else
9611124
pc, dpc = dpeekchar(l)
962-
if dotop1(pc)
963-
l.dotop = true
964-
return _next_token(l, readchar(l))
965-
elseif pc =='+'
1125+
if pc == '+'
9661126
l.dotop = true
9671127
readchar(l)
9681128
return lex_plus(l)
@@ -1040,6 +1200,9 @@ function lex_dot(l::Lexer)
10401200
l.dotop = true
10411201
readchar(l)
10421202
return lex_equal(l)
1203+
elseif is_dottable_operator_start_char(pc)
1204+
l.dotop = true
1205+
return _next_token(l, readchar(l))
10431206
end
10441207
return emit(l, K".")
10451208
end

0 commit comments

Comments
 (0)