Skip to content

Commit 4fce392

Browse files
committed
feat: XID identifier support
1 parent 3961b69 commit 4fce392

File tree

6 files changed

+1853
-3
lines changed

6 files changed

+1853
-3
lines changed

script/gen_unicode.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Copyright (c) 2026 Lean FRO, LLC. All rights reserved.
4+
# Released under Apache 2.0 license as described in the file LICENSE.
5+
#
6+
# Ported from https://github.com/unicode-rs/unicode-xid/blob/master/scripts/unicode.py
7+
# Original copyright: 2011-2015 The Rust Project Developers (Apache 2.0 / MIT)
8+
#
9+
# This script uses the following Unicode tables:
10+
# - DerivedCoreProperties.txt
11+
# - ReadMe.txt
12+
#
13+
# Since this should not require frequent updates, we store the output
14+
# out-of-line and check the tables.lean file into git.
15+
16+
import fileinput, re, os, sys
17+
18+
preamble = """/-
19+
Copyright (c) 2026 Lean FRO, LLC. All rights reserved.
20+
Released under Apache 2.0 license as described in the file LICENSE.
21+
22+
NOTE: The following code was generated by "script/unicode.py", do not edit directly
23+
-/
24+
module
25+
26+
prelude
27+
public import Init.Data.Char.Basic
28+
import Init.Data.Array.Basic
29+
import Init.Notation
30+
31+
-- The XID tables contain ~800 entries each, requiring a deep let-chain.
32+
set_option maxRecDepth 2048
33+
"""
34+
35+
def fetch(f):
36+
if not os.path.exists(os.path.basename(f)):
37+
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" % f)
38+
39+
if not os.path.exists(os.path.basename(f)):
40+
sys.stderr.write("cannot load %s" % f)
41+
exit(1)
42+
43+
def group_cat(cat):
44+
cat_out = []
45+
letters = sorted(set(cat))
46+
cur_start = letters.pop(0)
47+
cur_end = cur_start
48+
for letter in letters:
49+
assert letter > cur_end, \
50+
"cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
51+
if letter == cur_end + 1:
52+
cur_end = letter
53+
else:
54+
cat_out.append((cur_start, cur_end))
55+
cur_start = cur_end = letter
56+
cat_out.append((cur_start, cur_end))
57+
return cat_out
58+
59+
def ungroup_cat(cat):
60+
cat_out = []
61+
for (lo, hi) in cat:
62+
while lo <= hi:
63+
cat_out.append(lo)
64+
lo += 1
65+
return cat_out
66+
67+
def load_properties(f, interestingprops):
68+
fetch(f)
69+
props = {}
70+
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
71+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
72+
73+
for line in fileinput.input(os.path.basename(f)):
74+
prop = None
75+
d_lo = 0
76+
d_hi = 0
77+
m = re1.match(line)
78+
if m:
79+
d_lo = m.group(1)
80+
d_hi = m.group(1)
81+
prop = m.group(2)
82+
else:
83+
m = re2.match(line)
84+
if m:
85+
d_lo = m.group(1)
86+
d_hi = m.group(2)
87+
prop = m.group(3)
88+
else:
89+
continue
90+
if interestingprops and prop not in interestingprops:
91+
continue
92+
d_lo = int(d_lo, 16)
93+
d_hi = int(d_hi, 16)
94+
if prop not in props:
95+
props[prop] = []
96+
props[prop].append((d_lo, d_hi))
97+
98+
# optimize if possible
99+
for prop in props:
100+
props[prop] = group_cat(ungroup_cat(props[prop]))
101+
102+
return props
103+
104+
def format_lean_Nat(c):
105+
return "0x%x" % c
106+
107+
def emit_bsearch_range_table(f):
108+
f.write("""
109+
partial def bsearchRangeTable (c : Nat) (r : Array (Nat × Nat)) : Bool :=
110+
loop 0 r.size
111+
where
112+
loop (lo hi : Nat) : Bool :=
113+
if lo >= hi then
114+
false
115+
else
116+
let mid := lo + (hi - lo) / 2
117+
let (rlo, rhi) := r[mid]!
118+
if rlo > c then
119+
loop lo mid
120+
else if rhi < c then
121+
loop (mid + 1) hi
122+
else
123+
true
124+
""")
125+
126+
def emit_table(f, name, t_data,
127+
pfun=lambda x: "(0x%x, 0x%x)" % (x[0], x[1])):
128+
f.write("def %s : Array (Nat × Nat) :=\n" % name)
129+
f.write(" let table := Array.mkEmpty %d\n" % len(t_data))
130+
for dat in t_data:
131+
f.write(" let table := table.push %s\n" % pfun(dat))
132+
f.write(" table\n\n")
133+
134+
def emit_property_module(f, mod, tbl, emit):
135+
# Convert module name to Lean namespace style (PascalCase)
136+
f.write("namespace Char\n\n")
137+
for cat in sorted(emit):
138+
emit_table(f, "%sTable" % cat, tbl[cat])
139+
f.write("public def is%s (c : Char) : Bool :=\n" % cat)
140+
f.write(" bsearchRangeTable c.val.toNat %sTable\n\n" % cat)
141+
f.write("end Char\n\n")
142+
143+
if __name__ == "__main__":
144+
r = "tables.lean"
145+
if os.path.exists(r):
146+
os.remove(r)
147+
with open(r, "w") as rf:
148+
# write the file's preamble
149+
rf.write(preamble)
150+
151+
# download and parse all the data
152+
fetch("ReadMe.txt")
153+
with open("ReadMe.txt") as readme:
154+
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
155+
unicode_version = re.search(pattern, readme.read()).groups()
156+
rf.write("""
157+
/-- The version of [Unicode](http://www.unicode.org/)
158+
that this version of the XID tables is based on. -/
159+
def unicodeVersion : Nat × Nat × Nat := (%s, %s, %s)
160+
""" % unicode_version)
161+
162+
emit_bsearch_range_table(rf)
163+
164+
want_derived = ["XID_Start", "XID_Continue"]
165+
derived = load_properties("DerivedCoreProperties.txt", want_derived)
166+
emit_property_module(rf, "derived_property", derived, want_derived)

src/Init/Data/Char.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ public import Init.Data.Char.Basic
1010
public import Init.Data.Char.Lemmas
1111
public import Init.Data.Char.Order
1212
public import Init.Data.Char.Ordinal
13+
public import Init.Data.Char.Xid

0 commit comments

Comments
 (0)