Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions script/gen_unicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
#
# Copyright (c) 2026 Lean FRO, LLC. All rights reserved.
# Released under Apache 2.0 license as described in the file LICENSE.
#
# Ported from https://github.com/unicode-rs/unicode-xid/blob/master/scripts/unicode.py
# Original copyright: 2011-2015 The Rust Project Developers (Apache 2.0 / MIT)
#
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - ReadMe.txt
#
# Since this should not require frequent updates, we store the output
# out-of-line and check the tables.lean file into git.

import fileinput, re, os, sys

preamble = """/-
Copyright (c) 2026 Lean FRO, LLC. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.

NOTE: The following code was generated by "script/unicode.py", do not edit directly
-/
module

prelude
public import Init.Data.Char.Basic
import Init.Data.Array.Basic
import Init.Notation

-- The XID tables contain ~800 entries each, requiring a deep let-chain.
set_option maxRecDepth 2048
"""

def fetch(f):
if not os.path.exists(os.path.basename(f)):
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" % f)

if not os.path.exists(os.path.basename(f)):
sys.stderr.write("cannot load %s" % f)
exit(1)

def group_cat(cat):
cat_out = []
letters = sorted(set(cat))
cur_start = letters.pop(0)
cur_end = cur_start
for letter in letters:
assert letter > cur_end, \
"cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
if letter == cur_end + 1:
cur_end = letter
else:
cat_out.append((cur_start, cur_end))
cur_start = cur_end = letter
cat_out.append((cur_start, cur_end))
return cat_out

def ungroup_cat(cat):
cat_out = []
for (lo, hi) in cat:
while lo <= hi:
cat_out.append(lo)
lo += 1
return cat_out

def load_properties(f, interestingprops):
fetch(f)
props = {}
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

for line in fileinput.input(os.path.basename(f)):
prop = None
d_lo = 0
d_hi = 0
m = re1.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(1)
prop = m.group(2)
else:
m = re2.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(2)
prop = m.group(3)
else:
continue
if interestingprops and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
if prop not in props:
props[prop] = []
props[prop].append((d_lo, d_hi))

# optimize if possible
for prop in props:
props[prop] = group_cat(ungroup_cat(props[prop]))

return props

def format_lean_Nat(c):
return "0x%x" % c

def emit_bsearch_range_table(f):
f.write("""
partial def bsearchRangeTable (c : Nat) (r : Array (Nat × Nat)) : Bool :=
loop 0 r.size
where
loop (lo hi : Nat) : Bool :=
if lo >= hi then
false
else
let mid := lo + (hi - lo) / 2
let (rlo, rhi) := r[mid]!
if rlo > c then
loop lo mid
else if rhi < c then
loop (mid + 1) hi
else
true
""")

def emit_table(f, name, t_data,
pfun=lambda x: "(0x%x, 0x%x)" % (x[0], x[1])):
f.write("def %s : Array (Nat × Nat) :=\n" % name)
f.write(" let table := Array.mkEmpty %d\n" % len(t_data))
for dat in t_data:
f.write(" let table := table.push %s\n" % pfun(dat))
f.write(" table\n\n")

def emit_property_module(f, mod, tbl, emit):
# Convert module name to Lean namespace style (PascalCase)
f.write("namespace Char\n\n")
for cat in sorted(emit):
emit_table(f, "%sTable" % cat, tbl[cat])
f.write("public def is%s (c : Char) : Bool :=\n" % cat)
f.write(" bsearchRangeTable c.val.toNat %sTable\n\n" % cat)
f.write("end Char\n\n")

if __name__ == "__main__":
r = "tables.lean"
if os.path.exists(r):
os.remove(r)
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)

# download and parse all the data
fetch("ReadMe.txt")
with open("ReadMe.txt") as readme:
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/-- The version of [Unicode](http://www.unicode.org/)
that this version of the XID tables is based on. -/
def unicodeVersion : Nat × Nat × Nat := (%s, %s, %s)
""" % unicode_version)

emit_bsearch_range_table(rf)

want_derived = ["XID_Start", "XID_Continue"]
derived = load_properties("DerivedCoreProperties.txt", want_derived)
emit_property_module(rf, "derived_property", derived, want_derived)
1 change: 1 addition & 0 deletions src/Init/Data/Char.lean
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ public import Init.Data.Char.Basic
public import Init.Data.Char.Lemmas
public import Init.Data.Char.Order
public import Init.Data.Char.Ordinal
public import Init.Data.Char.Xid
Loading
Loading