Skip to content

Commit 32a8d01

Browse files
sayrer and claude committed
Add tld_gen_phf.py to automate tlds.rs generation from tld_lib.yml.
The PHF hash set in tlds.rs was previously maintained by hand. This adds a generator script (tld_gen_phf.py) and a wrapper (update_tlds.sh) to produce it from the canonical tld_lib.yml source. Also adds 7 punycode TLDs that were missing from the hand-maintained file and fixes sorting of unicode TLDs containing non-ASCII Latin characters. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f76666b commit 32a8d01

File tree

4 files changed

+265
-2
lines changed

4 files changed

+265
-2
lines changed

rust/conformance/BUILD.bazel

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,23 @@ genrule(
181181
tools = ["tests/insert_tld_grammar.sh"],
182182
visibility = ["//visibility:public"],
183183
)
184+
185+
# Hermetic PHF TLD hash set generator.
# Wraps tests/tld_gen_phf.py as a runnable Python 3 binary; PyYAML is its only
# declared dependency.
py_binary(
    name = "tld_gen_phf",
    srcs = ["tests/tld_gen_phf.py"],
    main = "tests/tld_gen_phf.py",
    python_version = "PY3",
    deps = [
        requirement("PyYAML"),
    ],
)
195+
196+
# Runs :tld_gen_phf with tests/tld_lib.yml as --input and writes the single
# declared output, tlds.rs, via --output.
genrule(
    name = "generate_tlds_rs",
    srcs = ["tests/tld_lib.yml"],
    outs = ["tlds.rs"],
    cmd = "$(location :tld_gen_phf) --input $(location tests/tld_lib.yml) --output $@",
    # The generator is listed under tools so $(location :tld_gen_phf) resolves in cmd.
    tools = [":tld_gen_phf"],
    visibility = ["//visibility:public"],
)
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Copyright 2025 Robert Sayre
2+
# Licensed under the Apache License, Version 2.0
3+
# http://www.apache.org/licenses/LICENSE-2.0
4+
5+
import argparse
6+
import sys
7+
8+
import yaml
9+
10+
11+
def _collect_tlds(tld_structure):
    """Flatten the parsed YAML document into the set of strings to emit.

    Every TLD contributes its lowercased spelling. A TLD whose IDNA
    (punycode) encoding differs from its spelling contributes that ASCII
    form as well.

    Args:
        tld_structure: The object returned by ``yaml.safe_load``; expected to
            be a mapping whose values are lists of TLD strings.

    Returns:
        A ``set`` of TLD strings.

    Raises:
        ValueError: If the document is not a mapping, a category is not a
            list, an entry is not a string, or an entry cannot be
            IDNA-encoded.
    """
    if not isinstance(tld_structure, dict):
        # An empty YAML document loads as None; report it instead of crashing.
        raise ValueError(
            "expected a mapping of TLD categories to lists, got "
            f"{type(tld_structure).__name__}"
        )

    tlds = set()
    for category, entries in tld_structure.items():
        if not isinstance(entries, list):
            raise ValueError(
                f"category {category!r} must be a list of TLDs, got "
                f"{type(entries).__name__}"
            )
        for element in entries:
            if not isinstance(element, str):
                # YAML 1.1 turns unquoted scalars such as `no` into booleans.
                raise ValueError(
                    f"non-string TLD {element!r} in category {category!r}; "
                    "quote it in the YAML file"
                )
            try:
                idna = element.encode("idna").decode("ascii")
            except UnicodeError as exc:
                raise ValueError(
                    f"cannot IDNA-encode TLD {element!r}: {exc}"
                ) from exc
            if idna != element:
                tlds.add(idna)
            tlds.add(element.lower())
    return tlds


def main():
    """Generate the Rust TLD hash-set source file from a YAML TLD list.

    Reads ``--input`` (YAML), collects the TLDs, and writes ``--output`` as
    FILE_HEADER, one quoted entry per TLD (ASCII, then Unicode, then
    punycode, each group sorted), and FILE_FOOTER. A one-line summary is
    printed to stderr. Exits with status 1 if the YAML cannot be parsed or
    does not have the expected shape; the output file is not touched in that
    case.
    """
    parser = argparse.ArgumentParser(
        description="Generate TLD PHF hash set for Rust"
    )
    parser.add_argument(
        "--input", required=True, help="Input YAML file containing TLDs"
    )
    parser.add_argument(
        "--output", required=True, help="Output file for generated tlds.rs"
    )
    args = parser.parse_args()

    with open(args.input, "r", encoding="utf-8") as stream:
        try:
            tld_structure = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc, file=sys.stderr)
            sys.exit(1)

    # Validate before opening the output so a bad input never truncates it.
    try:
        tlds = _collect_tlds(tld_structure)
    except ValueError as exc:
        print(f"{args.input}: {exc}", file=sys.stderr)
        sys.exit(1)

    # Separate into ASCII, unicode, and punycode for organized output
    ascii_tlds = sorted(
        t for t in tlds if t.isascii() and not t.startswith("xn--")
    )
    unicode_tlds = sorted(t for t in tlds if not t.isascii())
    punycode_tlds = sorted(t for t in tlds if t.startswith("xn--"))

    with open(args.output, "w", encoding="utf-8") as out:
        out.write(FILE_HEADER)

        # Write phf_set entries
        for tld in ascii_tlds:
            out.write(f' "{tld}",\n')
        for tld in unicode_tlds:
            out.write(f' "{tld}",\n')
        out.write(" // Punycode TLDs (IDN TLDs in ASCII-compatible encoding)\n")
        for tld in punycode_tlds:
            out.write(f' "{tld}",\n')

        out.write(FILE_FOOTER)

    print(
        f"Generated {args.output} with {len(tlds)} TLDs "
        f"({len(ascii_tlds)} ASCII, {len(unicode_tlds)} Unicode, "
        f"{len(punycode_tlds)} Punycode)",
        file=sys.stderr,
    )
63+
64+
65+
FILE_HEADER = """\
66+
// Copyright 2025 Robert Sayre
67+
// Licensed under the Apache License, Version 2.0
68+
// http://www.apache.org/licenses/LICENSE-2.0
69+
70+
//! TLD lookup using a compile-time perfect hash map.
71+
//!
72+
//! This module provides O(1) TLD validation, replacing the O(n)
73+
//! Pest grammar alternation.
74+
//!
75+
//! This file is generated by tld_gen_phf.py from tld_lib.yml.
76+
//! Do not edit by hand.
77+
78+
use phf::phf_set;
79+
80+
/// A compile-time perfect hash set of all valid TLDs.
81+
/// Case-insensitive matching is handled by lowercasing input before lookup.
82+
pub static TLDS: phf::Set<&'static str> = phf_set! {
83+
"""
84+
85+
# Rust source written after the TLD entries: closes the `TLDS` set literal and
# defines the lookup helpers plus their unit tests. This is a plain (non-f)
# string, so the Rust braces need no escaping.
FILE_FOOTER = """\
};

/// Check if a string is a valid TLD.
/// The input should be lowercase.
#[inline]
pub fn is_valid_tld(tld: &str) -> bool {
    TLDS.contains(tld)
}

/// Maximum TLD length in bytes (longest is "சிங்கப்பூர்" at 33 bytes)
const MAX_TLD_LEN: usize = 64;

/// Check if a string is a valid TLD, case-insensitively.
/// Uses a stack-allocated buffer for lowercase conversion to avoid heap allocations.
#[inline]
pub fn is_valid_tld_case_insensitive(tld: &str) -> bool {
    // Fast path: pure ASCII without uppercase letters is already lowercase.
    // Non-ASCII input always takes the lowercasing path below, because an
    // ASCII-only uppercase test cannot see uppercase letters in other scripts.
    if tld.bytes().all(|b| b.is_ascii() && !b.is_ascii_uppercase()) {
        return TLDS.contains(tld);
    }

    // Need to lowercase - use stack buffer if small enough
    if tld.len() <= MAX_TLD_LEN {
        let mut buf = [0u8; MAX_TLD_LEN];
        let mut i = 0;
        for c in tld.chars() {
            for lc in c.to_lowercase() {
                let len = lc.len_utf8();
                if i + len > MAX_TLD_LEN {
                    // Overflow - fall back to heap allocation
                    return TLDS.contains(&tld.to_lowercase());
                }
                lc.encode_utf8(&mut buf[i..]);
                i += len;
            }
        }
        // SAFETY: buf[..i] holds only complete chars written by encode_utf8,
        // so it is valid UTF-8.
        let lowered = unsafe { std::str::from_utf8_unchecked(&buf[..i]) };
        TLDS.contains(lowered)
    } else {
        // TLD too long - fall back to heap allocation (shouldn't happen with valid TLDs)
        TLDS.contains(&tld.to_lowercase())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_common_tlds() {
        assert!(is_valid_tld("com"));
        assert!(is_valid_tld("org"));
        assert!(is_valid_tld("net"));
        assert!(is_valid_tld("uk"));
        assert!(is_valid_tld("io"));
    }

    #[test]
    fn test_unicode_tlds() {
        assert!(is_valid_tld("中国"));
        assert!(is_valid_tld("рф"));
        assert!(is_valid_tld("한국"));
    }

    #[test]
    fn test_punycode_tlds() {
        assert!(is_valid_tld("xn--p1ai")); // .рф (Russia)
        assert!(is_valid_tld("xn--wgbh1c")); // .مصر (Egypt)
        assert!(is_valid_tld("xn--mgbaam7a8h")); // .امارات (UAE)
    }

    #[test]
    fn test_invalid_tlds() {
        assert!(!is_valid_tld("invalid"));
        assert!(!is_valid_tld("notarealtld"));
        assert!(!is_valid_tld(""));
    }

    #[test]
    fn test_case_insensitive_tlds() {
        assert!(is_valid_tld_case_insensitive("com"));
        assert!(is_valid_tld_case_insensitive("COM"));
        assert!(is_valid_tld_case_insensitive("XN--P1AI"));
        assert!(is_valid_tld_case_insensitive("рф"));
        // Uppercase letters outside ASCII must be lowercased as well.
        assert!(is_valid_tld_case_insensitive("РФ"));
        assert!(!is_valid_tld_case_insensitive("INVALID"));
    }
}
"""
166+
167+
168+
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()

rust/twitter-text/src/tlds.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
//!
77
//! This module provides O(1) TLD validation, replacing the O(n)
88
//! Pest grammar alternation.
9+
//!
10+
//! This file is generated by tld_gen_phf.py from tld_lib.yml.
11+
//! Do not edit by hand.
912
1013
use phf::phf_set;
1114

@@ -1340,8 +1343,6 @@ pub static TLDS: phf::Set<&'static str> = phf_set! {
13401343
"vegas",
13411344
"ventures",
13421345
"verisign",
1343-
"vermögensberater",
1344-
"vermögensberatung",
13451346
"versicherung",
13461347
"vet",
13471348
"vg",
@@ -1437,6 +1438,8 @@ pub static TLDS: phf::Set<&'static str> = phf_set! {
14371438
"zone",
14381439
"zuerich",
14391440
"zw",
1441+
"vermögensberater",
1442+
"vermögensberatung",
14401443
"ελ",
14411444
"ευ",
14421445
"бг",
@@ -1603,6 +1606,7 @@ pub static TLDS: phf::Set<&'static str> = phf_set! {
16031606
"xn--3ds443g",
16041607
"xn--3e0b707e",
16051608
"xn--3hcrj9c",
1609+
"xn--3oq18vl8pn36a",
16061610
"xn--3pxu8k",
16071611
"xn--42c2d9a",
16081612
"xn--45br5cyl",
@@ -1645,6 +1649,7 @@ pub static TLDS: phf::Set<&'static str> = phf_set! {
16451649
"xn--e1a4c",
16461650
"xn--eckvdtc9d",
16471651
"xn--efvy88h",
1652+
"xn--estv75g",
16481653
"xn--fct429k",
16491654
"xn--fhbei",
16501655
"xn--fiq228c5hs",
@@ -1671,22 +1676,26 @@ pub static TLDS: phf::Set<&'static str> = phf_set! {
16711676
"xn--j1amh",
16721677
"xn--j6w193g",
16731678
"xn--jlq480n2rg",
1679+
"xn--jlq61u9w7b",
16741680
"xn--jvr189m",
16751681
"xn--kcrx77d1x4a",
16761682
"xn--kprw13d",
16771683
"xn--kpry57d",
1684+
"xn--kpu716f",
16781685
"xn--kput3i",
16791686
"xn--l1acc",
16801687
"xn--lgbbat1ad8j",
16811688
"xn--mgb9awbf",
16821689
"xn--mgba3a3ejt",
16831690
"xn--mgba3a4f16a",
16841691
"xn--mgba7c0bbn0a",
1692+
"xn--mgbaakc7dvf",
16851693
"xn--mgbaam7a8h",
16861694
"xn--mgbab2bd",
16871695
"xn--mgbah1a3hjkrd",
16881696
"xn--mgbai9azgqp6j",
16891697
"xn--mgbayh7gpa",
1698+
"xn--mgbb9fbpob",
16901699
"xn--mgbbh1a",
16911700
"xn--mgbbh1a71e",
16921701
"xn--mgbc0a9azcg",
@@ -1714,6 +1723,7 @@ pub static TLDS: phf::Set<&'static str> = phf_set! {
17141723
"xn--otu796d",
17151724
"xn--p1acf",
17161725
"xn--p1ai",
1726+
"xn--pbt977c",
17171727
"xn--pgbs0dh",
17181728
"xn--pssy2u",
17191729
"xn--q7ce6a",

update_tlds.sh

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/bin/bash
# Copyright 2025 Robert Sayre
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Regenerate TLD files from tld_lib.yml.
# Requires: Python 3, PyYAML
#
# By default, only regenerates tlds.rs (the PHF hash set used at runtime).
# The Pest grammar uses character-class-based TLD matching and does not
# need regeneration when TLDs change.
#
# To also regenerate the Pest trie grammar (legacy, requires patricia-trie):
#   ./update_tlds.sh --pest
#
# Usage: ./update_tlds.sh [--pest]

set -euo pipefail

# Print a short usage summary to stdout.
usage() {
    echo "Usage: $0 [--pest]"
    echo "  --pest   also regenerate the Pest trie grammar (legacy)"
}

# All paths are resolved relative to the directory that contains this script.
REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"

TLD_YAML="$REPO_ROOT/rust/conformance/tests/tld_lib.yml"
TLD_GEN_PHF="$REPO_ROOT/rust/conformance/tests/tld_gen_phf.py"
TLDS_RS="$REPO_ROOT/rust/twitter-text/src/tlds.rs"

UPDATE_PEST=false
for arg in "$@"; do
    case "$arg" in
        --pest) UPDATE_PEST=true ;;
        -h|--help) usage; exit 0 ;;
        *) echo "Unknown option: $arg" >&2; exit 1 ;;
    esac
done

if [ "$UPDATE_PEST" = true ]; then
    TLD_GEN="$REPO_ROOT/rust/conformance/tests/tld_gen.py"
    INSERT_SCRIPT="$REPO_ROOT/rust/conformance/tests/insert_tld_grammar.sh"
    PEST_FILE="$REPO_ROOT/rust/parser/src/twitter_text.pest"

    # Use a private name for the scratch directory: TMPDIR is a well-known
    # environment variable, and reassigning it would change where the child
    # processes below put their own temporary files.
    WORK_DIR="$(mktemp -d)"
    trap 'rm -rf "$WORK_DIR"' EXIT

    echo "==> Generating Pest TLD grammar..."
    python3 "$TLD_GEN" \
        --input "$TLD_YAML" \
        --output "$WORK_DIR/generated_tld.pest"

    echo "==> Inserting TLD grammar into twitter_text.pest..."
    bash "$INSERT_SCRIPT" \
        "$PEST_FILE" \
        "$WORK_DIR/generated_tld.pest" \
        "$PEST_FILE"
fi

echo "==> Generating tlds.rs PHF hash set..."
python3 "$TLD_GEN_PHF" \
    --input "$TLD_YAML" \
    --output "$TLDS_RS"

echo ""
echo "Done. Updated:"
if [ "$UPDATE_PEST" = true ]; then
    echo " - $PEST_FILE"
fi
echo " - $TLDS_RS"

0 commit comments

Comments
 (0)