Skip to content

Commit 82f528a

Browse files
committed
Improve: Assume PyICU available
1 parent 3c1832b commit 82f528a

File tree

1 file changed

+36
-69
lines changed

1 file changed

+36
-69
lines changed

bench_unicode.py

Lines changed: 36 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# /// script
22
# dependencies = [
3-
# "stringzilla",
3+
# "stringzilla>=4.4.0",
44
# "regex",
55
# "PyICU",
66
# ]
@@ -31,28 +31,20 @@
3131
from importlib.metadata import version as pkg_version
3232
from typing import List, Tuple, Optional
3333

34+
import icu
3435
import regex
3536
import stringzilla as sz
3637

3738
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, should_run
3839

39-
# Try to import PyICU, gracefully degrade if not available
40-
try:
41-
import icu
42-
PYICU_AVAILABLE = True
43-
except ImportError:
44-
PYICU_AVAILABLE = False
4540

4641

4742
def log_system_info():
4843
"""Log Python version and library versions."""
4944
print(f"- Python: {sys.version.split()[0]}, {sys.platform}")
5045
print(f"- StringZilla: {sz.__version__} with {sz.__capabilities_str__}")
5146
print(f"- regex: {pkg_version('regex')}")
52-
if PYICU_AVAILABLE:
53-
print(f"- PyICU: {pkg_version('PyICU')} (ICU {icu.ICU_VERSION})")
54-
else:
55-
print("- PyICU: not available")
47+
print(f"- PyICU: {pkg_version('PyICU')} (ICU {icu.ICU_VERSION})")
5648
print()
5749

5850

@@ -120,17 +112,11 @@ def compare_regex_fullcase(s1: str, s2: str) -> bool:
120112
return pattern.fullmatch(s2) is not None
121113

122114

123-
def make_compare_icu() -> callable:
124-
"""Create ICU case-folded comparison function."""
125-
if not PYICU_AVAILABLE:
126-
return None
127-
128-
def compare_icu(s1: str, s2: str) -> bool:
129-
folded1 = icu.UnicodeString(s1).foldCase()
130-
folded2 = icu.UnicodeString(s2).foldCase()
131-
return folded1 == folded2
132-
133-
return compare_icu
115+
def compare_icu(s1: str, s2: str) -> bool:
116+
"""Compare using ICU case folding."""
117+
folded1 = icu.UnicodeString(s1).foldCase()
118+
folded2 = icu.UnicodeString(s2).foldCase()
119+
return folded1 == folded2
134120

135121

136122
def compare_stringzilla(s1: str, s2: str) -> bool:
@@ -207,27 +193,19 @@ def find_regex_fullcase(haystack: str, needle: str) -> int:
207193
return len(pattern.findall(haystack))
208194

209195

210-
def make_find_icu() -> callable:
211-
"""Create ICU case-insensitive search function."""
212-
if not PYICU_AVAILABLE:
213-
return None
214-
215-
def find_icu(haystack: str, needle: str) -> int:
216-
"""Count occurrences using ICU StringSearch."""
217-
if not needle:
218-
return 0
219-
# Use ICU's StringSearch for case-insensitive matching
220-
collator = icu.Collator.createInstance(icu.Locale.getRoot())
221-
collator.setStrength(icu.Collator.SECONDARY) # Case-insensitive
222-
searcher = icu.StringSearch(needle, haystack, collator)
223-
count = 0
196+
def find_icu(haystack: str, needle: str) -> int:
197+
"""Count occurrences using ICU StringSearch."""
198+
if not needle:
199+
return 0
200+
collator = icu.Collator.createInstance(icu.Locale.getRoot())
201+
collator.setStrength(icu.Collator.SECONDARY) # Case-insensitive
202+
searcher = icu.StringSearch(needle, haystack, collator)
203+
count = 0
204+
pos = searcher.nextMatch()
205+
while pos != -1:
206+
count += 1
224207
pos = searcher.nextMatch()
225-
while pos != -1:
226-
count += 1
227-
pos = searcher.nextMatch()
228-
return count
229-
230-
return find_icu
208+
return count
231209

232210

233211
def find_stringzilla(haystack: str, needle: str) -> int:
@@ -237,11 +215,11 @@ def find_stringzilla(haystack: str, needle: str) -> int:
237215
count = 0
238216
start = 0
239217
while True:
240-
pos = sz.utf8_case_insensitive_find(haystack, needle, start)
218+
pos = sz.utf8_case_insensitive_find(haystack[start:], needle)
241219
if pos == -1:
242220
break
243221
count += 1
244-
start = pos + 1
222+
start += pos + 1
245223
return count
246224

247225

@@ -295,15 +273,9 @@ def fold_stringzilla(s: str) -> bytes:
295273
return sz.utf8_case_fold(s)
296274

297275

298-
def make_fold_icu() -> callable:
299-
"""Create ICU case folding function."""
300-
if not PYICU_AVAILABLE:
301-
return None
302-
303-
def fold_icu(s: str) -> str:
304-
return str(icu.UnicodeString(s).foldCase())
305-
306-
return fold_icu
276+
def fold_icu(s: str) -> str:
277+
"""Fold using ICU case folding."""
278+
return str(icu.UnicodeString(s).foldCase())
307279

308280

309281
_main_epilog = """
@@ -363,40 +335,35 @@ def main():
363335
print(f"Pairs: {total_pairs:,}, Search needles: {len(search_needles)}")
364336
log_system_info()
365337

366-
# Prepare ICU functions if available
367-
compare_icu = make_compare_icu()
368-
find_icu = make_find_icu()
369-
fold_icu = make_fold_icu()
370-
371338
# === Case-Insensitive Comparison ===
372339
print("=== Case-Insensitive Comparison ===")
373-
if should_run("sz.utf8_case_insensitive_order", filter_pattern):
340+
if should_run("case-insensitive-compare/sz", filter_pattern):
374341
bench_case_compare("sz.utf8_case_insensitive_order", pairs, compare_stringzilla, args.time_limit)
375-
if should_run("str.casefold().eq()", filter_pattern):
342+
if should_run("case-insensitive-compare/str", filter_pattern):
376343
bench_case_compare("str.casefold().eq()", pairs, compare_casefold, args.time_limit)
377-
if should_run("regex.fullmatch(FULLCASE)", filter_pattern):
344+
if should_run("case-insensitive-compare/regex", filter_pattern):
378345
bench_case_compare("regex.fullmatch(FULLCASE)", pairs, compare_regex_fullcase, args.time_limit)
379-
if PYICU_AVAILABLE and compare_icu and should_run("icu.CaseMap.foldCase().eq()", filter_pattern):
346+
if should_run("case-insensitive-compare/icu", filter_pattern):
380347
bench_case_compare("icu.CaseMap.foldCase().eq()", pairs, compare_icu, args.time_limit)
381348

382349
# === Case-Insensitive Substring Search ===
383350
print("\n=== Case-Insensitive Substring Search ===")
384-
if should_run("sz.utf8_case_insensitive_find", filter_pattern):
351+
if should_run("case-insensitive-find/sz", filter_pattern):
385352
bench_case_find("sz.utf8_case_insensitive_find", pythonic_str, search_needles, find_stringzilla, args.time_limit)
386-
if should_run("str.casefold().find()", filter_pattern):
353+
if should_run("case-insensitive-find/str", filter_pattern):
387354
bench_case_find("str.casefold().find()", pythonic_str, search_needles, find_casefold, args.time_limit)
388-
if should_run("regex.search(FULLCASE)", filter_pattern):
355+
if should_run("case-insensitive-find/regex", filter_pattern):
389356
bench_case_find("regex.search(FULLCASE)", pythonic_str, search_needles, find_regex_fullcase, args.time_limit)
390-
if PYICU_AVAILABLE and find_icu and should_run("icu.StringSearch", filter_pattern):
357+
if should_run("case-insensitive-find/icu", filter_pattern):
391358
bench_case_find("icu.StringSearch", pythonic_str, search_needles, find_icu, args.time_limit)
392359

393360
# === Case Folding Transformation ===
394361
print("\n=== Case Folding Transformation ===")
395-
if should_run("sz.utf8_case_fold()", filter_pattern):
362+
if should_run("case-fold/sz", filter_pattern):
396363
bench_case_fold("sz.utf8_case_fold()", tokens, fold_stringzilla, args.time_limit)
397-
if should_run("str.casefold()", filter_pattern):
364+
if should_run("case-fold/str", filter_pattern):
398365
bench_case_fold("str.casefold()", tokens, fold_casefold, args.time_limit)
399-
if PYICU_AVAILABLE and fold_icu and should_run("icu.CaseMap.foldCase()", filter_pattern):
366+
if should_run("case-fold/icu", filter_pattern):
400367
bench_case_fold("icu.CaseMap.foldCase()", tokens, fold_icu, args.time_limit)
401368

402369
return 0

0 commit comments

Comments
 (0)