11# /// script
22# dependencies = [
3- # "stringzilla",
3+ # "stringzilla>=4.4.0 ",
44# "regex",
55# "PyICU",
66# ]
3131from importlib .metadata import version as pkg_version
3232from typing import List , Tuple , Optional
3333
34+ import icu
3435import regex
3536import stringzilla as sz
3637
3738from utils import load_dataset , tokenize_dataset , add_common_args , now_ns , should_run
3839
39- # Try to import PyICU, gracefully degrade if not available
40- try :
41- import icu
42- PYICU_AVAILABLE = True
43- except ImportError :
44- PYICU_AVAILABLE = False
4540
4641
def log_system_info():
    """Print the interpreter and benchmarked-library versions.

    Emits one line each for Python (version and platform), StringZilla
    (version and capability string), regex, and PyICU (package version plus
    the ICU version it reports), followed by a blank separator line.
    """
    python_version = sys.version.split()[0]
    print(f"- Python: {python_version}, {sys.platform}")

    sz_version = sz.__version__
    sz_capabilities = sz.__capabilities_str__
    print(f"- StringZilla: {sz_version} with {sz_capabilities}")

    regex_version = pkg_version('regex')
    print(f"- regex: {regex_version}")

    # PyICU is a hard dependency, so its version is reported unconditionally.
    pyicu_version = pkg_version('PyICU')
    print(f"- PyICU: {pyicu_version} (ICU {icu.ICU_VERSION})")

    print()
5749
5850
@@ -120,17 +112,11 @@ def compare_regex_fullcase(s1: str, s2: str) -> bool:
120112 return pattern .fullmatch (s2 ) is not None
121113
122114
def compare_icu(s1: str, s2: str) -> bool:
    """Report whether two strings are equal after ICU case folding.

    Each argument is wrapped in an ``icu.UnicodeString``, case-folded with
    ``foldCase()``, and the two folded values are compared for equality.
    """
    return icu.UnicodeString(s1).foldCase() == icu.UnicodeString(s2).foldCase()
134120
135121
136122def compare_stringzilla (s1 : str , s2 : str ) -> bool :
@@ -207,27 +193,19 @@ def find_regex_fullcase(haystack: str, needle: str) -> int:
207193 return len (pattern .findall (haystack ))
208194
209195
def find_icu(haystack: str, needle: str) -> int:
    """Count matches of ``needle`` in ``haystack`` with ICU ``StringSearch``.

    Returns 0 for an empty needle without touching ICU. Otherwise a
    root-locale collator at SECONDARY strength drives the search, and the
    number of matches reported by ``nextMatch()`` is returned.
    """
    if not needle:
        return 0

    root_collator = icu.Collator.createInstance(icu.Locale.getRoot())
    # SECONDARY strength is what makes the search case-insensitive.
    root_collator.setStrength(icu.Collator.SECONDARY)
    matcher = icu.StringSearch(needle, haystack, root_collator)

    # nextMatch() yields successive match positions and -1 once exhausted;
    # only the number of matches matters here, not where they are.
    hits = 0
    while matcher.nextMatch() != -1:
        hits += 1
    return hits
231209
232210
233211def find_stringzilla (haystack : str , needle : str ) -> int :
@@ -237,11 +215,11 @@ def find_stringzilla(haystack: str, needle: str) -> int:
237215 count = 0
238216 start = 0
239217 while True :
240- pos = sz .utf8_case_insensitive_find (haystack , needle , start )
218+ pos = sz .utf8_case_insensitive_find (haystack [ start :] , needle )
241219 if pos == - 1 :
242220 break
243221 count += 1
244- start = pos + 1
222+ start + = pos + 1
245223 return count
246224
247225
@@ -295,15 +273,9 @@ def fold_stringzilla(s: str) -> bytes:
295273 return sz .utf8_case_fold (s )
296274
297275
def fold_icu(s: str) -> str:
    """Return ``s`` case-folded by ICU as a Python ``str``.

    The input is wrapped in an ``icu.UnicodeString``, folded with
    ``foldCase()``, and converted back with ``str()``.
    """
    folded = icu.UnicodeString(s).foldCase()
    return str(folded)
307279
308280
309281_main_epilog = """
@@ -363,40 +335,35 @@ def main():
363335 print (f"Pairs: { total_pairs :,} , Search needles: { len (search_needles )} " )
364336 log_system_info ()
365337
366- # Prepare ICU functions if available
367- compare_icu = make_compare_icu ()
368- find_icu = make_find_icu ()
369- fold_icu = make_fold_icu ()
370-
371338 # === Case-Insensitive Comparison ===
372339 print ("=== Case-Insensitive Comparison ===" )
373- if should_run ("sz.utf8_case_insensitive_order " , filter_pattern ):
340+ if should_run ("case-insensitive-compare/sz " , filter_pattern ):
374341 bench_case_compare ("sz.utf8_case_insensitive_order" , pairs , compare_stringzilla , args .time_limit )
375- if should_run ("str.casefold().eq() " , filter_pattern ):
342+ if should_run ("case-insensitive-compare/ str" , filter_pattern ):
376343 bench_case_compare ("str.casefold().eq()" , pairs , compare_casefold , args .time_limit )
377- if should_run ("regex.fullmatch(FULLCASE) " , filter_pattern ):
344+ if should_run ("case-insensitive-compare/ regex" , filter_pattern ):
378345 bench_case_compare ("regex.fullmatch(FULLCASE)" , pairs , compare_regex_fullcase , args .time_limit )
379- if PYICU_AVAILABLE and compare_icu and should_run ("icu.CaseMap.foldCase().eq() " , filter_pattern ):
346+ if should_run ("case-insensitive-compare/ icu" , filter_pattern ):
380347 bench_case_compare ("icu.CaseMap.foldCase().eq()" , pairs , compare_icu , args .time_limit )
381348
382349 # === Case-Insensitive Substring Search ===
383350 print ("\n === Case-Insensitive Substring Search ===" )
384- if should_run ("sz.utf8_case_insensitive_find " , filter_pattern ):
351+ if should_run ("case-insensitive-find/sz " , filter_pattern ):
385352 bench_case_find ("sz.utf8_case_insensitive_find" , pythonic_str , search_needles , find_stringzilla , args .time_limit )
386- if should_run ("str.casefold(). find() " , filter_pattern ):
353+ if should_run ("case-insensitive- find/str " , filter_pattern ):
387354 bench_case_find ("str.casefold().find()" , pythonic_str , search_needles , find_casefold , args .time_limit )
388- if should_run ("regex.search(FULLCASE) " , filter_pattern ):
355+ if should_run ("case-insensitive-find/ regex" , filter_pattern ):
389356 bench_case_find ("regex.search(FULLCASE)" , pythonic_str , search_needles , find_regex_fullcase , args .time_limit )
390- if PYICU_AVAILABLE and find_icu and should_run ("icu.StringSearch " , filter_pattern ):
357+ if should_run ("case-insensitive-find/ icu" , filter_pattern ):
391358 bench_case_find ("icu.StringSearch" , pythonic_str , search_needles , find_icu , args .time_limit )
392359
393360 # === Case Folding Transformation ===
394361 print ("\n === Case Folding Transformation ===" )
395- if should_run ("sz.utf8_case_fold() " , filter_pattern ):
362+ if should_run ("case-fold/sz " , filter_pattern ):
396363 bench_case_fold ("sz.utf8_case_fold()" , tokens , fold_stringzilla , args .time_limit )
397- if should_run ("str.casefold() " , filter_pattern ):
364+ if should_run ("case-fold/ str" , filter_pattern ):
398365 bench_case_fold ("str.casefold()" , tokens , fold_casefold , args .time_limit )
399- if PYICU_AVAILABLE and fold_icu and should_run ("icu.CaseMap.foldCase() " , filter_pattern ):
366+ if should_run ("case-fold/ icu" , filter_pattern ):
400367 bench_case_fold ("icu.CaseMap.foldCase()" , tokens , fold_icu , args .time_limit )
401368
402369 return 0
0 commit comments