Skip to content

Commit 825603b

Browse files
committed
Improve: Filtering by name
1 parent 5ed019a commit 825603b

12 files changed

+162
-141
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,8 @@
1+
# Builds
12
/target
3+
/__pycache__
4+
5+
# Datasets
26
/acgt_*.txt
37
/xlsum.csv
48
/leipzig1M.txt

bench_find.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
import stringzilla as sz
3333
import ahocorasick as ahoc
3434

35-
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, name_matches
35+
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, should_run
3636

3737

3838
def log_system_info():
@@ -173,15 +173,15 @@ def main():
173173
log_system_info()
174174

175175
print("\n=== Substring Search Benchmarks ===")
176-
if name_matches("str.find", filter_pattern):
176+
if should_run("str.find", filter_pattern):
177177
bench_op("str.find", pythonic_str, tokens[::-1], count_find, args.time_limit)
178-
if name_matches("stringzilla.Str.find", filter_pattern):
178+
if should_run("stringzilla.Str.find", filter_pattern):
179179
bench_op("stringzilla.Str.find", stringzilla_str, tokens[::-1], count_find, args.time_limit)
180-
if name_matches("str.rfind", filter_pattern):
180+
if should_run("str.rfind", filter_pattern):
181181
bench_op("str.rfind", pythonic_str, tokens, count_rfind, args.time_limit)
182-
if name_matches("stringzilla.Str.rfind", filter_pattern):
182+
if should_run("stringzilla.Str.rfind", filter_pattern):
183183
bench_op("stringzilla.Str.rfind", stringzilla_str, tokens, count_rfind, args.time_limit)
184-
if name_matches("pyahocorasick.iter", filter_pattern):
184+
if should_run("pyahocorasick.iter", filter_pattern):
185185
bench_op("pyahocorasick.iter", pythonic_str, tokens[::-1], count_aho, args.time_limit)
186186

187187
print("\n=== Character Set Search ===")
@@ -191,9 +191,9 @@ def main():
191191
else:
192192
re_chars = re.compile(r"[\t\n\r ]") # whitespace: space, tab, LF, CR
193193
sz_chars = " \t\n\r"
194-
if name_matches("re.finditer", filter_pattern):
194+
if should_run("re.finditer", filter_pattern):
195195
bench_op("re.finditer", pythonic_str, [re_chars], count_regex, args.time_limit)
196-
if name_matches("stringzilla.Str.find_first_of", filter_pattern):
196+
if should_run("stringzilla.Str.find_first_of", filter_pattern):
197197
bench_op("stringzilla.Str.find_first_of", stringzilla_str, [sz_chars], count_byteset, args.time_limit)
198198

199199
return 0

bench_fingerprints.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
import stringzillas as szs
3838
import stringzilla as sz
3939

40-
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, name_matches
40+
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, should_run
4141

4242
# For RAPIDS cuDF GPU-accelerated MinHash
4343
try:

bench_fingerprints.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ use stringzilla::szs::{capabilities as szs_capabilities, version as szs_version}
4949
use stringzilla::szs::{AnyBytesTape, DeviceScope, Fingerprints, UnifiedAlloc, UnifiedVec};
5050

5151
mod utils;
52-
use utils::{set_fingerprints_bytes_per_hash, HashesWallTime, should_run_benchmark};
52+
use utils::{set_fingerprints_bytes_per_hash, should_run, HashesWallTime};
5353

5454
// Fixed n-gram widths for multi-scale fingerprinting
5555
const NGRAM_WIDTHS: [usize; 4] = [5, 9, 17, 33];

bench_hash.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
import mmh3
4949
import cityhash
5050

51-
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, name_matches
51+
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, should_run
5252

5353

5454
def log_system_info():
@@ -112,40 +112,40 @@ def run_stateless_benchmarks(
112112
print("\n=== Stateless Hash Benchmarks ===")
113113

114114
# Python built-in hash
115-
if name_matches("hash", filter_pattern):
115+
if should_run("hash", filter_pattern):
116116
bench_hash_function("hash", tokens, lambda x: hash(x), time_limit_seconds)
117117

118118
# xxHash
119-
if name_matches("xxhash.xxh3_64", filter_pattern):
119+
if should_run("xxhash.xxh3_64", filter_pattern):
120120
bench_hash_function("xxhash.xxh3_64", tokens, lambda x: xxhash.xxh3_64(x).intdigest(), time_limit_seconds)
121121

122122
# StringZilla hashes
123-
if name_matches("stringzilla.hash", filter_pattern):
123+
if should_run("stringzilla.hash", filter_pattern):
124124
bench_hash_function("stringzilla.hash", tokens, lambda x: sz.hash(x), time_limit_seconds)
125125

126126
# Google CRC32C (Castagnoli) one-shot
127-
if name_matches("google_crc32c.value", filter_pattern):
127+
if should_run("google_crc32c.value", filter_pattern):
128128
bench_hash_function("google_crc32c.value", tokens, lambda x: google_crc32c.value(x), time_limit_seconds)
129129

130130
# MurmurHash3 — stateless
131-
if name_matches("mmh3.hash32", filter_pattern):
131+
if should_run("mmh3.hash32", filter_pattern):
132132
bench_hash_function("mmh3.hash32", tokens, lambda x: mmh3.hash(x, signed=False), time_limit_seconds)
133-
if name_matches("mmh3.hash64", filter_pattern):
133+
if should_run("mmh3.hash64", filter_pattern):
134134
bench_hash_function("mmh3.hash64", tokens, lambda x: mmh3.hash64(x, signed=False)[0], time_limit_seconds)
135-
if name_matches("mmh3.hash128", filter_pattern):
135+
if should_run("mmh3.hash128", filter_pattern):
136136
bench_hash_function("mmh3.hash128", tokens, lambda x: mmh3.hash128(x, signed=False), time_limit_seconds)
137137

138138
# CityHash — stateless
139-
if name_matches("cityhash.CityHash64", filter_pattern):
139+
if should_run("cityhash.CityHash64", filter_pattern):
140140
bench_hash_function("cityhash.CityHash64", tokens, lambda x: cityhash.CityHash64(x), time_limit_seconds)
141-
if name_matches("cityhash.CityHash128", filter_pattern):
141+
if should_run("cityhash.CityHash128", filter_pattern):
142142
bench_hash_function("cityhash.CityHash128", tokens, lambda x: cityhash.CityHash128(x), time_limit_seconds)
143143

144144
# Reference bounds
145-
if name_matches("blake3.digest", filter_pattern):
145+
if should_run("blake3.digest", filter_pattern):
146146
bench_hash_function("blake3.digest", tokens, lambda x: blake3.blake3(x).digest(), time_limit_seconds)
147147

148-
if name_matches("stringzilla.bytesum", filter_pattern):
148+
if should_run("stringzilla.bytesum", filter_pattern):
149149
bench_hash_function("stringzilla.bytesum", tokens, lambda x: sz.bytesum(x), time_limit_seconds)
150150

151151

@@ -194,15 +194,15 @@ def run_stateful_benchmarks(
194194
print("\n=== Stateful Hash Benchmarks ===")
195195

196196
# xxHash stateful
197-
if name_matches("xxhash.xxh3_64", filter_pattern):
197+
if should_run("xxhash.xxh3_64", filter_pattern):
198198
bench_stateful_hash("xxhash.xxh3_64", tokens, lambda: xxhash.xxh3_64(), time_limit_seconds)
199199

200200
# StringZilla stateful hasher
201-
if name_matches("stringzilla.Hasher", filter_pattern):
201+
if should_run("stringzilla.Hasher", filter_pattern):
202202
bench_stateful_hash("stringzilla.Hasher", tokens, lambda: sz.Hasher(), time_limit_seconds)
203203

204204
# Google CRC32C (Castagnoli) stateful
205-
if name_matches("google_crc32c.Checksum", filter_pattern):
205+
if should_run("google_crc32c.Checksum", filter_pattern):
206206
bench_stateful_hash("google_crc32c.Checksum", tokens, lambda: google_crc32c.Checksum(), time_limit_seconds)
207207

208208

bench_memory.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from Crypto.Cipher import AES as PyCryptoDomeAES
3232
import cv2
3333

34-
from utils import add_common_args, load_dataset, name_matches, now_ns, tokenize_dataset
34+
from utils import add_common_args, load_dataset, should_run, now_ns, tokenize_dataset
3535

3636

3737
def log_system_info():
@@ -258,55 +258,55 @@ def main() -> int:
258258
tokens_mv = [memoryview(bytearray(token)) for token in tokens_b]
259259

260260
# Python bytes.translate (always allocating)
261-
if name_matches("bytes.translate(new)", pattern):
261+
if should_run("bytes.translate(new)", pattern):
262262
bench_translate("bytes.translate(new)", tokens_b, reverse, bytes_translate, args.time_limit)
263263

264264
# OpenCV allocating
265-
if name_matches("opencv.LUT(new)", pattern):
265+
if should_run("opencv.LUT(new)", pattern):
266266
bench_translate("opencv.LUT(new)", tokens_np, reverse_np, opencv_lut_allocating, args.time_limit)
267267

268268
# OpenCV in-place
269-
if name_matches("opencv.LUT(inplace)", pattern):
269+
if should_run("opencv.LUT(inplace)", pattern):
270270
bench_translate("opencv.LUT(inplace)", tokens_np, reverse_np, opencv_lut_inplace, args.time_limit)
271271

272272
# NumPy indexing allocating
273-
if name_matches("numpy.indexing(new)", pattern):
273+
if should_run("numpy.indexing(new)", pattern):
274274
bench_translate("numpy.indexing(new)", tokens_np, reverse_np, numpy_lut_indexing_allocating, args.time_limit)
275275

276276
# NumPy indexing in-place
277-
if name_matches("numpy.indexing(inplace)", pattern):
277+
if should_run("numpy.indexing(inplace)", pattern):
278278
bench_translate("numpy.indexing(inplace)", tokens_np, reverse_np, numpy_lut_indexing_inplace, args.time_limit)
279279

280280
# NumPy take allocating
281-
if name_matches("numpy.take(new)", pattern):
281+
if should_run("numpy.take(new)", pattern):
282282
bench_translate("numpy.take(new)", tokens_np, reverse_np, numpy_lut_take_allocating, args.time_limit)
283283

284284
# NumPy take in-place
285-
if name_matches("numpy.take(inplace)", pattern):
285+
if should_run("numpy.take(inplace)", pattern):
286286
bench_translate("numpy.take(inplace)", tokens_np, reverse_np, numpy_lut_take_inplace, args.time_limit)
287287

288288
# StringZilla allocating
289-
if name_matches("stringzilla.translate(new)", pattern):
289+
if should_run("stringzilla.translate(new)", pattern):
290290
bench_translate("stringzilla.translate(new)", tokens_b, reverse, sz_translate_allocating, args.time_limit)
291291

292292
# StringZilla in-place (need memoryviews for each token)
293-
if name_matches("stringzilla.translate(inplace)", pattern):
293+
if should_run("stringzilla.translate(inplace)", pattern):
294294
bench_translate("stringzilla.translate(inplace)", tokens_mv, reverse, sz_translate_inplace, args.time_limit)
295295

296296
# ---------------- Random byte generation ----------------
297297
print()
298298
print("--- Random Byte Generation ---")
299299
sizes = sizes_from_tokens(tokens_b)
300300

301-
if name_matches("pycryptodome.AES-CTR", pattern):
301+
if should_run("pycryptodome.AES-CTR", pattern):
302302
bench_generator("pycryptodome.AES-CTR", sizes, make_pycryptodome_aes_ctr(), args.time_limit)
303-
if name_matches("stringzilla.fill_random", pattern):
303+
if should_run("stringzilla.fill_random", pattern):
304304
bench_generator("stringzilla.fill_random", sizes, make_stringzilla_fill_random(), args.time_limit)
305-
if name_matches("stringzilla.random", pattern):
305+
if should_run("stringzilla.random", pattern):
306306
bench_generator("stringzilla.random", sizes, sz.random, args.time_limit)
307-
if name_matches("numpy.PCG64", pattern):
307+
if should_run("numpy.PCG64", pattern):
308308
bench_generator("numpy.PCG64", sizes, make_numpy_pcg64(), args.time_limit)
309-
if name_matches("numpy.Philox", pattern):
309+
if should_run("numpy.Philox", pattern):
310310
bench_generator("numpy.Philox", sizes, make_numpy_philox(), args.time_limit)
311311

312312
return 0

bench_sequence.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
# cuDF sorts run on GPU; nothing to set for CPU threads here
5050
pass
5151

52-
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, name_matches
52+
from utils import load_dataset, tokenize_dataset, add_common_args, now_ns, should_run
5353

5454

5555
def log_system_info():
@@ -134,22 +134,22 @@ def main():
134134
print("\n=== Sort Benchmarks ===")
135135

136136
# Python list.sort
137-
if name_matches("list.sort", filter_pattern):
137+
if should_run("list.sort", filter_pattern):
138138
py_list = list(tokens)
139139
bench_sort_operation("list.sort", lambda: py_list.sort(), len(tokens))
140140

141141
# StringZilla
142-
if name_matches("stringzilla.Strs.sorted", filter_pattern):
142+
if should_run("stringzilla.Strs.sorted", filter_pattern):
143143
sz_strs = sz.Strs(tokens)
144144
bench_sort_operation("stringzilla.Strs.sorted", lambda: sz_strs.sorted(), len(tokens))
145145

146146
# Pandas
147-
if name_matches("pandas.Series.sort_values", filter_pattern):
147+
if should_run("pandas.Series.sort_values", filter_pattern):
148148
s = pd.Series(tokens)
149149
bench_sort_operation("pandas.Series.sort_values", lambda: s.sort_values(ignore_index=True), len(tokens))
150150

151151
# PyArrow
152-
if name_matches("pyarrow.compute.sort_indices", filter_pattern):
152+
if should_run("pyarrow.compute.sort_indices", filter_pattern):
153153
# Choose Arrow string type without timing the conversion
154154
INT32_MAX = 2_147_483_647
155155
total_bytes = 0
@@ -167,12 +167,12 @@ def _pa_sort_call():
167167
bench_sort_operation("pyarrow.compute.sort_indices", _pa_sort_call, len(tokens))
168168

169169
# Polars
170-
if name_matches("polars.Series.sort", filter_pattern):
170+
if should_run("polars.Series.sort", filter_pattern):
171171
ps = pl.Series(tokens)
172172
bench_sort_operation("polars.Series.sort", lambda: ps.sort(), len(tokens))
173173

174174
# cuDF GPU (if available)
175-
if CUDF_AVAILABLE and name_matches("cudf.Series.sort_values", filter_pattern):
175+
if CUDF_AVAILABLE and should_run("cudf.Series.sort_values", filter_pattern):
176176
cs = cudf.Series(tokens)
177177
bench_sort_operation("cudf.Series.sort_values", lambda: cs.sort_values(ignore_index=True), len(tokens))
178178

0 commit comments

Comments
 (0)