Skip to content

Commit aa9a7f1

Browse files
committed
Improve: Benchmark section names
1 parent 808cdbc commit aa9a7f1

File tree

11 files changed

+344
-250
lines changed

11 files changed

+344
-250
lines changed

bench_find.py

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -173,16 +173,16 @@ def main():
173173
log_system_info()
174174

175175
print("\n=== Substring Search Benchmarks ===")
176-
if should_run("str.find", filter_pattern):
177-
bench_op("str.find", pythonic_str, tokens[::-1], count_find, args.time_limit)
178-
if should_run("stringzilla.Str.find", filter_pattern):
179-
bench_op("stringzilla.Str.find", stringzilla_str, tokens[::-1], count_find, args.time_limit)
180-
if should_run("str.rfind", filter_pattern):
181-
bench_op("str.rfind", pythonic_str, tokens, count_rfind, args.time_limit)
182-
if should_run("stringzilla.Str.rfind", filter_pattern):
183-
bench_op("stringzilla.Str.rfind", stringzilla_str, tokens, count_rfind, args.time_limit)
184-
if should_run("pyahocorasick.iter", filter_pattern):
185-
bench_op("pyahocorasick.iter", pythonic_str, tokens[::-1], count_aho, args.time_limit)
176+
if should_run("substring-forward/std.str.find()", filter_pattern):
177+
bench_op("std.str.find()", pythonic_str, tokens[::-1], count_find, args.time_limit)
178+
if should_run("substring-forward/stringzilla.Str.find()", filter_pattern):
179+
bench_op("stringzilla.Str.find()", stringzilla_str, tokens[::-1], count_find, args.time_limit)
180+
if should_run("substring-backward/std.str.rfind()", filter_pattern):
181+
bench_op("std.str.rfind()", pythonic_str, tokens, count_rfind, args.time_limit)
182+
if should_run("substring-backward/stringzilla.Str.rfind()", filter_pattern):
183+
bench_op("stringzilla.Str.rfind()", stringzilla_str, tokens, count_rfind, args.time_limit)
184+
if should_run("substring-forward/pyahocorasick.iter()", filter_pattern):
185+
bench_op("pyahocorasick.iter()", pythonic_str, tokens[::-1], count_aho, args.time_limit)
186186

187187
print("\n=== Character Set Search ===")
188188
if args.tokens == "lines":
@@ -191,10 +191,10 @@ def main():
191191
else:
192192
re_chars = re.compile(r"[\t\n\r ]") # whitespace: space, tab, LF, CR
193193
sz_chars = " \t\n\r"
194-
if should_run("re.finditer", filter_pattern):
195-
bench_op("re.finditer", pythonic_str, [re_chars], count_regex, args.time_limit)
196-
if should_run("stringzilla.Str.find_first_of", filter_pattern):
197-
bench_op("stringzilla.Str.find_first_of", stringzilla_str, [sz_chars], count_byteset, args.time_limit)
194+
if should_run("byteset-forward/std.re.finditer()", filter_pattern):
195+
bench_op("std.re.finditer()", pythonic_str, [re_chars], count_regex, args.time_limit)
196+
if should_run("byteset-forward/stringzilla.Str.find_first_of()", filter_pattern):
197+
bench_op("stringzilla.Str.find_first_of()", stringzilla_str, [sz_chars], count_byteset, args.time_limit)
198198

199199
return 0
200200

bench_fingerprints.rs

Lines changed: 14 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,6 @@ RUSTFLAGS="-C target-cpu=native" \
4646

4747
use core::convert::TryInto;
4848
use std::collections::hash_map::DefaultHasher;
49-
use std::env;
5049
use std::hash::{Hash, Hasher};
5150

5251
use criterion::{Criterion, Throughput};
@@ -60,8 +59,8 @@ use stringzilla::szs::{AnyBytesTape, DeviceScope, Fingerprints, UnifiedAlloc, Un
6059

6160
mod utils;
6261
use utils::{
63-
install_panic_hook, load_dataset, set_fingerprints_bytes_per_hash, should_run, HashesWallTime,
64-
ResultExt,
62+
get_env_parsed, install_panic_hook, load_dataset, set_fingerprints_bytes_per_hash, should_run,
63+
HashesWallTime, ResultExt,
6564
};
6665

6766
// Fixed n-gram widths for multi-scale fingerprinting
@@ -181,20 +180,8 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
181180
.as_chars()
182181
.expect("Dataset must be valid UTF-8 for fingerprinting");
183182

184-
let batch_size = env::var("STRINGWARS_BATCH")
185-
.unwrap_or_else(|_| "1024".to_string())
186-
.parse::<usize>()
187-
.unwrap_or_else(|e| {
188-
panic!(
189-
"STRINGWARS_BATCH must be a valid number for fingerprinting benchmarks: {}",
190-
e
191-
)
192-
});
193-
194-
let ndim = env::var("STRINGWARS_NDIM")
195-
.ok()
196-
.and_then(|v| v.parse::<usize>().ok())
197-
.unwrap_or(256);
183+
let batch_size: usize = get_env_parsed("STRINGWARS_BATCH", 1024);
184+
let ndim: usize = get_env_parsed("STRINGWARS_NDIM", 256);
198185

199186
if batch_size == 0 {
200187
panic!("STRINGWARS_BATCH must be greater than zero for fingerprinting benchmarks.");
@@ -314,9 +301,9 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
314301
min_counts.resize(batch_size * ndim, 0);
315302

316303
// StringZilla: 1x CPU
317-
if should_run("fingerprinting/stringzillas::Fingerprints(1xCPU)") {
304+
if should_run("fingerprinting/stringzillas/Fingerprints(1xCPU)") {
318305
g.throughput(Throughput::Elements(per_batch_hash_ops));
319-
g.bench_function("stringzillas::Fingerprints(1xCPU)", |b| {
306+
g.bench_function("stringzillas/Fingerprints(1xCPU)", |b| {
320307
start_idx = 0;
321308
b.iter(|| {
322309
let (batch_bytes_view, _batch_chars_view, actual) = tokens_tape_slice(
@@ -350,12 +337,12 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
350337

351338
// StringZilla: Nx CPU
352339
if should_run(&format!(
353-
"fingerprinting/stringzillas::Fingerprints({}xCPU)",
340+
"fingerprinting/stringzillas/Fingerprints({}xCPU)",
354341
num_cores
355342
)) {
356343
g.throughput(Throughput::Elements(per_batch_hash_ops));
357344
g.bench_function(
358-
&format!("stringzillas::Fingerprints({}xCPU)", num_cores),
345+
&format!("stringzillas/Fingerprints({}xCPU)", num_cores),
359346
|b| {
360347
start_idx = 0;
361348
b.iter(|| {
@@ -407,9 +394,9 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
407394

408395
// StringZilla: 1x GPU (if available)
409396
if let (Ok(gpu), Some(engine)) = (maybe_gpu.as_ref(), maybe_sz_gpu.as_ref()) {
410-
if should_run("fingerprinting/stringzillas::Fingerprints(1xGPU)") {
397+
if should_run("fingerprinting/stringzillas/Fingerprints(1xGPU)") {
411398
g.throughput(Throughput::Elements(per_batch_hash_ops));
412-
g.bench_function("stringzillas::Fingerprints(1xGPU)", |b| {
399+
g.bench_function("stringzillas/Fingerprints(1xGPU)", |b| {
413400
start_idx = 0;
414401
b.iter(|| {
415402
let (batch_bytes_view, _batch_chars_view, actual) = tokens_tape_slice(
@@ -451,9 +438,9 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
451438
let mut out = Vec::with_capacity(batch_size);
452439
let mut combined_signature = Vec::with_capacity(ndim);
453440

454-
if should_run("fingerprinting/pc::MinHash<ByteGrams>") {
441+
if should_run("fingerprinting/pc/MinHash<ByteGrams>()") {
455442
g.throughput(Throughput::Elements(per_batch_hash_ops));
456-
g.bench_function("pc::MinHash<ByteGrams>", |b| {
443+
g.bench_function("pc/MinHash<ByteGrams>()", |b| {
457444
start_idx = 0;
458445
b.iter(|| {
459446
let (batch_bytes_view, _batch_chars_view, actual) = tokens_tape_slice(
@@ -500,9 +487,9 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
500487

501488
// Serial MinHash baseline implementing correct independent hash functions
502489
// This addresses the flaw in probabilistic_collections where hash function index is ignored
503-
if should_run("fingerprinting/serial::MinHash<ByteGrams>") {
490+
if should_run("fingerprinting/serial/MinHash<ByteGrams>()") {
504491
g.throughput(Throughput::Elements(per_batch_hash_ops));
505-
g.bench_function("serial::MinHash<ByteGrams>", |b| {
492+
g.bench_function("serial/MinHash<ByteGrams>()", |b| {
506493
// Pre-construct hash parameters for independent universal hash functions
507494
// Each hash function uses: hash_i(x) = (a_i * hash(x) + b_i) mod mersenne_prime
508495

bench_hash.py

Lines changed: 32 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -114,34 +114,34 @@ def run_stateless_benchmarks(
114114
print("\n=== Stateless Hash Benchmarks ===")
115115

116116
# Python built-in hash
117-
if should_run("hash", filter_pattern):
118-
bench_hash_function("hash", tokens, lambda x: hash(x), time_limit_seconds)
117+
if should_run("stateless/std.hash()", filter_pattern):
118+
bench_hash_function("std.hash()", tokens, lambda x: hash(x), time_limit_seconds)
119119

120120
# xxHash
121-
if should_run("xxhash.xxh3_64", filter_pattern):
122-
bench_hash_function("xxhash.xxh3_64", tokens, lambda x: xxhash.xxh3_64(x).intdigest(), time_limit_seconds)
121+
if should_run("stateless/xxhash.xxh3_64()", filter_pattern):
122+
bench_hash_function("xxhash.xxh3_64()", tokens, lambda x: xxhash.xxh3_64(x).intdigest(), time_limit_seconds)
123123

124124
# StringZilla hashes
125-
if should_run("stringzilla.hash", filter_pattern):
126-
bench_hash_function("stringzilla.hash", tokens, lambda x: sz.hash(x), time_limit_seconds)
125+
if should_run("stateless/stringzilla.hash()", filter_pattern):
126+
bench_hash_function("stringzilla.hash()", tokens, lambda x: sz.hash(x), time_limit_seconds)
127127

128128
# Google CRC32C (Castagnoli) one-shot
129-
if should_run("google_crc32c.value", filter_pattern):
130-
bench_hash_function("google_crc32c.value", tokens, lambda x: google_crc32c.value(x), time_limit_seconds)
129+
if should_run("stateless/google_crc32c.value()", filter_pattern):
130+
bench_hash_function("google_crc32c.value()", tokens, lambda x: google_crc32c.value(x), time_limit_seconds)
131131

132132
# MurmurHash3 — stateless
133-
if should_run("mmh3.hash32", filter_pattern):
134-
bench_hash_function("mmh3.hash32", tokens, lambda x: mmh3.hash(x, signed=False), time_limit_seconds)
135-
if should_run("mmh3.hash64", filter_pattern):
136-
bench_hash_function("mmh3.hash64", tokens, lambda x: mmh3.hash64(x, signed=False)[0], time_limit_seconds)
137-
if should_run("mmh3.hash128", filter_pattern):
138-
bench_hash_function("mmh3.hash128", tokens, lambda x: mmh3.hash128(x, signed=False), time_limit_seconds)
133+
if should_run("stateless/mmh3.hash32()", filter_pattern):
134+
bench_hash_function("mmh3.hash32()", tokens, lambda x: mmh3.hash(x, signed=False), time_limit_seconds)
135+
if should_run("stateless/mmh3.hash64()", filter_pattern):
136+
bench_hash_function("mmh3.hash64()", tokens, lambda x: mmh3.hash64(x, signed=False)[0], time_limit_seconds)
137+
if should_run("stateless/mmh3.hash128()", filter_pattern):
138+
bench_hash_function("mmh3.hash128()", tokens, lambda x: mmh3.hash128(x, signed=False), time_limit_seconds)
139139

140140
# CityHash — stateless
141-
if should_run("cityhash.CityHash64", filter_pattern):
142-
bench_hash_function("cityhash.CityHash64", tokens, lambda x: cityhash.CityHash64(x), time_limit_seconds)
143-
if should_run("cityhash.CityHash128", filter_pattern):
144-
bench_hash_function("cityhash.CityHash128", tokens, lambda x: cityhash.CityHash128(x), time_limit_seconds)
141+
if should_run("stateless/cityhash.CityHash64()", filter_pattern):
142+
bench_hash_function("cityhash.CityHash64()", tokens, lambda x: cityhash.CityHash64(x), time_limit_seconds)
143+
if should_run("stateless/cityhash.CityHash128()", filter_pattern):
144+
bench_hash_function("cityhash.CityHash128()", tokens, lambda x: cityhash.CityHash128(x), time_limit_seconds)
145145

146146

147147
def bench_stateful_hash(
@@ -189,16 +189,16 @@ def run_stateful_benchmarks(
189189
print("\n=== Stateful Hash Benchmarks ===")
190190

191191
# xxHash stateful
192-
if should_run("xxhash.xxh3_64", filter_pattern):
193-
bench_stateful_hash("xxhash.xxh3_64", tokens, lambda: xxhash.xxh3_64(), time_limit_seconds)
192+
if should_run("stateful/xxhash.xxh3_64()", filter_pattern):
193+
bench_stateful_hash("xxhash.xxh3_64()", tokens, lambda: xxhash.xxh3_64(), time_limit_seconds)
194194

195195
# StringZilla stateful hasher
196-
if should_run("stringzilla.Hasher", filter_pattern):
197-
bench_stateful_hash("stringzilla.Hasher", tokens, lambda: sz.Hasher(), time_limit_seconds)
196+
if should_run("stateful/stringzilla.Hasher()", filter_pattern):
197+
bench_stateful_hash("stringzilla.Hasher()", tokens, lambda: sz.Hasher(), time_limit_seconds)
198198

199199
# Google CRC32C (Castagnoli) stateful
200-
if should_run("google_crc32c.Checksum", filter_pattern):
201-
bench_stateful_hash("google_crc32c.Checksum", tokens, lambda: google_crc32c.Checksum(), time_limit_seconds)
200+
if should_run("stateful/google_crc32c.Checksum()", filter_pattern):
201+
bench_stateful_hash("google_crc32c.Checksum()", tokens, lambda: google_crc32c.Checksum(), time_limit_seconds)
202202

203203

204204
def run_checksum_benchmarks(
@@ -210,20 +210,20 @@ def run_checksum_benchmarks(
210210
print("\n=== Checksum Hash Benchmarks ===")
211211

212212
# StringZilla bytesum - reference lower bound
213-
if should_run("stringzilla.bytesum", filter_pattern):
214-
bench_hash_function("stringzilla.bytesum", tokens, lambda x: sz.bytesum(x), time_limit_seconds)
213+
if should_run("checksum/stringzilla.bytesum()", filter_pattern):
214+
bench_hash_function("stringzilla.bytesum()", tokens, lambda x: sz.bytesum(x), time_limit_seconds)
215215

216216
# Blake3 - cryptographic hash
217-
if should_run("blake3.digest", filter_pattern):
218-
bench_hash_function("blake3.digest", tokens, lambda x: blake3.blake3(x).digest(), time_limit_seconds)
217+
if should_run("checksum/blake3.blake3()", filter_pattern):
218+
bench_hash_function("blake3.blake3()", tokens, lambda x: blake3.blake3(x).digest(), time_limit_seconds)
219219

220220
# SHA256 via hashlib (Python standard library)
221-
if should_run("hashlib.sha256", filter_pattern):
222-
bench_hash_function("hashlib.sha256", tokens, lambda x: hashlib.sha256(x).digest(), time_limit_seconds)
221+
if should_run("checksum/hashlib.sha256()", filter_pattern):
222+
bench_hash_function("hashlib.sha256()", tokens, lambda x: hashlib.sha256(x).digest(), time_limit_seconds)
223223

224224
# SHA256 via StringZilla
225-
if should_run("stringzilla.Sha256", filter_pattern):
226-
bench_hash_function("stringzilla.Sha256", tokens, lambda x: sz.Sha256().update(x).digest(), time_limit_seconds)
225+
if should_run("checksum/stringzilla.Sha256()", filter_pattern):
226+
bench_hash_function("stringzilla.Sha256()", tokens, lambda x: sz.Sha256().update(x).digest(), time_limit_seconds)
227227

228228

229229
_main_epilog = """

0 commit comments

Comments
 (0)