Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 23 additions & 8 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
[package]
authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
authors = [
"Anthony MOI <m.anthony.moi@gmail.com>",
"Nicolas Patry <patry.nicolas@protonmail.com>",
]
edition = "2018"
name = "tokenizers"
version = "0.22.3-dev.0"
Expand All @@ -13,7 +16,14 @@ description = """
Provides an implementation of today's most used tokenizers,
with a focus on performances and versatility.
"""
exclude = [ "rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*" ]
exclude = [
"rust-toolchain",
"target/*",
"Cargo.lock",
"benches/*.txt",
"benches/*.json",
"data/*",
]

[package.metadata.docs.rs]
all-features = true
Expand Down Expand Up @@ -48,31 +58,37 @@ name = "added_vocab_deserialize"
required-features = ["http"]
harness = false

[[bench]]
name = "parallel_pretok_benchmark"
harness = false

[dependencies]
rand = "0.9"
onig = { version = "6.5.1", default-features = false, optional = true }
regex = "1.10"
regex-syntax = "0.8"
rayon = "1.10"
rayon-cond = "0.4"
serde = { version = "1.0", features = [ "derive" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
unicode-normalization-alignments = "0.1"
unicode_categories = "0.1"
unicode-segmentation = "1.11"
indicatif = {version = "0.18", optional = true}
indicatif = { version = "0.18", optional = true }
itertools = "0.14"
log = "0.4"
derive_builder = "0.20"
spm_precompiled = "0.1.3"
hf-hub = { version = "0.4.1", features = ["ureq"], default-features = false, optional = true }
hf-hub = { version = "0.4.1", features = [
"ureq",
], default-features = false, optional = true }
aho-corasick = "1.1"
paste = "1.0.14"
macro_rules_attribute = "0.2.0"
thiserror = "2"
fancy-regex = { version = "0.17", optional = true}
fancy-regex = { version = "0.17", optional = true }
getrandom = { version = "0.3" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
esaxx-rs = { version = "0.1.10", default-features = false, features = [] }
monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
Expand All @@ -99,4 +115,3 @@ lto = "fat"
[[example]]
name = "encode_batch"
required-features = ["http"]

102 changes: 102 additions & 0 deletions tokenizers/benches/parallel_pretok_benchmark.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#[macro_use]
extern crate criterion;

use criterion::{BenchmarkId, Criterion, Throughput};
use std::hint::black_box;
use tokenizers::pattern::Pattern;
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::utils::SysRegex;
use tokenizers::{PreTokenizedString, PreTokenizer};

/// GPT-2 byte-level regex pattern — the most common pre-tokenization regex
/// Builds the GPT-2 byte-level pre-tokenization regex.
///
/// This is the split pattern used by GPT-2's byte-level BPE: it peels off
/// common English contractions first, then runs of letters, digits, and
/// other punctuation (each optionally preceded by a single space), and
/// finally whitespace runs — `\s+(?!\S)` needs lookahead support, which is
/// why this goes through `SysRegex` rather than the plain `regex` crate.
fn gpt2_regex() -> SysRegex {
    let pattern =
        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
    SysRegex::new(pattern).unwrap()
}

/// Criterion driver comparing sequential vs. parallel pre-tokenization.
///
/// Registers three benchmark groups:
/// 1. `find_matches` — the raw regex matching step, sequential vs. parallel.
/// 2. `byte-level-pretok` — the full `ByteLevel` pre-tokenizer pipeline.
/// 3. `parallel-pretok-scaling` — raw matching across several input sizes,
///    to show where parallelism starts paying off.
///
/// Parallelism is toggled through the crate-global
/// `tokenizers::parallelism::set_parallelism` switch, so the measured
/// closures must not run concurrently with anything else reading that flag.
fn bench_parallel_pretok(c: &mut Criterion) {
    // Benchmark corpus; panics with a plain unwrap if the repository's
    // data file is missing (acceptable for a bench binary).
    let data = std::fs::read_to_string("data/big.txt").unwrap();

    // --- Raw find_matches: sequential vs parallel ---
    {
        let mut group = c.benchmark_group("find_matches");
        // Report throughput in bytes/sec so results are comparable across
        // corpora of different sizes.
        group.throughput(Throughput::Bytes(data.len() as u64));

        let re = gpt2_regex();

        group.bench_function("sequential", |b| {
            tokenizers::parallelism::set_parallelism(false);
            // Explicit `(&re)` borrow: `find_matches` comes from the
            // `Pattern` trait, presumably implemented for `&SysRegex` —
            // TODO(review): confirm against the trait impls.
            b.iter(|| (&re).find_matches(black_box(&data)).unwrap())
        });

        group.bench_function("parallel", |b| {
            tokenizers::parallelism::set_parallelism(true);
            b.iter(|| (&re).find_matches(black_box(&data)).unwrap())
        });

        // NOTE(review): this forces parallelism ON rather than restoring the
        // untouched default (which is governed by the TOKENIZERS_PARALLELISM
        // env var) — confirm that is the intended post-bench state.
        tokenizers::parallelism::set_parallelism(true);
        group.finish();
    }

    // --- Full pre-tokenizer pipeline: sequential vs parallel ---
    {
        let mut group = c.benchmark_group("byte-level-pretok");
        group.throughput(Throughput::Bytes(data.len() as u64));

        let pretok = ByteLevel::default();

        group.bench_function("sequential", |b| {
            tokenizers::parallelism::set_parallelism(false);
            b.iter(|| {
                // A fresh PreTokenizedString is built every iteration because
                // pre_tokenize mutates it in place; returning it keeps the
                // result alive so the optimizer cannot discard the work.
                let mut pre = PreTokenizedString::from(black_box(data.as_str()));
                pretok.pre_tokenize(&mut pre).unwrap();
                pre
            })
        });

        group.bench_function("parallel", |b| {
            tokenizers::parallelism::set_parallelism(true);
            b.iter(|| {
                let mut pre = PreTokenizedString::from(black_box(data.as_str()));
                pretok.pre_tokenize(&mut pre).unwrap();
                pre
            })
        });

        tokenizers::parallelism::set_parallelism(true);
        group.finish();
    }

    // --- Scaling by input size ---
    {
        let mut group = c.benchmark_group("parallel-pretok-scaling");
        let re = gpt2_regex();

        // `size` counts chars (not bytes); throughput is set from the actual
        // byte length of each truncated input so the rates stay honest.
        for size in [1_000, 10_000, 100_000, 500_000] {
            let input: String = data.chars().take(size).collect();
            group.throughput(Throughput::Bytes(input.len() as u64));

            group.bench_with_input(BenchmarkId::new("sequential", size), &input, |b, input| {
                tokenizers::parallelism::set_parallelism(false);
                b.iter(|| (&re).find_matches(black_box(input.as_str())).unwrap())
            });

            group.bench_with_input(BenchmarkId::new("parallel", size), &input, |b, input| {
                tokenizers::parallelism::set_parallelism(true);
                b.iter(|| (&re).find_matches(black_box(input.as_str())).unwrap())
            });
        }

        tokenizers::parallelism::set_parallelism(true);
        group.finish();
    }
}

// Register the benchmark entry point with criterion. sample_size is lowered
// to 20 because each sample processes the whole corpus, which would make the
// default sample count prohibitively slow.
criterion_group! {
    name = parallel_pretok;
    config = Criterion::default().sample_size(20);
    targets = bench_parallel_pretok
}

criterion_main!(parallel_pretok);
Loading
Loading