1 change: 1 addition & 0 deletions tokenizers/Cargo.toml
@@ -93,6 +93,7 @@ monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.9", features = ["serde"] }
pcre2 = { version = "0.2.11", optional = true }

[features]
default = ["progressbar", "onig", "esaxx_fast"]
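Note that the dependency line is the only change to Cargo.toml in this diff. Because the dependency is marked `optional = true`, Cargo exposes an implicit `pcre2` feature of the same name, which is what the `#[cfg(feature = "pcre2")]` guards in the benchmark below key off. An explicit `[features]` entry would only be needed to decouple the feature name from the dependency, e.g.:

```toml
[features]
# Hypothetical explicit wiring; the implicit feature already suffices here.
pcre2 = ["dep:pcre2"]
```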
73 changes: 73 additions & 0 deletions tokenizers/benches/regex_split.rs
@@ -0,0 +1,73 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use std::fs::File;
use std::io::{BufRead, BufReader};
use tokenizers::{
    pre_tokenizers::PreTokenizerWrapper, PreTokenizedString, PreTokenizer, Tokenizer,
};

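// Label the regex backend selected at compile time; `pcre2` takes
// precedence over `onig`, which takes precedence over `fancy-regex`.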
#[cfg(feature = "pcre2")]
const BACKEND: &str = "pcre2";
#[cfg(all(feature = "onig", not(feature = "pcre2")))]
const BACKEND: &str = "onig";
#[cfg(all(
feature = "fancy-regex",
not(any(feature = "onig", feature = "pcre2"))
))]
const BACKEND: &str = "fancy-regex";

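// Read at most `limit` newline-delimited inputs from the sample corpus.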
fn load_inputs(path: &str, limit: usize) -> Vec<String> {
BufReader::new(File::open(path).expect("read sample file"))
.lines()
.take(limit)
.map(|l| l.expect("line"))
.collect()
}

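// Time one pre-tokenizer over the whole corpus, reporting bytes/second.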
fn bench_split(
group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
name: &str,
pretok: PreTokenizerWrapper,
inputs: &[String],
) {
let total_bytes: u64 = inputs.iter().map(|s| s.len() as u64).sum();
group.throughput(Throughput::Bytes(total_bytes));

group.bench_function(name.to_string(), |b| {
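        // The clone happens outside `b.iter`, so it is not part of the
        // measured work.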
let pretok = pretok.clone();
b.iter(|| {
for text in inputs {
let mut pts = PreTokenizedString::from(text.as_str());
pretok.pre_tokenize(&mut pts).expect("pre-tokenize");
}
});
});
}

fn regex_split(c: &mut Criterion) {
// Keep the run quick but representative.
let inputs = load_inputs("data/big.txt", 2_000);
let mut group = c.benchmark_group(format!("regex-split-{BACKEND}"));

let gpt2 = Tokenizer::from_file("data/tokenizer.json").expect("load gpt2 tokenizer");
let gpt2_pretok = gpt2
.get_pre_tokenizer()
.expect("gpt2 pretokenizer")
.clone();
bench_split(&mut group, "bytelevel-gpt2", gpt2_pretok, &inputs);

let llama = Tokenizer::from_file("data/llama-3-tokenizer.json").expect("load llama3 tokenizer");
let llama_pretok = llama
.get_pre_tokenizer()
.expect("llama3 pretokenizer")
.clone();
bench_split(&mut group, "llama3-split", llama_pretok, &inputs);

group.finish();
}

criterion_group!(regex_split_group, regex_split);
criterion_main!(regex_split_group);
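A note on running this: criterion benchmarks must be registered with `harness = false`, and no `[[bench]]` entry appears in the Cargo.toml diff above (which adds only the `pcre2` dependency line). Assuming the usual registration:

```toml
[[bench]]
name = "regex_split"
harness = false
```

`cargo bench --bench regex_split` then measures the default `onig` backend, while `cargo bench --bench regex_split --features pcre2` switches to `pcre2`, since the cfg chain gives it precedence.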
128,457 changes: 128,457 additions & 0 deletions tokenizers/data/big.txt

Large diffs are not rendered by default.
