diff --git a/docs/components/frontend/configuration.md b/docs/components/frontend/configuration.md index 65615ae2e5a..cdf9641fa8d 100644 --- a/docs/components/frontend/configuration.md +++ b/docs/components/frontend/configuration.md @@ -91,6 +91,12 @@ See the [Frontend Guide](frontend-guide.md) for KServe message formats and integ | `--metrics-prefix` | `DYN_METRICS_PREFIX` | `dynamo_frontend` | Prefix for frontend Prometheus metrics | | `--dump-config-to` | `DYN_DUMP_CONFIG_TO` | — | Dump resolved config to file path | +## Tokenizer + +| CLI Argument | Env Var | Default | Description | +|-------------|---------|---------|-------------| +| `--tokenizer` | `DYN_TOKENIZER` | `default` | Tokenizer backend: `default` (HuggingFace) or `fastokens` (fastokens crate for high-performance BPE encoding). See [Tokenizer Backends](tokenizer-backends.md) | + ## Experimental | CLI Argument | Env Var | Default | Description | diff --git a/docs/components/frontend/tokenizer-backends.md b/docs/components/frontend/tokenizer-backends.md new file mode 100644 index 00000000000..6becbb070f9 --- /dev/null +++ b/docs/components/frontend/tokenizer-backends.md @@ -0,0 +1,55 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: Tokenizer Backends +--- + +The Dynamo Frontend supports multiple tokenizer backends for BPE-based models. The backend controls how input text is tokenized before being sent to the inference engine. + +## Tokenizer Backends + +#### `default` HuggingFace Tokenizers + +The default backend uses the [HuggingFace `tokenizers`](https://github.com/huggingface/tokenizers) library (Rust). +It supports features in `tokenizer.json` files (normalizers, pre-tokenizers, post-processors, decoders, added tokens with special-token flags, and byte-fallback). 
+ +#### `fastokens` High-Performance BPE Encoding + +The `fastokens` backend uses the [`fastokens`](https://github.com/Atero-ai/fastokens) crate, a purpose-built BPE encoder optimized for throughput. +It is a _hybrid_ backend: encoding uses `fastokens` while decoding falls back to HuggingFace so that incremental detokenization, byte-fallback, and special-token handling work correctly. + +Use this backend when tokenization is a measurable bottleneck, for example on high-concurrency prefill-heavy workloads. + +#### Compatibility notes: + +- Works with standard BPE `tokenizer.json` files (Qwen, LLaMA, GPT-family, Mistral, DeepSeek, etc.). +- If `fastokens` cannot load a particular tokenizer file, the frontend logs a warning and transparently falls back to HuggingFace; requests are never dropped. +- Has no effect on TikToken-format tokenizers (`.model` / `.tiktoken` files), which always use the TikToken backend. + +## Configuration + +Set the backend with a CLI flag or environment variable. The CLI flag takes precedence. + +| CLI Argument | Env Var | Valid values | Default | +|---|---|---|---| +| `--tokenizer` | `DYN_TOKENIZER` | `default`, `fastokens` | `default` | + +**Examples:** + +```bash +# CLI flag +python -m dynamo.frontend --tokenizer fastokens + +# Environment variable +export DYN_TOKENIZER=fastokens +python -m dynamo.frontend +``` + +## Dynamo Frontend Behavior + +When `DYN_TOKENIZER=fastokens` is set: + +1. The frontend passes the environment variable to the Rust runtime. +2. When building the tokenizer for a model, `ModelDeploymentCard::tokenizer()` attempts to load `fastokens::Tokenizer` from the same `tokenizer.json` file. +3. If loading succeeds, a hybrid `FastTokenizer` is created that encodes with `fastokens` and decodes with HuggingFace. +4. If loading fails (unsupported tokenizer features, missing file, etc.), the frontend logs a warning and falls back to the standard HuggingFace backend; no operator intervention is needed. 
diff --git a/docs/index.yml b/docs/index.yml index 4e1df0cff0a..817e2d2b952 100644 --- a/docs/index.yml +++ b/docs/index.yml @@ -200,6 +200,8 @@ navigation: contents: - page: Frontend Guide path: components/frontend/frontend-guide.md + - page: Tokenizer Backends + path: components/frontend/tokenizer-backends.md - section: Router path: components/router/README.md contents: diff --git a/lib/llm/Cargo.toml b/lib/llm/Cargo.toml index b14226b160a..17b1c655915 100644 --- a/lib/llm/Cargo.toml +++ b/lib/llm/Cargo.toml @@ -30,7 +30,11 @@ bench = ["dynamo-kv-router/bench"] kv-router-stress = ["dep:clap", "dep:indicatif", "bench"] [[bench]] -name = "tokenizer" +name = "tokenizer_simple" +harness = false + +[[bench]] +name = "tokenizer_dataset" harness = false [[bench]] diff --git a/lib/llm/benches/tokenizer_dataset.rs b/lib/llm/benches/tokenizer_dataset.rs new file mode 100644 index 00000000000..5baa70dc06d --- /dev/null +++ b/lib/llm/benches/tokenizer_dataset.rs @@ -0,0 +1,319 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Dataset-driven benchmarks +//! +//! Downloads a real dataset (LongBench-v2) from HuggingFace Hub and benchmarks +//! per-sample encode throughput with correctness verification. +//! +//! Run: +//! cargo bench --bench tokenizer_dataset +//! +//! Override tokenizer (default: Qwen/Qwen3-0.6B): +//! TOKENIZER_PATH=deepseek-ai/DeepSeek-V3 cargo bench --bench tokenizer_dataset +//! +//! Override dataset and sample count: +//! DATASET=RyokoAI/ShareGPT52K MAX_SAMPLES=50 cargo bench --bench tokenizer_dataset +//! +//! Batch benchmark (default: sequential): +//! BATCH_SIZE=64 cargo bench --bench tokenizer_dataset + +use std::path::Path; +use std::time::{Duration, Instant}; + +use dynamo_llm::tokenizers::{FastTokenizer, HuggingFaceTokenizer, traits::Encoder}; + +/// Default HuggingFace model for the tokenizer. 
+const DEFAULT_HF_MODEL: &str = "Qwen/Qwen3-0.6B"; + +/// Default dataset on HuggingFace Hub. +const DEFAULT_DATASET: &str = "zai-org/LongBench-v2"; + +/// Resolve tokenizer path: local file, HF model name, or default. +fn resolve_tokenizer_path() -> String { + let input = std::env::var("TOKENIZER_PATH").ok(); + + if let Some(ref p) = input + && Path::new(p).is_file() + { + eprintln!("[setup] Using local tokenizer: {p}"); + return p.clone(); + } + + let model_name = input.as_deref().unwrap_or(DEFAULT_HF_MODEL); + eprintln!("[setup] Downloading tokenizer for {model_name}..."); + + let cache = hf_hub::Cache::default(); + let api = hf_hub::api::sync::ApiBuilder::from_cache(cache) + .with_progress(true) + .build() + .expect("Failed to create HuggingFace API client"); + + let repo = api.model(model_name.to_string()); + let path = repo + .get("tokenizer.json") + .expect("Failed to download tokenizer.json"); + + let path_str = path.display().to_string(); + eprintln!("[setup] Tokenizer: {path_str}"); + path_str +} + +/// Return the JSON filename for a known HuggingFace Hub dataset. +fn dataset_json_file(dataset: &str) -> &'static str { + match dataset { + "RyokoAI/ShareGPT52K" => "sg_90k_part1.json", + "zai-org/LongBench-v2" => "data.json", + _ => panic!( + "Unknown dataset: {dataset}. Supported: zai-org/LongBench-v2, RyokoAI/ShareGPT52K" + ), + } +} + +/// Extract a text sample from a single JSON item. 
+fn extract_text(dataset: &str, item: &serde_json::Value) -> Option<String> {
+    match dataset {
+        "RyokoAI/ShareGPT52K" => {
+            let messages = item.get("conversations")?.as_array()?;
+            let parts: Vec<String> = messages
+                .iter()
+                .filter_map(|msg| {
+                    let role = msg.get("from")?.as_str()?;
+                    let value = msg.get("value")?.as_str()?;
+                    if value.is_empty() {
+                        return None;
+                    }
+                    Some(format!("[{role}]: {value}"))
+                })
+                .collect();
+            if parts.is_empty() {
+                return None;
+            }
+            Some(parts.join("\n\n"))
+        }
+        "zai-org/LongBench-v2" => {
+            let context = item.get("context")?.as_str()?;
+            if context.is_empty() {
+                return None;
+            }
+            Some(context.to_string())
+        }
+        _ => None,
+    }
+}
+
+/// Load text samples from a HuggingFace Hub dataset.
+fn load_dataset(dataset: &str, max_items: usize) -> Vec<String> {
+    let json_file = dataset_json_file(dataset);
+
+    eprintln!("[setup] Downloading dataset {dataset}...");
+    let api = hf_hub::api::sync::Api::new().expect("Failed to create HuggingFace API client");
+    let repo = api.dataset(dataset.to_string());
+    let json_path = repo.get(json_file).expect("Failed to download dataset");
+
+    let text = std::fs::read_to_string(&json_path).expect("Failed to read dataset JSON");
+    let data: Vec<serde_json::Value> =
+        serde_json::from_str(&text).expect("Failed to parse dataset JSON");
+
+    let samples: Vec<String> = data
+        .iter()
+        .take(max_items)
+        .filter_map(|item| extract_text(dataset, item))
+        .collect();
+
+    eprintln!("[setup] Loaded {} samples", samples.len());
+    samples
+}
+
+fn print_summary(
+    label: &str,
+    n: usize,
+    total_chars: u64,
+    total_tokens: u64,
+    total_hf: Duration,
+    total_ft: Duration,
+) {
+    let hf_ms = total_hf.as_secs_f64() * 1000.0;
+    let ft_ms = total_ft.as_secs_f64() * 1000.0;
+    let speedup = hf_ms / ft_ms;
+    let nf = n as f64;
+
+    println!();
+    println!("=== {label} ({n} samples) ===");
+    println!(" Total chars: {total_chars}");
+    println!(" Total tokens: {total_tokens}");
+    println!(" ---");
+    println!(" HF total: {hf_ms:>10.2} ms");
+    println!(" fastokens total: 
{ft_ms:>10.2} ms"); + println!(" Speedup: {speedup:>10.2}x"); + println!(" ---"); + println!(" HF avg/sample: {:>10.3} ms", hf_ms / nf); + println!(" ft avg/sample: {:>10.3} ms", ft_ms / nf); + println!( + " HF throughput: {:>10.2} MB/s", + total_chars as f64 / total_hf.as_secs_f64() / 1_000_000.0 + ); + println!( + " ft throughput: {:>10.2} MB/s", + total_chars as f64 / total_ft.as_secs_f64() / 1_000_000.0 + ); +} + +fn bench_sequential(samples: &[String], hf: &HuggingFaceTokenizer, fast: &FastTokenizer) { + let mut total_hf = Duration::ZERO; + let mut total_ft = Duration::ZERO; + let mut total_tokens: u64 = 0; + let mut total_chars: u64 = 0; + let mut mismatches = 0u64; + + for (i, text) in samples.iter().enumerate() { + let t0 = Instant::now(); + let hf_enc = hf.encode(text).expect("HF encode failed"); + let t1 = Instant::now(); + let ft_enc = fast.encode(text).expect("fastokens encode failed"); + let t2 = Instant::now(); + + let dt_hf = t1 - t0; + let dt_ft = t2 - t1; + + if hf_enc.token_ids() != ft_enc.token_ids() { + mismatches += 1; + if mismatches <= 3 { + eprintln!( + "[MISMATCH] sample {i}: hf={} tokens, ft={} tokens", + hf_enc.token_ids().len(), + ft_enc.token_ids().len() + ); + } + } + + total_hf += dt_hf; + total_ft += dt_ft; + total_tokens += ft_enc.token_ids().len() as u64; + total_chars += text.len() as u64; + + if (i + 1) % 20 == 0 { + eprintln!("[progress] {}/{}", i + 1, samples.len()); + } + } + + if mismatches > 0 { + eprintln!("[WARNING] {mismatches} samples had mismatched token IDs"); + } else { + eprintln!("[OK] All samples produced identical token IDs"); + } + + print_summary( + "Sequential Benchmark", + samples.len(), + total_chars, + total_tokens, + total_hf, + total_ft, + ); +} + +fn bench_batched( + samples: &[String], + hf: &HuggingFaceTokenizer, + fast: &FastTokenizer, + batch_size: usize, +) { + let mut total_hf = Duration::ZERO; + let mut total_ft = Duration::ZERO; + let mut total_tokens: u64 = 0; + let mut total_chars: u64 = 0; + 
let mut mismatches = 0u64;
+
+    let num_batches = samples.len().div_ceil(batch_size);
+
+    for (batch_idx, batch) in samples.chunks(batch_size).enumerate() {
+        let batch_refs: Vec<&str> = batch.iter().map(|s| s.as_str()).collect();
+        let batch_chars: u64 = batch.iter().map(|s| s.len() as u64).sum();
+
+        let t0 = Instant::now();
+        let hf_results = hf
+            .encode_batch(&batch_refs)
+            .expect("HF encode_batch failed");
+        let t1 = Instant::now();
+        let ft_results = fast
+            .encode_batch(&batch_refs)
+            .expect("fastokens encode_batch failed");
+        let t2 = Instant::now();
+
+        // Verify correctness per sample within the batch
+        for (j, (hf_enc, ft_enc)) in hf_results.iter().zip(ft_results.iter()).enumerate() {
+            if hf_enc.token_ids() != ft_enc.token_ids() {
+                mismatches += 1;
+                if mismatches <= 3 {
+                    let global_idx = batch_idx * batch_size + j;
+                    eprintln!(
+                        "[MISMATCH] sample {global_idx}: hf={} tokens, ft={} tokens",
+                        hf_enc.token_ids().len(),
+                        ft_enc.token_ids().len()
+                    );
+                }
+            }
+        }
+
+        let batch_tokens: u64 = ft_results
+            .iter()
+            .map(|enc| enc.token_ids().len() as u64)
+            .sum();
+
+        total_hf += t1 - t0;
+        total_ft += t2 - t1;
+        total_tokens += batch_tokens;
+        total_chars += batch_chars;
+
+        if (batch_idx + 1) % 5 == 0 {
+            eprintln!("[progress] batch {}/{num_batches}", batch_idx + 1);
+        }
+    }
+
+    if mismatches > 0 {
+        eprintln!("[WARNING] {mismatches} samples had mismatched token IDs");
+    } else {
+        eprintln!("[OK] All samples produced identical token IDs");
+    }
+
+    print_summary(
+        &format!("Batched Benchmark (batch_size={batch_size})"),
+        samples.len(),
+        total_chars,
+        total_tokens,
+        total_hf,
+        total_ft,
+    );
+}
+
+fn main() {
+    let tokenizer_path = resolve_tokenizer_path();
+    let dataset = std::env::var("DATASET").unwrap_or_else(|_| DEFAULT_DATASET.to_string());
+    let max_samples: usize = std::env::var("MAX_SAMPLES")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(503);
+    let batch_size: Option<usize> = std::env::var("BATCH_SIZE")
+        .ok()
+        .and_then(|v| 
v.parse().ok()); + + let samples = load_dataset(&dataset, max_samples); + + let hf = HuggingFaceTokenizer::from_file(&tokenizer_path) + .expect("Failed to load HuggingFace tokenizer"); + let fast = + FastTokenizer::from_file(&tokenizer_path).expect("Failed to load fastokens tokenizer"); + + // Warmup + if let Some(s) = samples.first() { + let _ = hf.encode(s.as_str()); + let _ = fast.encode(s.as_str()); + } + + if let Some(bs) = batch_size { + bench_batched(&samples, &hf, &fast, bs); + } else { + bench_sequential(&samples, &hf, &fast); + } +} diff --git a/lib/llm/benches/tokenizer.rs b/lib/llm/benches/tokenizer_simple.rs similarity index 60% rename from lib/llm/benches/tokenizer.rs rename to lib/llm/benches/tokenizer_simple.rs index 5a2403e693a..d25c8dc308a 100644 --- a/lib/llm/benches/tokenizer.rs +++ b/lib/llm/benches/tokenizer_simple.rs @@ -9,10 +9,12 @@ use criterion::{Criterion, Throughput, criterion_group, criterion_main}; use dynamo_llm::backend::Decoder; use dynamo_llm::protocols::common::StopConditions; use dynamo_llm::tokenizers::DecodeStream; +use dynamo_llm::tokenizers::FastTokenizer; use dynamo_llm::tokenizers::hf::HuggingFaceTokenizer; use dynamo_llm::tokenizers::tiktoken::TikTokenTokenizer; use dynamo_llm::tokenizers::traits::{Encoder, Tokenizer}; use dynamo_llm::types::TokenIdType; +use std::path::Path; const TEST_TOKENIZER: &str = concat!( env!("CARGO_MANIFEST_DIR"), @@ -137,12 +139,112 @@ pub fn tiktoken_decode(c: &mut Criterion) { group.finish(); } +// --------------------------------------------------------------------------- +// Tokenizer backend benchmarks +// +// By default these use the in-tree TinyLlama tokenizer. 
Override with a +// production-size tokenizer for more realistic numbers: +// TOKENIZER_PATH=/path/to/tokenizer.json cargo bench -- fastokens +// TOKENIZER_PATH=Qwen/Qwen3-0.6B cargo bench -- fastokens +// --------------------------------------------------------------------------- + +/// Default HuggingFace model to download when TOKENIZER_PATH is not set. +const DEFAULT_HF_MODEL: &str = "Qwen/Qwen3-0.6B"; + +/// Resolve a tokenizer.json path from TOKENIZER_PATH env var or download from HF Hub. +fn resolve_tokenizer_path() -> String { + let input = std::env::var("TOKENIZER_PATH").ok(); + + if let Some(ref p) = input + && Path::new(p).is_file() + { + return p.clone(); + } + + let model_name = input.as_deref().unwrap_or(DEFAULT_HF_MODEL); + let cache = hf_hub::Cache::default(); + let api = hf_hub::api::sync::ApiBuilder::from_cache(cache) + .with_progress(true) + .build() + .expect("Failed to create HuggingFace API client"); + + let repo = api.model(model_name.to_string()); + repo.get("tokenizer.json") + .expect("Failed to download tokenizer.json from HuggingFace Hub") + .display() + .to_string() +} + +const FASTOKENS_BATCH_SIZE: usize = 64; + +pub fn fastokens_encode(c: &mut Criterion) { + let tokenizer_path = resolve_tokenizer_path(); + let test_str: &str = &INPUT_STR.repeat(TARGET_ISL / INPUT_STR.len()); + + let hf_encoder = HuggingFaceTokenizer::from_file(&tokenizer_path).unwrap(); + let fast_encoder = FastTokenizer::from_file(&tokenizer_path).unwrap(); + + // Verify parity before benchmarking + let hf_ids = hf_encoder.encode(INPUT_STR).unwrap(); + let fast_ids = fast_encoder.encode(INPUT_STR).unwrap(); + assert_eq!( + hf_ids.token_ids(), + fast_ids.token_ids(), + "fastokens and HuggingFace must produce identical token IDs" + ); + + let mut group = c.benchmark_group("fastokens-encode"); + group.throughput(Throughput::Bytes(test_str.len() as u64)); + + group.bench_function("hf_encode", |b| { + b.iter(|| { + let _ = hf_encoder.encode(black_box(test_str)).unwrap(); + 
}) + }); + + group.bench_function("fastokens_encode", |b| { + b.iter(|| { + let _ = fast_encoder.encode(black_box(test_str)).unwrap(); + }) + }); + + group.finish(); +} + +pub fn fastokens_batch_encode(c: &mut Criterion) { + let tokenizer_path = resolve_tokenizer_path(); + let batch: Vec<&str> = (0..FASTOKENS_BATCH_SIZE).map(|_| INPUT_STR).collect(); + let total_bytes: u64 = batch.iter().map(|s| s.len() as u64).sum(); + + let hf_encoder = HuggingFaceTokenizer::from_file(&tokenizer_path).unwrap(); + let fast_encoder = FastTokenizer::from_file(&tokenizer_path).unwrap(); + + let mut group = c.benchmark_group("fastokens-batch-encode"); + group.throughput(Throughput::Bytes(total_bytes)); + + group.bench_function("hf_batch_encode", |b| { + b.iter(|| { + let _ = hf_encoder.encode_batch(black_box(&batch)).unwrap(); + }) + }); + + group.bench_function("fastokens_batch_encode", |b| { + b.iter(|| { + let _ = fast_encoder.encode_batch(black_box(&batch)).unwrap(); + }) + }); + + group.finish(); +} + criterion_group!( benches, encode, decode, decode_big, tiktoken_encode, - tiktoken_decode + tiktoken_decode, + fastokens_encode, + fastokens_batch_encode ); criterion_main!(benches);