Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
6d54a71
feat: reduce crate size with training/spm feature gates, remove deriv…
ArthurZucker Apr 9, 2026
c81f4ff
feat: further reduce crate size - slim regex, optional rayon, drop ah…
ArthurZucker Apr 9, 2026
c63ce5d
feat: feature-gate unicode-normalization, compact_str, unicode-segmen…
ArthurZucker Apr 9, 2026
d8e99a6
docs: add bundle size documentation, feature flags table, and measure…
ArthurZucker Apr 9, 2026
d716d77
fix: update bindings for ahash->foldhash migration and TemplateProces…
ArthurZucker Apr 9, 2026
36b9d10
ci: add bundle size reporting to release workflows and fix formatting
ArthurZucker Apr 10, 2026
e3099f7
fix: clippy lints and macOS abi3 cross-compilation RUSTFLAGS in CI
ArthurZucker Apr 10, 2026
7ba643d
fix: cargo fmt for node bindings
ArthurZucker Apr 10, 2026
2da83c5
fix: sync lib.rs doc comments with README for cargo readme check
ArthurZucker Apr 10, 2026
a2377e3
Merge branch 'main' into reduce-crate-size
ArthurZucker Apr 10, 2026
994022f
Merge branch 'main' into reduce-crate-size
ArthurZucker Apr 10, 2026
30bbf32
feat: make regex optional, strip+LTO python bindings, add release-sma…
ArthurZucker Apr 11, 2026
1280a28
feat: add `inference` feature set for smallest full-inference build
ArthurZucker Apr 11, 2026
8976c12
feat: replace from_value with from_str to reduce serde monomorphization
ArthurZucker Apr 11, 2026
73dd2ba
Revert "feat: replace from_value with from_str to reduce serde monomo…
ArthurZucker Apr 11, 2026
1187563
ci: measure linked cdylib size, not rlib
ArthurZucker Apr 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions .github/workflows/python-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ jobs:
- run: twine check --strict dist/*
working-directory: ./bindings/python

- name: Report wheel sizes
  working-directory: ./bindings/python
  run: |
    echo "## 🐍 Python Wheel Size — ${{ matrix.os }} ${{ matrix.target }}" >> $GITHUB_STEP_SUMMARY
    echo "" >> $GITHUB_STEP_SUMMARY
    echo "Wheel (.whl) = compressed archive downloaded from PyPI." >> $GITHUB_STEP_SUMMARY
    echo "Installed .so/.pyd = actual shared library loaded at runtime." >> $GITHUB_STEP_SUMMARY
    echo "The installed size is what matters for on-device deployment." >> $GITHUB_STEP_SUMMARY
    echo "" >> $GITHUB_STEP_SUMMARY
    echo "| Wheel | Wheel size | Installed .so/.pyd |" >> $GITHUB_STEP_SUMMARY
    echo "|---|---|---|" >> $GITHUB_STEP_SUMMARY
    EXTRACT_DIR=$(mktemp -d)
    for f in dist/*.whl; do
      # Guard the unexpanded glob: with no wheels present, `du` on the
      # literal pattern would fail the step under the runner's `bash -e`.
      [ -e "$f" ] || continue
      WHL_SIZE=$(du -h "$f" | cut -f1)
      NAME=$(basename "$f")
      rm -rf "$EXTRACT_DIR"/*
      # `unzip -d` resolves a relative archive path on its own, so the
      # previous cd-into-tmpdir + realpath/$OLDPWD dance was redundant —
      # its fallback branch was already this exact command.
      unzip -q -o "$f" -d "$EXTRACT_DIR" 2>/dev/null || true
      # First shared library found in the wheel; if several are present,
      # only the first is reported.
      SO_SIZE=$(find "$EXTRACT_DIR" \( -name '*.so' -o -name '*.pyd' -o -name '*.dylib' \) -exec du -h {} \; | head -1 | cut -f1)
      [ -z "$SO_SIZE" ] && SO_SIZE="n/a"
      echo "| \`${NAME}\` | ${WHL_SIZE} | ${SO_SIZE} |" >> $GITHUB_STEP_SUMMARY
    done
    rm -rf "$EXTRACT_DIR"
    echo "" >> $GITHUB_STEP_SUMMARY

- uses: actions/upload-artifact@v4
with:
name: pypi_files-${{ matrix.os }}-${{ matrix.target }}-${{ matrix.manylinux }}
Expand Down Expand Up @@ -180,6 +205,42 @@ jobs:
with:
path: ./bindings/python/dist
merge-multiple: true

- name: Wheel size summary
  working-directory: ./bindings/python
  run: |
    echo "## 📦 All Python Wheel Sizes" >> $GITHUB_STEP_SUMMARY
    echo "" >> $GITHUB_STEP_SUMMARY
    echo "| Wheel | Wheel size | Installed .so/.pyd |" >> $GITHUB_STEP_SUMMARY
    echo "|---|---|---|" >> $GITHUB_STEP_SUMMARY
    TOTAL_WHL=0
    TOTAL_SO=0
    EXTRACT_DIR=$(mktemp -d)
    for f in dist/*.whl; do
      # Guard the unexpanded glob: with no wheels downloaded, `stat` on the
      # literal pattern fails and the empty WHL_BYTES then breaks the
      # arithmetic below, aborting the step.
      [ -e "$f" ] || continue
      # GNU stat (Linux) first, BSD stat (macOS) as the fallback.
      WHL_BYTES=$(stat --format=%s "$f" 2>/dev/null || stat -f%z "$f")
      WHL_SIZE=$(du -h "$f" | cut -f1)
      NAME=$(basename "$f")
      rm -rf "$EXTRACT_DIR"/*
      unzip -q -o "$f" -d "$EXTRACT_DIR" 2>/dev/null || true
      # Only the first shared library per wheel is measured and summed.
      SO_FILE=$(find "$EXTRACT_DIR" \( -name '*.so' -o -name '*.pyd' -o -name '*.dylib' \) | head -1)
      if [ -n "$SO_FILE" ]; then
        SO_BYTES=$(stat --format=%s "$SO_FILE" 2>/dev/null || stat -f%z "$SO_FILE")
        SO_SIZE=$(du -h "$SO_FILE" | cut -f1)
        TOTAL_SO=$((TOTAL_SO + SO_BYTES))
      else
        SO_SIZE="n/a"
      fi
      echo "| \`${NAME}\` | ${WHL_SIZE} | ${SO_SIZE} |" >> $GITHUB_STEP_SUMMARY
      TOTAL_WHL=$((TOTAL_WHL + WHL_BYTES))
    done
    rm -rf "$EXTRACT_DIR"
    echo "" >> $GITHUB_STEP_SUMMARY
    # Totals in MB. NOTE(review): assumes `bc` is on the runner image —
    # true for ubuntu-latest; confirm if this job ever moves elsewhere.
    TOTAL_WHL_MB=$(echo "scale=2; $TOTAL_WHL / 1048576" | bc)
    TOTAL_SO_MB=$(echo "scale=2; $TOTAL_SO / 1048576" | bc)
    # Parsing `ls` is safe here: wheel filenames contain no whitespace.
    WHL_COUNT=$(ls dist/*.whl 2>/dev/null | wc -l | tr -d ' ')
    echo "**Total**: ${WHL_COUNT} wheels | wheel: ${TOTAL_WHL_MB} MB | installed: ${TOTAL_SO_MB} MB" >> $GITHUB_STEP_SUMMARY
    echo "" >> $GITHUB_STEP_SUMMARY

# Temporary deactivation while testing abi3 CI
# - name: Upload to PyPi
# working-directory: ./bindings/python
Expand Down
79 changes: 79 additions & 0 deletions .github/workflows/rust-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,85 @@ jobs:
path: ~/.cargo/registry
key: ubuntu-latest-cargo-registry-${{ hashFiles('**/Cargo.toml') }}

- name: Measure crate size
working-directory: ./tokenizers
run: |
echo "## 📦 Crate Size Report" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY

# Packed crate size (what gets uploaded to crates.io)
# NOTE(review): `cargo package` runs twice — once for --list, once for the
# real packaging pass whose "Packaged N files, X MiB" line we grep. The
# \K lookbehind requires GNU grep (-P); presumably fine on ubuntu runners,
# confirm if this workflow ever runs on macOS/BSD.
cargo package --list --allow-dirty > /tmp/crate_files.txt
CRATE_FILE_COUNT=$(wc -l < /tmp/crate_files.txt | tr -d ' ')
PACKED_SIZE=$(cargo package --allow-dirty 2>&1 | grep -oP 'Packaged \d+ files?, \K[\d.]+ \w+' || echo "unknown")
echo "### Packed crate (crates.io)" >> $GITHUB_STEP_SUMMARY
echo "- **Size**: ${PACKED_SIZE}" >> $GITHUB_STEP_SUMMARY
echo "- **Files**: ${CRATE_FILE_COUNT}" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY

# Linked shared library size for various feature combinations.
# This is the actual on-device size — what ships to users — NOT the
# .rlib, which contains unused code the final linker strips.
# We build a minimal cdylib that uses the Tokenizer API and measure it.
TEST_DIR=$(mktemp -d)
TOK_PATH="$(pwd)"
mkdir -p "$TEST_DIR/src"
# Minimal FFI entry point: loads a tokenizer file and encodes a string,
# pulling the deserialization + encode code paths into the measured binary.
# Quoted 'RS' delimiter: no shell expansion inside the Rust source.
cat > "$TEST_DIR/src/lib.rs" << 'RS'
use tokenizers::Tokenizer;
#[no_mangle]
pub extern "C" fn tokenize(path: *const u8, len: usize, input: *const u8, input_len: usize) -> usize {
let path = unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(path, len)) };
let input = unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(input, input_len)) };
let tok = Tokenizer::from_file(path).unwrap();
tok.encode(input, false).unwrap().get_ids().len()
}
RS

# measure LABEL FEATURES — regenerate the probe crate's Cargo.toml with the
# given dependency flags, build it in release, and append a "| label | size |"
# row to the step summary. A failed build is reported in the table, not fatal.
# Unquoted TOML delimiter is deliberate: $TOK_PATH and ${FEATURES} expand.
measure() {
local LABEL="$1"
local FEATURES="$2"
cat > "$TEST_DIR/Cargo.toml" << TOML
[package]
name = "size-test"
version = "0.1.0"
edition = "2021"
[lib]
crate-type = ["cdylib"]
[dependencies]
tokenizers = { path = "$TOK_PATH", ${FEATURES} }
[profile.release]
lto = "fat"
opt-level = "s"
strip = true
codegen-units = 1
panic = "abort"
TOML
(cd "$TEST_DIR" && cargo build --release >/dev/null 2>&1)
local LIB=$(find "$TEST_DIR/target/release" -maxdepth 1 \( -name '*.so' -o -name '*.dylib' -o -name '*.dll' \) | head -1)
if [ -n "$LIB" ]; then
# GNU stat (Linux) first, BSD stat (macOS) as the fallback.
local BYTES=$(stat --format=%s "$LIB" 2>/dev/null || stat -f%z "$LIB")
local KB=$((BYTES / 1024))
echo "| ${LABEL} | ${KB} KB |" >> $GITHUB_STEP_SUMMARY
else
echo "| ${LABEL} | build failed |" >> $GITHUB_STEP_SUMMARY
fi
}

echo "### Linked shared library size (stripped cdylib, LTO fat, opt-level=s, panic=abort)" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "This is the actual on-device size — what ships to end users." >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "| Feature set | Size |" >> $GITHUB_STEP_SUMMARY
echo "|---|---|" >> $GITHUB_STEP_SUMMARY

# One row per feature combination; empty flags = crate defaults.
measure "default (all features)" ''
measure "inference (onig + unicode-norm + spm)" 'default-features = false, features = ["inference"]'
measure "minimal onig-only" 'default-features = false, features = ["onig"]'
measure "no-default + training" 'default-features = false, features = ["onig", "training"]'
measure "no-default + parallel" 'default-features = false, features = ["onig", "parallel"]'

rm -rf "$TEST_DIR"
echo "" >> $GITHUB_STEP_SUMMARY

- name: Publish package rust
working-directory: ./tokenizers
if: ${{ !contains(github.ref, 'rc') }}
Expand Down
1 change: 0 additions & 1 deletion bindings/node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ napi = "2"
napi-derive = "2"
serde = { version = "1.0.163", features = ["derive"] }
tokenizers = { path = "../../tokenizers/" }
ahash = { version = "0.8.11", features = ["serde"] }

[build-dependencies]
napi-build = "2"
Expand Down
2 changes: 1 addition & 1 deletion bindings/node/src/models.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::arc_rwlock_serde;
use crate::tasks::models::{BPEFromFilesTask, WordLevelFromFilesTask, WordPieceFromFilesTask};
use crate::trainers::Trainer;
use ahash::AHashMap;
use napi::bindgen_prelude::*;
use napi_derive::napi;
use serde::{Deserialize, Serialize};
Expand All @@ -12,6 +11,7 @@ use tokenizers as tk;
use tokenizers::models::bpe::{BpeBuilder, Merges};
use tokenizers::models::wordlevel::WordLevelBuilder;
use tokenizers::models::wordpiece::WordPieceBuilder;
use tokenizers::utils::AHashMap;

#[napi]
#[derive(Clone, Serialize, Deserialize)]
Expand Down
5 changes: 4 additions & 1 deletion bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ once_cell = "1.19.0"
numpy = "0.28"
ndarray = "0.16"
itertools = "0.14"
ahash = { version = "0.8.11", features = ["serde"] }
pyo3-ffi = { version = "0.28" }

[dependencies.tokenizers]
Expand All @@ -34,3 +33,7 @@ pyo3 = { version = "0.28", features = ["auto-initialize", "experimental-inspect"
[features]
default = ["ext-module"]
ext-module = ["pyo3/extension-module"]

[profile.release]
strip = true
lto = "fat"
2 changes: 1 addition & 1 deletion bindings/python/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use std::sync::{Arc, RwLock};

use crate::token::PyToken;
use crate::trainers::PyTrainer;
use ahash::AHashMap;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
Expand All @@ -14,6 +13,7 @@ use tk::models::unigram::Unigram;
use tk::models::wordlevel::WordLevel;
use tk::models::wordpiece::{WordPiece, WordPieceBuilder};
use tk::models::ModelWrapper;
use tk::utils::AHashMap;
use tk::{Model, Token};
use tokenizers as tk;

Expand Down
50 changes: 33 additions & 17 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,40 +67,41 @@ name = "ci_benchmark"
harness = false

[dependencies]
rand = "0.9"
rand = { version = "0.9", optional = true }
onig = { version = "6.5.1", default-features = false, optional = true }
regex = "1.10"
regex-syntax = "0.8"
rayon = "1.10"
rayon-cond = "0.4"
regex = { version = "1.10", default-features = false, features = ["std", "perf", "unicode-perl"], optional = true }
rayon = { version = "1.10", optional = true }
rayon-cond = { version = "0.4", optional = true }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
unicode-normalization-alignments = "0.1"
unicode-normalization-alignments = { version = "0.1", optional = true }
unicode_categories = "0.1"
unicode-segmentation = "1.11"
unicode-segmentation = { version = "1.11", optional = true }
indicatif = { version = "0.18", optional = true }
itertools = "0.14"
log = "0.4"
derive_builder = "0.20"
spm_precompiled = "0.1.3"
spm_precompiled = { version = "0.1.3", optional = true }
hf-hub = { version = "0.4.1", features = ["ureq"], default-features = false, optional = true }
daachorse = "1.0.1"
paste = "1.0.14"
macro_rules_attribute = "0.2.0"
thiserror = "2"
fancy-regex = { version = "0.17", optional = true }
getrandom = { version = "0.3" }
esaxx-rs = { version = "0.1.10", default-features = false, features = [] }
monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.9", features = ["serde"] }
esaxx-rs = { version = "0.1.10", default-features = false, features = [], optional = true }
foldhash = "0.2"
dary_heap = "0.3.6"
compact_str = { version = "0.9", features = ["serde"], optional = true }

[features]
default = ["progressbar", "onig", "esaxx_fast"]
esaxx_fast = ["esaxx-rs/cpp"]
default = ["progressbar", "onig", "esaxx_fast", "spm", "training", "parallel", "unicode-normalization", "regex"]
unicode-normalization = ["dep:unicode-normalization-alignments"]
parallel = ["dep:rayon", "dep:rayon-cond"]
training = ["dep:rand", "dep:esaxx-rs", "dep:compact_str"]
spm = ["dep:spm_precompiled", "dep:unicode-segmentation"]
esaxx_fast = ["dep:esaxx-rs", "esaxx-rs/cpp"]
progressbar = ["indicatif"]
http = ["hf-hub"]
inference = ["onig", "unicode-normalization", "spm"]
unstable_wasm = ["fancy-regex", "getrandom/wasm_js"]
rustls-tls = ["hf-hub?/rustls-tls"]

Expand All @@ -114,6 +115,21 @@ tracing-subscriber = "0.3.18"
[profile.release]
lto = "fat"

# Use this profile for minimal binary size (e.g. on-device deployment).
# Pair with the `inference` feature for all inference capabilities without training/parallel:
# cargo build --profile release-small --no-default-features --features inference
# For even smaller builds (nightly only):
# RUSTFLAGS="-Zlocation-detail=none -Zfmt-debug=none" cargo +nightly build \
# -Z build-std=std,panic_abort -Z build-std-features="optimize_for_size" \
# --target <your-target-triple> --profile release-small \
# --no-default-features --features inference
[profile.release-small]
inherits = "release"
opt-level = "s"
strip = true
panic = "abort"
codegen-units = 1

[profile.profiling]
inherits = "release"
debug = true
Expand Down
Loading
Loading