Skip to content
4 changes: 1 addition & 3 deletions bindings/python/src/normalizers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,7 @@ impl PyNormalizer {
/// :obj:`str`: A string after normalization
#[pyo3(text_signature = "(self, sequence)")]
fn normalize_str(&self, sequence: &str) -> PyResult<String> {
let mut normalized = NormalizedString::from(sequence);
ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
Ok(normalized.get().to_owned())
ToPyResult(self.normalizer.normalize_str(sequence)).into()
}

fn __repr__(&self) -> PyResult<String> {
Expand Down
60 changes: 60 additions & 0 deletions tokenizers/src/normalizers/byte_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,42 @@ pub struct ByteLevel;

static BYTES_CHAR: LazyLock<AHashMap<u8, char>> = LazyLock::new(bytes_char);

/// One entry of the pre-encoded UTF-8 lookup table used by
/// [`ByteLevel::normalize_str`] (the table itself is [`BYTES_CHAR_UTF8`],
/// holding one entry per possible input byte).
///
/// The byte-level normalizer maps every input byte (0x00–0xFF) to a specific
/// Unicode character. For example:
///
/// | Input byte | Char    | Code point | UTF-8 bytes | Length |
/// |------------|---------|------------|-------------|--------|
/// | `0x6B` (k) | 'k'     | U+006B     | `[6B]`      | 1      |
/// | `0x20` ( ) | 'Ġ'     | U+0120     | `[C4, A0]`  | 2      |
/// | `0x0D` (CR)| 'č'     | U+010D     | `[C4, 8D]`  | 2      |
///
/// Without this table, each byte requires a `HashMap<u8, char>` lookup
/// followed by `char::encode_utf8` to write it into the output `String`.
///
/// Pre-computing the UTF-8 encoding once at startup means the hot path is
/// just `out.extend_from_slice(&entry.bytes[..entry.len])` — a direct
/// memcpy with no hashing and no per-char encoding.
struct Utf8Entry {
    /// The UTF-8 encoding of the byte-level char (at most 2 bytes for this
    /// particular mapping, since all chars are ≤ U+0122).
    bytes: [u8; 2],
    /// Number of valid bytes in `bytes` (1 or 2).
    len: u8,
}

/// 256-entry table mapping every input byte to the pre-encoded UTF-8 bytes
/// of its byte-level character. Built once, on first access.
static BYTES_CHAR_UTF8: LazyLock<[Utf8Entry; 256]> = LazyLock::new(|| {
    let mapping = bytes_char();
    std::array::from_fn(|byte| {
        let ch = mapping[&(byte as u8)];
        let mut encoded = [0u8; 2];
        let written = ch.encode_utf8(&mut encoded).len() as u8;
        Utf8Entry {
            bytes: encoded,
            len: written,
        }
    })
});

impl Default for ByteLevel {
fn default() -> Self {
Self::new()
Expand Down Expand Up @@ -45,6 +81,19 @@ impl Normalizer for ByteLevel {
}
Ok(())
}

/// Fast path: map each byte of `s` to its byte-level character without any
/// alignment tracking. Reads the pre-encoded UTF-8 table directly, so the
/// loop does no HashMap lookup and no per-char encoding.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = Vec::with_capacity(s.len() * 2);
    for &byte in s.as_bytes() {
        let entry = &BYTES_CHAR_UTF8[usize::from(byte)];
        out.extend_from_slice(&entry.bytes[..usize::from(entry.len)]);
    }
    // SAFETY: every table entry was produced by `char::encode_utf8`, and a
    // concatenation of valid UTF-8 encodings is itself valid UTF-8.
    Ok(unsafe { String::from_utf8_unchecked(out) })
}
}

#[cfg(test)]
Expand Down Expand Up @@ -171,4 +220,15 @@ mod tests {
]
);
}

#[test]
fn normalize_str_matches_normalize() {
    let bl = ByteLevel::new();
    // Cover ASCII-only, multi-byte UTF-8, the empty string, and control
    // bytes, checking the fast path against the alignment-tracking path.
    for input in &["Hello", "Hello 我今天能为你做什么", "", "abc\x00\x01\x7f"] {
        let mut ns = NormalizedString::from(*input);
        bl.normalize(&mut ns).unwrap();
        let fast = bl.normalize_str(input).unwrap();
        assert_eq!(ns.get(), fast, "mismatch for input: {input:?}");
    }
}
}
19 changes: 19 additions & 0 deletions tokenizers/src/normalizers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,25 @@ impl Normalizer for NormalizerWrapper {
Self::ByteLevel(lc) => lc.normalize(normalized),
}
}

/// Dispatch `normalize_str` to the wrapped normalizer so each variant's
/// fast path (when it overrides the default) is actually reached.
fn normalize_str(&self, s: &str) -> crate::Result<String> {
    match self {
        Self::BertNormalizer(n) => n.normalize_str(s),
        Self::StripNormalizer(n) => n.normalize_str(s),
        Self::StripAccents(n) => n.normalize_str(s),
        Self::NFC(n) => n.normalize_str(s),
        Self::NFD(n) => n.normalize_str(s),
        Self::NFKC(n) => n.normalize_str(s),
        Self::NFKD(n) => n.normalize_str(s),
        Self::Sequence(n) => n.normalize_str(s),
        Self::Lowercase(n) => n.normalize_str(s),
        Self::Nmt(n) => n.normalize_str(s),
        Self::Precompiled(n) => n.normalize_str(s),
        Self::Replace(n) => n.normalize_str(s),
        Self::Prepend(n) => n.normalize_str(s),
        Self::ByteLevel(n) => n.normalize_str(s),
    }
}
}

impl_enum_from!(BertNormalizer, NormalizerWrapper, BertNormalizer);
Expand Down
7 changes: 7 additions & 0 deletions tokenizers/src/normalizers/prepend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ impl Normalizer for Prepend {
}
Ok(())
}
/// Prepend `self.prepend` to any non-empty input; empty input stays
/// empty, mirroring `normalize`.
fn normalize_str(&self, s: &str) -> Result<String> {
    if s.is_empty() {
        return Ok(String::new());
    }
    let mut out = String::with_capacity(self.prepend.len() + s.len());
    out.push_str(&self.prepend);
    out.push_str(s);
    Ok(out)
}
}

#[cfg(test)]
Expand Down
8 changes: 8 additions & 0 deletions tokenizers/src/normalizers/strip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ impl Normalizer for Strip {

Ok(())
}
/// Trim whitespace from whichever side(s) this normalizer is configured
/// for, then return an owned copy of the remaining slice.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = s;
    if self.strip_left {
        out = out.trim_start();
    }
    if self.strip_right {
        out = out.trim_end();
    }
    Ok(out.to_owned())
}
}

// This normalizer removes combining marks from a normalized string
Expand All @@ -53,6 +58,9 @@ impl Normalizer for StripAccents {
normalized.filter(|c| !is_combining_mark(c));
Ok(())
}
/// Drop Unicode combining marks, keeping every other char unchanged.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        if !is_combining_mark(c) {
            out.push(c);
        }
    }
    Ok(out)
}
}

#[cfg(test)]
Expand Down
28 changes: 28 additions & 0 deletions tokenizers/src/normalizers/unicode.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
use unicode_normalization_alignments::UnicodeNormalization;

#[derive(Default, Copy, Clone, Debug)]
#[macro_rules_attribute(impl_serde_type!)]
Expand All @@ -9,6 +10,9 @@ impl Normalizer for NFD {
normalized.nfd();
Ok(())
}
/// Apply NFD decomposition, discarding the per-char alignment shifts.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = String::with_capacity(s.len());
    for (c, _) in s.nfd() {
        out.push(c);
    }
    Ok(out)
}
}

#[derive(Default, Copy, Clone, Debug)]
Expand All @@ -19,6 +23,9 @@ impl Normalizer for NFKD {
normalized.nfkd();
Ok(())
}
/// Apply NFKD decomposition, discarding the per-char alignment shifts.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = String::with_capacity(s.len());
    for (c, _) in s.nfkd() {
        out.push(c);
    }
    Ok(out)
}
}

#[derive(Default, Copy, Clone, Debug)]
Expand All @@ -29,6 +36,9 @@ impl Normalizer for NFC {
normalized.nfc();
Ok(())
}
/// Apply NFC composition, discarding the per-char alignment shifts.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = String::with_capacity(s.len());
    for (c, _) in s.nfc() {
        out.push(c);
    }
    Ok(out)
}
}

#[derive(Default, Copy, Clone, Debug)]
Expand All @@ -39,6 +49,9 @@ impl Normalizer for NFKC {
normalized.nfkc();
Ok(())
}
/// Apply NFKC composition, discarding the per-char alignment shifts.
fn normalize_str(&self, s: &str) -> Result<String> {
    let mut out = String::with_capacity(s.len());
    for (c, _) in s.nfkc() {
        out.push(c);
    }
    Ok(out)
}
}

fn do_nmt(normalized: &mut NormalizedString) {
Expand Down Expand Up @@ -80,6 +93,21 @@ impl Normalizer for Nmt {
do_nmt(normalized);
Ok(())
}
/// Fast-path equivalent of `do_nmt`: drop the control characters that
/// `do_nmt` filters out, then map the same set of whitespace-like code
/// points to a plain ASCII space.
///
/// NOTE(review): the two code-point lists below must stay in sync with
/// `do_nmt` above — they are duplicated here, not shared.
fn normalize_str(&self, s: &str) -> Result<String> {
    Ok(s.chars()
        // Control characters removed entirely.
        .filter(|c| {
            !matches!(
                *c as u32,
                0x0001..=0x0008 | 0x000B | 0x000E..=0x001F | 0x007F | 0x008F | 0x009F
            )
        })
        // Whitespace/separator code points collapsed to a single space.
        .map(|c| match c as u32 {
            0x0009 | 0x000A | 0x000C | 0x000D | 0x1680 | 0x200B..=0x200F | 0x2028
            | 0x2029 | 0x2581 | 0xFEFF | 0xFFFD => ' ',
            _ => c,
        })
        .collect())
}
}

#[cfg(test)]
Expand Down
12 changes: 12 additions & 0 deletions tokenizers/src/normalizers/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ impl Normalizer for Sequence {
}
Ok(())
}

/// Run every child normalizer in order, feeding each one the output of
/// the previous; the first error short-circuits the chain.
fn normalize_str(&self, s: &str) -> Result<String> {
    self.normalizers
        .iter()
        .try_fold(s.to_owned(), |current, normalizer| {
            normalizer.normalize_str(&current)
        })
}
}

/// Lowercases the input
Expand All @@ -57,4 +65,8 @@ impl Normalizer for Lowercase {
normalized.lowercase();
Ok(())
}

/// Lowercase the whole string via `str::to_lowercase`.
///
/// NOTE(review): `str::to_lowercase` applies the context-sensitive final
/// sigma rule (word-final 'Σ' → 'ς'), while per-char lowering does not.
/// Confirm `NormalizedString::lowercase` (used by `normalize`) agrees for
/// such inputs so this fast path cannot diverge from the slow path.
fn normalize_str(&self, s: &str) -> Result<String> {
    Ok(s.to_lowercase())
}
}
41 changes: 35 additions & 6 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,7 @@ impl AddedVocabulary {

if token.normalized {
if let Some(n) = normalizer {
let mut s = NormalizedString::from(token.content.as_ref());
n.normalize(&mut s)?;
let normed = s.get().to_string();
let normed = n.normalize_str(&token.content)?;
if normed != token.content {
self.normalized_cache.insert(new_id, normed);
}
Expand Down Expand Up @@ -360,9 +358,7 @@ impl AddedVocabulary {
for (id, token) in &self.added_tokens_map_r {
if token.normalized {
if let Some(n) = normalizer {
let mut s = NormalizedString::from(token.content.as_ref());
n.normalize(&mut s)?;
let normed = s.get().to_string();
let normed = n.normalize_str(&token.content)?;
if normed != token.content {
self.normalized_cache.insert(*id, normed);
}
Expand Down Expand Up @@ -562,6 +558,39 @@ impl AddedVocabulary {

pretokenized
}

/// Like [`Self::extract_and_normalize`] but uses [`Normalizer::normalize_str`]
/// instead of [`Normalizer::normalize`], skipping alignment tracking.
///
/// This is used by `encode_fast` where offsets are not needed: the
/// normalization step swaps in the normalized text via `set_normalized`,
/// which does not preserve a real mapping back to the original string, so
/// any offsets derived from the result must not be surfaced to callers.
pub fn extract_and_normalize_fast<N: Normalizer>(
    &self,
    normalizer: Option<&N>,
    sequence: &str,
) -> PreTokenizedString {
    let mut pretokenized: PreTokenizedString = sequence.into();

    // 1. Extract non-normalized added tokens from the raw string, so the
    //    normalizer never sees (and never alters) their content.
    pretokenized
        .split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
        .expect("AddedVocabulary bad split");

    // 2. Normalize the remaining pieces via normalize_str (no alignment
    //    tracking) and extract the added tokens that match after
    //    normalization.
    pretokenized
        .split(|_, mut sequence| {
            if let Some(n) = normalizer {
                let normed = n.normalize_str(sequence.get())?;
                sequence.set_normalized(normed);
            }
            Ok(self.split_with_indices(sequence, &self.split_normalized_trie))
        })
        .expect("AddedVocabulary bad split");

    pretokenized
}
}

impl Default for AddedVocabulary {
Expand Down
22 changes: 19 additions & 3 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@ pub type Offsets = (usize, usize);
/// Takes care of pre-processing strings.
pub trait Normalizer: Sync {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;

    /// Normalize a plain string, returning the result without tracking
    /// offsets.
    ///
    /// The default implementation allocates a full [`NormalizedString`]
    /// (with alignment vectors) and delegates to [`Normalizer::normalize`].
    /// Normalizers that can produce their output more cheaply should
    /// override this to avoid that overhead.
    fn normalize_str(&self, s: &str) -> Result<String> {
        let mut normalized = NormalizedString::from(s);
        self.normalize(&mut normalized)?;
        Ok(normalized.get().to_owned())
    }
}

/// The `PreTokenizer` is in charge of doing the pre-segmentation step. It splits the given string
Expand Down Expand Up @@ -731,10 +742,15 @@ where
type_id: u32,
offsets_type: OffsetType,
) -> Result<Encoding> {
let fast = matches!(offsets_type, OffsetType::None);
let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result<Encoding> {
let normalized = self
.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq);
let normalized = if fast {
self.added_vocabulary
.extract_and_normalize_fast(self.normalizer.as_ref(), subseq)
} else {
self.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq)
};
let pre_tokenized = self.do_pre_tokenize(normalized)?;
let subseq_encoding = self.do_tokenize(
pre_tokenized,
Expand Down
12 changes: 12 additions & 0 deletions tokenizers/src/tokenizer/normalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,18 @@ impl NormalizedString {
&self.normalized
}

/// Replace the normalized content without tracking real alignments.
///
/// This is cheaper than going through `transform()` because it skips the
/// alignment bookkeeping against the *original* string. It still builds a
/// trivial identity mapping — one `(i, i + 1)` pair per byte of the new
/// content — so that `slice()` and other length-driven operations keep
/// working, but those pairs index into the new string: no mapping back to
/// the original is preserved. Only use this when offset tracking is not
/// needed (e.g. `encode_fast`).
pub fn set_normalized(&mut self, new: String) {
    // A plain byte-index range is clearer (and rustfmt-clean) compared to
    // enumerating `new.as_bytes()` only to discard each byte.
    self.alignments = (0..new.len()).map(|i| (i, i + 1)).collect();
    self.normalized = new;
}

/// Return the original string
pub fn get_original(&self) -> &str {
&self.original
Expand Down
Loading