Commit e2789ba

turn tokenizer into package

1 parent a972c10 commit e2789ba

9 files changed, +169 -137 lines changed

model2vec/distill/distillation.py

Lines changed: 6 additions & 5 deletions

@@ -9,10 +9,10 @@
 from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerFast

 from model2vec.distill.inference import PCADimType, create_embeddings, post_process_embeddings
-from model2vec.distill.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
 from model2vec.distill.utils import select_optimal_device
 from model2vec.model import StaticModel
 from model2vec.quantization import DType, quantize_embeddings
+from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids

 try:
     # For huggingface_hub>=0.25.0
@@ -91,8 +91,11 @@ def distill_from_model(
         raise ValueError("The vocabulary is empty after preprocessing. Please check your token_remove_pattern.")

     # Create the embeddings.
-    unk_token = tokenizer.special_tokens_map.get("unk_token")
-    pad_token = tokenizer.special_tokens_map.get("pad_token")
+    unk_token: str | None = tokenizer.special_tokens_map.get("unk_token")
+    pad_token: str | None = tokenizer.special_tokens_map.get("pad_token")
+
+    # Add the cleaned vocabulary to the tokenizer.
+    backend_tokenizer = replace_vocabulary(backend_tokenizer, all_tokens, unk_token=unk_token, pad_token=pad_token)

     # Convert tokens to IDs
     token_ids = turn_tokens_into_ids(all_tokens, tokenizer, unk_token)
@@ -101,8 +104,6 @@ def distill_from_model(
         tokenized=token_ids, model=model, device=device, pad_token_id=tokenizer.get_vocab()[pad_token]
     )

-    # Add the cleaned vocabulary to the tokenizer.
-    backend_tokenizer = replace_vocabulary(backend_tokenizer, all_tokens, unk_token=unk_token, pad_token=pad_token)
     # Post process the embeddings by applying PCA and Zipf weighting.
     embeddings = post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
     # Quantize the embeddings.

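The public distillation API is not affected by this refactor; only the internal import path changes, and the cleaned vocabulary is now swapped into the backend tokenizer before the tokens are converted to IDs and embedded. For context, a typical call still looks like the sketch below (following the model2vec README; the model name is only an example):

from model2vec.distill import distill

# Distill a Sentence Transformer into a static model and save it.
m2v_model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=256)
m2v_model.save_pretrained("m2v_model")
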
model2vec/distill/inference.py

Lines changed: 2 additions & 0 deletions

@@ -79,6 +79,8 @@ def create_embeddings(
     intermediate_weights = [intermediate_weights[i] for i in np.argsort(sort_order)]
     out_weights = np.stack(intermediate_weights)

+    out_weights = np.nan_to_num(out_weights)
+
     return out_weights


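The added np.nan_to_num call presumably guards against NaN entries in the stacked weight matrix before it is returned. A standalone illustration of what that call does, independent of model2vec:

import numpy as np

weights = np.array([[0.5, np.nan], [1.0, 2.0]])
# By default, NaN becomes 0.0 and +/-inf becomes a large finite value.
print(np.nan_to_num(weights))  # [[0.5 0. ] [1.  2. ]]
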
model2vec/distill/utils.py

Lines changed: 0 additions & 15 deletions

@@ -1,27 +1,12 @@
 from __future__ import annotations

-import re
-from dataclasses import dataclass
 from logging import getLogger

 import torch

 logger = getLogger(__name__)


-@dataclass
-class Token:
-    """A class to represent a token."""
-
-    form: str
-    # The normalized and pretokenized form of the token
-    normalized_form: str
-    # Whether the word is a continuing subword.
-    is_subword: bool
-    # Whether the token is internal to the model.
-    is_internal: bool
-
-
 def select_optimal_device(device: str | None) -> str:
     """
     Guess what your optimal device should be based on backend availability.

model2vec/tokenizer/__init__.py

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+from model2vec.utils import importable
+
+importable("transformers", "tokenizer")
+
+from model2vec.tokenizer.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
+
+__all__ = ["clean_and_create_vocabulary", "turn_tokens_into_ids", "replace_vocabulary"]

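For downstream code, the visible effect of the new package is the import path: the helpers that used to live in model2vec.distill.tokenizer are now imported from model2vec.tokenizer, behind the importable check on transformers. A minimal sketch:

# New import path introduced by this commit; requires transformers to be installed.
from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
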
model2vec/tokenizer/datamodels.py

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Token:
+    """A class to represent a token."""
+
+    form: str
+    # The normalized and pretokenized form of the token
+    normalized_form: str
+    # Whether the word is a continuing subword.
+    is_subword: bool
+    # Whether the token is internal to the model.
+    is_internal: bool

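The Token dataclass is moved verbatim out of model2vec/distill/utils.py (see the deletion above) into the new package. A small construction example; the field values below are illustrative only:

from model2vec.tokenizer.datamodels import Token

# A hypothetical word-initial token that is part of the model's own vocabulary.
token = Token(form="hello", normalized_form="▁hello", is_subword=False, is_internal=True)
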
model2vec/tokenizer/model.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+from typing import Any
+
+import numpy as np
+
+
+def process_tokenizer(
+    tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None
+) -> dict[str, Any]:
+    """Process the WordPiece tokenizer JSON."""
+    tokenizer_json["model"]["type"] = "Unigram"
+    tokenizer_json["model"]["unk_id"] = pre_tokenized_tokens.index(unk_token) if unk_token else None
+
+    token_weights = np.asarray([_calculate_token_weight_for_unigram(token) for token in pre_tokenized_tokens])
+    proba = (token_weights / np.sum(token_weights)).tolist()
+    tokenizer_json["model"]["vocab"] = [(token, np.log(p)) for token, p in zip(pre_tokenized_tokens, proba)]
+
+    return tokenizer_json
+
+
+def process_unigram(tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str) -> dict[str, Any]:
+    """Process the Unigram tokenizer JSON."""
+    current_probas = dict(tokenizer_json["model"]["vocab"])
+    avg_proba = sum(current_probas.values()) / len(current_probas)
+    new_probas = [[word, current_probas.get(word, avg_proba)] for word in pre_tokenized_tokens]
+    tokenizer_json["model"]["vocab"] = new_probas
+
+    tokens, _ = zip(*tokenizer_json["model"]["vocab"])
+    tokenizer_json["model"]["unk_id"] = list(tokens).index(unk_token)
+
+    return tokenizer_json
+
+
+def _calculate_token_weight_for_unigram(token: str) -> float:
+    """Calculate the token weight for Unigram."""
+    # Always prefer longer tokens.
+    return len(token) + int(token.startswith("▁"))

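_calculate_token_weight_for_unigram favors longer tokens and adds one point for tokens that start with the metaspace "▁"; process_tokenizer then normalizes these weights into probabilities and stores their logs as the Unigram vocab scores. A standalone worked example of that arithmetic (the token list is made up):

import numpy as np

tokens = ["▁the", "▁cat", "s"]
# len(token) + 1 if it starts with "▁": weights are 5, 5 and 1.
weights = np.asarray([len(t) + int(t.startswith("▁")) for t in tokens])
proba = weights / weights.sum()  # [5/11, 5/11, 1/11]
vocab = [(t, float(np.log(p))) for t, p in zip(tokens, proba)]
print(vocab)  # longer, word-initial tokens get less negative log-probabilities
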
model2vec/tokenizer/normalizer.py

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+from string import punctuation
+
+from tokenizers import Regex
+from tokenizers.normalizers import Normalizer, Replace, Sequence, Strip
+
+
+def prepare_normalizer(
+    normalizer: Normalizer,
+) -> Normalizer:
+    """
+    Prepare the normalizer for the tokenizer.
+
+    This function sets the normalizer for the tokenizer based on the provided normalizer type.
+    If no normalizer is provided, it uses the default one.
+
+    :param normalizer: The tokenizer to prepare.
+    :return: The prepared tokenizer.
+    """
+    new_normalizers = []
+    for char in punctuation:
+        new_normalizers.append(Replace(char, f" {char} "))
+
+    new_normalizers.append(Replace(Regex(r"\s+"), " "))
+    new_normalizers.append(Strip(right=True))
+    if normalizer is None:
+        return Sequence(new_normalizers)
+
+    return Sequence([normalizer] + new_normalizers)
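
prepare_normalizer appends rules that pad every punctuation character with spaces, collapse runs of whitespace, and strip trailing whitespace, chaining them after an existing normalizer if one is given. A small sketch of the effect, assuming the tokenizers library is installed (passing None is accepted by the body even though the annotation says Normalizer):

from model2vec.tokenizer.normalizer import prepare_normalizer

normalizer = prepare_normalizer(None)
# Punctuation gets surrounded by spaces, whitespace is collapsed, and the right side is stripped.
print(normalizer.normalize_str("hello, world!"))  # "hello , world !"
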
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+from typing import Any
+
+_FORBIDDEN_PRETOKENIZERS = (
+    "WhiteSpace",
+    "WhitespaceSplit",
+    "BertPreTokenizer",
+    "CharDelimiterSplit",
+    "Punctuation",
+    "Split",
+    "UnicodeScripts",
+)
+_BASIC_METASPACE = {"type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": False}
+
+
+def _fix_single_pretokenizer(pre_tokenizer: dict[str, Any]) -> dict[str, Any] | None:
+    """Fixes a single pretokenizer to allow multiword units."""
+    if pre_tokenizer["type"] in _FORBIDDEN_PRETOKENIZERS:
+        return None
+    if pre_tokenizer["type"] == "ByteLevel":
+        pre_tokenizer["add_prefix_space"] = True
+        pre_tokenizer["use_regex"] = False
+    if pre_tokenizer["type"] == "Metaspace":
+        pre_tokenizer["split"] = False
+        pre_tokenizer["prepend_scheme"] = "always"
+
+    return pre_tokenizer
+
+
+def fix_pretokenizer(pretokenizer: dict[str, Any] | None) -> dict[str, Any]:
+    """Fixes a single pretokenizer to allow multiword units."""
+    if pretokenizer is None:
+        return _BASIC_METASPACE
+
+    if pretokenizer["type"] == "Sequence":
+        new_pretokenizers = []
+        for single_pretokenizer in pretokenizer["pretokenizers"]:
+            new_pretokenizer = _fix_single_pretokenizer(single_pretokenizer)
+            if new_pretokenizer is not None:
+                new_pretokenizers.append(new_pretokenizer)
+        pretokenizer["pretokenizers"] = new_pretokenizers
+
+        if not pretokenizer:
+            return _BASIC_METASPACE
+
+        return pretokenizer
+
+    single_pretokenizer = _fix_single_pretokenizer(pretokenizer)
+    if single_pretokenizer is None:
+        return _BASIC_METASPACE
+
+    return single_pretokenizer

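The final file in this diff (its path is cut off in this extract) rewrites a tokenizer.json pre_tokenizer so that multiword units survive: splitting pretokenizers are dropped, ByteLevel and Metaspace are switched to non-splitting settings, and a bare non-splitting Metaspace is used as a fallback. A small sketch on made-up pre_tokenizer dicts; the module path in the import is an assumption:

# Assumed module path, since the file name is not visible in this extract.
from model2vec.tokenizer.pretokenizer import fix_pretokenizer

byte_level = {"type": "ByteLevel", "add_prefix_space": False, "use_regex": True}
print(fix_pretokenizer(byte_level))
# {'type': 'ByteLevel', 'add_prefix_space': True, 'use_regex': False}

# A forbidden (splitting) pretokenizer falls back to the basic non-splitting Metaspace.
print(fix_pretokenizer({"type": "BertPreTokenizer"}))
# {'type': 'Metaspace', 'replacement': '▁', 'prepend_scheme': 'always', 'split': False}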