Commit 9d41dfc

Commit message: fixes
Parent: 3b61fec

8 files changed: +91 additions, -93 deletions


model2vec/distill/distillation.py

Lines changed: 9 additions & 13 deletions
@@ -7,15 +7,14 @@
 
 import numpy as np
 from huggingface_hub.hf_api import model_info
-from sklearn.cluster import KMeans
 from transformers import AutoModel, AutoTokenizer
 from transformers.modeling_utils import PreTrainedModel
 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 
 from model2vec.distill.inference import PCADimType, create_embeddings, post_process_embeddings
 from model2vec.distill.utils import select_optimal_device
 from model2vec.model import StaticModel
-from model2vec.quantization import DType, quantize_embeddings
+from model2vec.quantization import DType, quantize_embeddings, quantize_vocabulary
 from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
 
 logger = logging.getLogger(__name__)
@@ -58,6 +57,7 @@ def distill_from_model(
         If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error.
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
+    :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
     :return: A StaticModel
     :raises: ValueError if the vocabulary is empty after preprocessing.
@@ -118,19 +118,14 @@ def distill_from_model(
 
     if vocabulary_quantization is not None:
         _, weights = post_process_embeddings(np.asarray(embeddings), None, sif_coefficient=sif_coefficient)
-        km = KMeans(vocabulary_quantization, random_state=42)
-        km.fit(embeddings)
-        clustered_embeddings = km.predict(embeddings)
-        mapping = {idx: int(x) for idx, x in enumerate(clustered_embeddings)}
-
-        embeddings = km.cluster_centers_
+        embeddings, token_mapping, weights = quantize_vocabulary(
+            n_clusters=vocabulary_quantization, weights=weights, embeddings=np.asarray(embeddings)
+        )
         embeddings, _ = post_process_embeddings(embeddings, pca_dims, sif_coefficient=sif_coefficient)
     else:
         # Post-process the embeddings.
-        embeddings, weights = post_process_embeddings(
-            np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient
-        )
-        mapping = {idx: idx for idx in range(len(all_tokens))}
+        embeddings, weights = post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
+        token_mapping = None
     # Quantize the embeddings.
     embeddings = quantize_embeddings(embeddings, quantize_to)
@@ -165,7 +160,7 @@ def distill_from_model(
     return StaticModel(
         vectors=embeddings,
         weights=weights,
-        token_mapping=mapping,
+        token_mapping=token_mapping,
         tokenizer=backend_tokenizer,
         config=config,
         base_model_name=model_name,
@@ -254,6 +249,7 @@ def distill(
     :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components.
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
+    :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
     :return: A StaticModel
 
     """

model2vec/hf_utils.py

Lines changed: 1 addition & 5 deletions
@@ -36,6 +36,7 @@ def save_pretrained(
     :param config: A metadata config.
     :param create_model_card: Whether to create a model card.
     :param subfolder: The subfolder to save the model in.
+    :param weights: The weights of the model. If None, no weights are saved.
     :param **kwargs: Any additional arguments.
     """
     folder_path = folder_path / subfolder if subfolder else folder_path
@@ -195,11 +196,6 @@ def load_pretrained(
     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))
 
-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
-
     return embeddings, tokenizer, config, metadata, weights

model2vec/model.py

Lines changed: 53 additions & 39 deletions
@@ -5,14 +5,14 @@
 from logging import getLogger
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, Iterator, Sequence, Union, overload
+from typing import Any, Iterator, Sequence, Union, cast, overload
 
 import numpy as np
 from joblib import delayed
 from tokenizers import Encoding, Tokenizer
 from tqdm import tqdm
 
-from model2vec.quantization import DType, quantize_and_reduce_dim, vocabulary_quantization
+from model2vec.quantization import DType, quantize_and_reduce_dim, quantize_vocabulary
 from model2vec.utils import ProgressParallel, load_local_model
 
 PathLike = Union[Path, str]
@@ -25,12 +25,12 @@ def __init__(
         self,
         vectors: np.ndarray,
         tokenizer: Tokenizer,
-        weights: np.ndarray | None = None,
-        token_mapping: dict[int, int] | None = None,
         config: dict[str, Any] | None = None,
         normalize: bool | None = None,
         base_model_name: str | None = None,
         language: list[str] | None = None,
+        weights: np.ndarray | None = None,
+        token_mapping: list[int] | None = None,
     ) -> None:
         """
         Initialize the StaticModel.
@@ -41,6 +41,12 @@ def __init__(
         :param normalize: Whether to normalize the embeddings.
         :param base_model_name: The used base model name. Used for creating a model card.
         :param language: The language of the model. Used for creating a model card.
+        :param weights: The weights to use for the embeddings. If None, no weights are used.
+            We always assume the norm of the embeddings is an implicit weight anyway.
+            This is only used for models that have undergone vocabulary quantization.
+        :param token_mapping: A mapping from token ids to indices in the vectors.
+            If None, we don't remap the tokens during inference.
+            This is only used for models that have undergone vocabulary quantization.
         :raises: ValueError if the number of tokens does not match the number of vectors.
         """
         super().__init__()
@@ -55,7 +61,9 @@ def __init__(
 
         self.embedding = vectors
         self.weights = weights
-        self.token_mapping = token_mapping
+        # Convert to an array for fast lookups
+        # We can't use or short circuit here because np.ndarray as booleans are ambiguous.
+        self.token_mapping = None if token_mapping is None else np.asarray(token_mapping)
 
         self.tokenizer = tokenizer
         self.unk_token_id: int | None
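
To illustrate why token_mapping is stored as an array: a minimal numpy sketch (toy sizes are assumptions) of the vectorized lookup this enables.

import numpy as np

centroids = np.arange(12, dtype=np.float32).reshape(4, 3)  # 4 cluster centers of dimension 3
token_mapping = np.asarray([0, 2, 2, 1, 3, 0])             # 6 vocabulary ids -> 4 centroid rows
token_ids = [1, 4, 5]
# One fancy-indexing lookup replaces the previous per-token dict.get() calls:
print(centroids[token_mapping[token_ids]])                 # rows 2, 3 and 0 of the centroid table
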
@@ -114,7 +122,7 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None, subfold
         from model2vec.hf_utils import save_pretrained
 
         if self.token_mapping is not None:
-            self.config["token_mapping"] = list(self.token_mapping.items())
+            self.config["token_mapping"] = self.token_mapping.tolist()
 
         save_pretrained(
             folder_path=Path(path),
@@ -167,7 +175,7 @@ def from_pretrained(
         subfolder: str | None = None,
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
-        quantize_vocabulary: int | None = None,
+        vocabulary_quantization: int | None = None,
     ) -> StaticModel:
         """
         Load a StaticModel from a local path or huggingface hub path.
@@ -183,6 +191,7 @@ def from_pretrained(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
            Note that this only applies if you have trained your model using mrl or PCA.
+        :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
         :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
@@ -194,31 +203,27 @@ def from_pretrained(
             subfolder=subfolder,
         )
 
+        # Quantize the vocabulary at full precision and dimensionality
+        if vocabulary_quantization is not None:
+            embeddings, token_mapping, weights = quantize_vocabulary(
+                n_clusters=vocabulary_quantization, weights=weights, embeddings=embeddings
+            )
+        else:
+            token_mapping = config.pop("token_mapping", None)
+
+        # Reduce dimensionality and quantize if requested
         embeddings = quantize_and_reduce_dim(
             embeddings=embeddings,
             quantize_to=quantize_to,
             dimensionality=dimensionality,
         )
 
-        if quantize_vocabulary is not None:
-            embeddings, token_mapping, weights = vocabulary_quantization(
-                n_clusters=quantize_vocabulary, weights=weights, embeddings=embeddings
-            )
-        else:
-            token_mapping = config.pop("token_mapping", None)
-            if isinstance(token_mapping, list):
-                # If the token mapping is a list, convert it to a dict
-                token_mapping = {int(k): int(v) for k, v in token_mapping}
-            elif token_mapping is None:
-                # If no token mapping is provided, use the default mapping
-                token_mapping = {i: i for i in range(len(embeddings))}
-
         return cls(
-            embeddings,
-            tokenizer,
-            weights,
-            token_mapping,
-            config,
+            vectors=embeddings,
+            tokenizer=tokenizer,
+            weights=weights,
+            token_mapping=token_mapping,
+            config=config,
             normalize=normalize,
             base_model_name=metadata.get("base_model"),
             language=metadata.get("language"),
@@ -232,6 +237,7 @@ def from_sentence_transformers(
         normalize: bool | None = None,
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
+        vocabulary_quantization: int | None = None,
     ) -> StaticModel:
         """
         Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -246,6 +252,7 @@ def from_sentence_transformers(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
+        :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
         :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
@@ -254,26 +261,29 @@ def from_sentence_transformers(
             folder_or_repo_path=path,
             token=token,
             from_sentence_transformers=True,
-            subfolder=None,
         )
 
+        # Quantize the vocabulary at full precision and dimensionality
+        if vocabulary_quantization is not None:
+            embeddings, token_mapping, weights = quantize_vocabulary(
+                n_clusters=vocabulary_quantization, weights=weights, embeddings=embeddings
+            )
+        else:
+            token_mapping = config.pop("token_mapping", None)
+
+        # Reduce dimensionality and quantize if requested
         embeddings = quantize_and_reduce_dim(
             embeddings=embeddings,
             quantize_to=quantize_to,
             dimensionality=dimensionality,
        )
 
-        token_mapping = config.pop("token_mapping", None)
-        if token_mapping is None:
-            # If no token mapping is provided, use the default mapping
-            token_mapping = {i: i for i in range(len(embeddings))}
-
         return cls(
-            embeddings,
-            tokenizer,
-            weights,
-            token_mapping,
-            config,
+            vectors=embeddings,
+            tokenizer=tokenizer,
+            weights=weights,
+            token_mapping=token_mapping,
+            config=config,
             normalize=normalize,
             base_model_name=metadata.get("base_model"),
             language=metadata.get("language"),
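
A hedged loading sketch for the new keyword on from_pretrained and from_sentence_transformers; the repo id, cluster count, and target dtype are illustrative assumptions, not part of the diff.

# Hypothetical usage (repo id, cluster count and dtype are assumptions):
from model2vec import StaticModel

model = StaticModel.from_pretrained(
    "minishlab/potion-base-8M",
    vocabulary_quantization=4096,  # cluster the vocabulary at full precision and dimensionality
    quantize_to="int8",            # then quantize the (much smaller) centroid table
)
embeddings = model.encode(["It's dangerous to go alone!"])
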
@@ -446,10 +456,11 @@ def _encode_batch(self, sentences: Sequence[str], max_length: int | None) -> np.
         out: list[np.ndarray] = []
         for id_list in ids:
             if id_list:
+                id_list_remapped: list[int] | np.ndarray
                 if self.token_mapping is None:
                     id_list_remapped = id_list
                 else:
-                    id_list_remapped = [self.token_mapping.get(token_id, token_id) for token_id in id_list]
+                    id_list_remapped = self.token_mapping[id_list]
                 emb = self.embedding[id_list_remapped]
                 if self.weights is not None:
                     emb = emb * self.weights[id_list][:, None]
@@ -512,6 +523,9 @@ def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
         if not path.is_dir():
             raise ValueError(f"Path {path} is not a directory.")
 
-        embeddings, tokenizer, config = load_local_model(path)
+        embeddings, tokenizer, config, weights = load_local_model(path)
+        token_mapping = cast(list[int], config.pop("token_mapping", None))
 
-        return StaticModel(embeddings, tokenizer, config=config)
+        return StaticModel(
+            vectors=embeddings, tokenizer=tokenizer, config=config, weights=weights, token_mapping=token_mapping
+        )
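
A numpy sketch (toy shapes are assumptions) of the encode-time math after this change: embeddings are looked up through the remapped ids, while the per-token weights stay indexed by the original token ids.

import numpy as np

rng = np.random.default_rng(0)
centroids = rng.normal(size=(4, 3)).astype(np.float32)  # quantized embedding table
weights = rng.random(6).astype(np.float32)              # one weight per original token
mapping = np.asarray([0, 2, 2, 1, 3, 0])                # token id -> centroid row

id_list = [1, 4, 5]                                      # ids produced by the tokenizer
emb = centroids[mapping[id_list]] * weights[id_list][:, None]
sentence_vector = emb.mean(axis=0)                       # mean over the token vectors
print(sentence_vector.shape)                             # (3,)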

model2vec/quantization.py

Lines changed: 4 additions & 3 deletions
@@ -64,9 +64,9 @@ def quantize_and_reduce_dim(
     return embeddings
 
 
-def vocabulary_quantization(
+def quantize_vocabulary(
     n_clusters: int, weights: np.ndarray | None, embeddings: np.ndarray
-) -> tuple[np.ndarray, dict[int, int], np.ndarray]:
+) -> tuple[np.ndarray, list[int], np.ndarray]:
     """Quantize the vocabulary of embeddings using KMeans clustering."""
     # If the model does not have weights, we assume the norm to be informative.
     if weights is None:
@@ -80,7 +80,8 @@ def vocabulary_quantization(
     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
     kmeans.fit(embeddings)
     # Create a mapping from the original token index to the cluster index
-    token_mapping = {idx: x for idx, x in enumerate(kmeans.predict(embeddings))}
+    # Make sure to convert to list, otherwise we get np.int32 which is not jsonable.
+    token_mapping = cast(list[int], kmeans.predict(embeddings).tolist())
     # The cluster centers are the new embeddings.
     embeddings = kmeans.cluster_centers_
 
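
A standalone sketch of what quantize_vocabulary does, pieced together from this hunk; the norm-based weight handling is an assumption inferred from the comment in the diff, and all sizes are toy values.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 32)).astype(np.float32)  # original vocabulary vectors
weights = np.linalg.norm(embeddings, axis=1)                 # assumption: norms act as implicit weights
embeddings = embeddings / weights[:, None]                   # assumption: cluster unit-norm vectors

kmeans = KMeans(n_clusters=64, random_state=42).fit(embeddings)
token_mapping = kmeans.predict(embeddings).tolist()          # token index -> cluster index (JSON-safe list)
centroids = kmeans.cluster_centers_                          # the new, much smaller embedding table
print(centroids.shape, len(token_mapping))                   # (64, 32) 1000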

model2vec/train/base.py

Lines changed: 4 additions & 2 deletions
@@ -33,6 +33,8 @@ def __init__(
         :param tokenizer: The tokenizer.
         :param out_dim: The output dimension of the head.
         :param pad_id: The padding id. This is set to 0 in almost all model2vec models
+        :param token_mapping: The token mapping. If None, the token mapping is set to the range of the number of vectors.
+        :param weights: The weights of the model. If None, the weights are initialized to zeros.
         """
         super().__init__()
         self.pad_id = pad_id
@@ -82,7 +84,7 @@ def from_static_model(cls: type[ModelType], *, model: StaticModel, out_dim: int
         weights = torch.from_numpy(model.weights) if model.weights is not None else None
         embeddings_converted = torch.from_numpy(model.embedding)
         if model.token_mapping is not None:
-            token_mapping = [i for _, i in sorted(model.token_mapping.items(), key=lambda x: x[0])]
+            token_mapping = model.token_mapping.tolist()
         else:
             token_mapping = None
         return cls(
@@ -148,7 +150,7 @@ def to_static_model(self) -> StaticModel:
         """Convert the model to a static model."""
         emb = self.embeddings.weight.detach().cpu().numpy()
         w = torch.sigmoid(self.w).detach().cpu().numpy()
-        token_mapping = {i: int(token_id) for i, token_id in enumerate(self.token_mapping.tolist())}
+        token_mapping = self.token_mapping.tolist()
 
         return StaticModel(
             vectors=emb, weights=w, tokenizer=self.tokenizer, normalize=True, token_mapping=token_mapping
