Commit 3880632

fix typing/linting
1 parent 9d41dfc commit 3880632

10 files changed: +265 -103 lines changed

model2vec/distill/distillation.py

Lines changed: 2 additions & 1 deletion

@@ -14,8 +14,9 @@
 from model2vec.distill.inference import PCADimType, create_embeddings, post_process_embeddings
 from model2vec.distill.utils import select_optimal_device
 from model2vec.model import StaticModel
-from model2vec.quantization import DType, quantize_embeddings, quantize_vocabulary
+from model2vec.quantization import DType, quantize_embeddings
 from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
+from model2vec.vocabulary_quantization import quantize_vocabulary
 
 logger = logging.getLogger(__name__)
 
model2vec/model.py

Lines changed: 136 additions & 71 deletions

@@ -12,7 +12,7 @@
 from tokenizers import Encoding, Tokenizer
 from tqdm import tqdm
 
-from model2vec.quantization import DType, quantize_and_reduce_dim, quantize_vocabulary
+from model2vec.quantization import DType
 from model2vec.utils import ProgressParallel, load_local_model
 
 PathLike = Union[Path, str]

@@ -63,7 +63,7 @@ def __init__(
         self.weights = weights
         # Convert to an array for fast lookups
         # We can't use or short circuit here because np.ndarray as booleans are ambiguous.
-        self.token_mapping = None if token_mapping is None else np.asarray(token_mapping)
+        self.token_mapping: np.ndarray | None = None if token_mapping is None else np.asarray(token_mapping)
 
         self.tokenizer = tokenizer
         self.unk_token_id: int | None

@@ -194,39 +194,16 @@ def from_pretrained(
         :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
         :return: A StaticModel.
         """
-        from model2vec.hf_utils import load_pretrained
-
-        embeddings, tokenizer, config, metadata, weights = load_pretrained(
-            folder_or_repo_path=path,
+        return _loading_helper(
+            cls=cls,
+            path=path,
             token=token,
-            from_sentence_transformers=False,
-            subfolder=subfolder,
-        )
-
-        # Quantize the vocabulary at full precision and dimensionality
-        if vocabulary_quantization is not None:
-            embeddings, token_mapping, weights = quantize_vocabulary(
-                n_clusters=vocabulary_quantization, weights=weights, embeddings=embeddings
-            )
-        else:
-            token_mapping = config.pop("token_mapping", None)
-
-        # Reduce dimensionality and quantize if requested
-        embeddings = quantize_and_reduce_dim(
-            embeddings=embeddings,
+            vocabulary_quantization=vocabulary_quantization,
             quantize_to=quantize_to,
             dimensionality=dimensionality,
-        )
-
-        return cls(
-            vectors=embeddings,
-            tokenizer=tokenizer,
-            weights=weights,
-            token_mapping=token_mapping,
-            config=config,
+            from_sentence_transformers=False,
             normalize=normalize,
-            base_model_name=metadata.get("base_model"),
-            language=metadata.get("language"),
+            subfolder=subfolder,
         )
 
     @classmethod

@@ -255,38 +232,16 @@ def from_sentence_transformers(
         :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
         :return: A StaticModel.
         """
-        from model2vec.hf_utils import load_pretrained
-
-        embeddings, tokenizer, config, metadata, weights = load_pretrained(
-            folder_or_repo_path=path,
+        return _loading_helper(
+            cls=cls,
+            path=path,
             token=token,
-            from_sentence_transformers=True,
-        )
-
-        # Quantize the vocabulary at full precision and dimensionality
-        if vocabulary_quantization is not None:
-            embeddings, token_mapping, weights = quantize_vocabulary(
-                n_clusters=vocabulary_quantization, weights=weights, embeddings=embeddings
-            )
-        else:
-            token_mapping = config.pop("token_mapping", None)
-
-        # Reduce dimensionality and quantize if requested
-        embeddings = quantize_and_reduce_dim(
-            embeddings=embeddings,
+            vocabulary_quantization=vocabulary_quantization,
             quantize_to=quantize_to,
             dimensionality=dimensionality,
-        )
-
-        return cls(
-            vectors=embeddings,
-            tokenizer=tokenizer,
-            weights=weights,
-            token_mapping=token_mapping,
-            config=config,
+            from_sentence_transformers=True,
             normalize=normalize,
-            base_model_name=metadata.get("base_model"),
-            language=metadata.get("language"),
+            subfolder=None,
         )
 
     @overload

@@ -381,7 +336,7 @@ def _encode_batch_as_sequence(self, sentences: Sequence[str], max_length: int |
         out: list[np.ndarray] = []
         for id_list in ids:
             if id_list:
-                out.append(self.embedding[id_list])
+                out.append(self._encode_helper(id_list))
             else:
                 out.append(np.zeros((0, self.dim)))
 

@@ -450,23 +405,35 @@ def encode(
             return out_array[0]
         return out_array
 
+    def _encode_helper(self, id_list: list[int]) -> np.ndarray:
+        """
+        Helper function to encode a list of ids.
+
+        This function is used to deduplicate the logic in `encode` and `encode_as_sequence`.
+        It retrieves the embeddings for the given list of ids, applying weights if available.
+
+        :param id_list: A list of token ids.
+        :return: The embeddings for the given ids, as a sequence of vectors.
+        """
+        id_list_remapped: list[int] | np.ndarray
+        if self.token_mapping is None:
+            id_list_remapped = id_list
+        else:
+            id_list_remapped = self.token_mapping[id_list]
+        emb = self.embedding[id_list_remapped]
+        if self.weights is not None:
+            emb = emb * self.weights[id_list][:, None]
+
+        return emb
+
     def _encode_batch(self, sentences: Sequence[str], max_length: int | None) -> np.ndarray:
         """Encode a batch of sentences."""
         ids = self.tokenize(sentences=sentences, max_length=max_length)
         out: list[np.ndarray] = []
         for id_list in ids:
             if id_list:
-                id_list_remapped: list[int] | np.ndarray
-                if self.token_mapping is None:
-                    id_list_remapped = id_list
-                else:
-                    id_list_remapped = self.token_mapping[id_list]
-                emb = self.embedding[id_list_remapped]
-                if self.weights is not None:
-                    emb = emb * self.weights[id_list][:, None]
-                emb = emb.mean(axis=0)
-
-                out.append(emb)
+                emb = self._encode_helper(id_list)
+                out.append(emb.mean(axis=0))
             else:
                 out.append(np.zeros(self.dim))
 

@@ -529,3 +496,101 @@ def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
         return StaticModel(
             vectors=embeddings, tokenizer=tokenizer, config=config, weights=weights, token_mapping=token_mapping
         )
+
+
+def quantize_model(
+    model: StaticModel,
+    vocabulary_quantization: int | None = None,
+    quantize_to: str | DType | None = None,
+    dimensionality: int | None = None,
+) -> StaticModel:
+    """
+    Quantize the model to a lower precision and possibly lower dimensionality.
+
+    :param model: The model to quantize.
+    :param vocabulary_quantization: The number of clusters to use for quantization.
+    :param quantize_to: The dtype to quantize the model to.
+    :param dimensionality: The desired dimensionality of the model.
+        This needs to be < than the current model dimensionality.
+    :return: A new StaticModel with the quantized embeddings.
+    :raises: ValueError if the model is already quantized.
+    """
+    from model2vec.quantization import quantize_and_reduce_dim
+
+    token_mapping: list[int] | None
+    weights: np.ndarray | None
+    if vocabulary_quantization is not None:
+        from model2vec.vocabulary_quantization import quantize_vocabulary
+
+        if len(model.tokens) != len(model.embedding):
+            raise ValueError("Model already has been vocabulary quantized, cannot quantize again.")
+
+        embeddings, token_mapping, weights = quantize_vocabulary(
+            n_clusters=vocabulary_quantization, weights=model.weights, embeddings=model.embedding
+        )
+    else:
+        embeddings = model.embedding
+        token_mapping = cast(list[int], model.token_mapping.tolist()) if model.token_mapping is not None else None
+        weights = model.weights
+    if quantize_to is not None or dimensionality is not None:
+        embeddings = quantize_and_reduce_dim(
+            embeddings=embeddings,
+            quantize_to=quantize_to,
+            dimensionality=dimensionality,
+        )
+
+    return StaticModel(
+        vectors=embeddings,
+        tokenizer=model.tokenizer,
+        config=model.config,
+        weights=weights,
+        token_mapping=token_mapping,
+        normalize=model.normalize,
+        base_model_name=model.base_model_name,
+        language=model.language,
+    )
+
+
+def _loading_helper(
+    cls: type[StaticModel],
+    path: PathLike,
+    token: str | None,
+    vocabulary_quantization: int | None = None,
+    quantize_to: str | DType | None = None,
+    dimensionality: int | None = None,
+    from_sentence_transformers: bool = False,
+    normalize: bool | None = None,
+    subfolder: str | None = None,
+) -> StaticModel:
+    """Helper function to load a model from a directory."""
+    from model2vec.hf_utils import load_pretrained
+
+    if from_sentence_transformers and subfolder is not None:
+        raise ValueError("Subfolder is not supported for sentence transformers models.")
+
+    embeddings, tokenizer, config, metadata, weights = load_pretrained(
+        folder_or_repo_path=path,
+        token=token,
+        from_sentence_transformers=from_sentence_transformers,
+        subfolder=subfolder,
+    )
+
+    token_mapping = config.pop("token_mapping", None)
+
+    model = cls(
+        vectors=embeddings,
+        tokenizer=tokenizer,
+        weights=weights,
+        token_mapping=token_mapping,
+        config=config,
+        normalize=normalize,
+        base_model_name=metadata.get("base_model"),
+        language=metadata.get("language"),
+    )
+
+    return quantize_model(
+        model=model,
+        vocabulary_quantization=vocabulary_quantization,
+        quantize_to=quantize_to,
+        dimensionality=dimensionality,
    )
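
A minimal usage sketch of the refactored loading path, using only names that appear in this diff (StaticModel.from_pretrained, quantize_model and their parameters); the repo id and the numeric settings are illustrative placeholders, not part of the commit:

from model2vec.model import StaticModel, quantize_model

# Vocabulary quantization now happens while loading: from_pretrained delegates to
# _loading_helper, which constructs the model and then runs quantize_model on it.
model = StaticModel.from_pretrained(
    "minishlab/potion-base-8M",   # placeholder repo id
    vocabulary_quantization=256,  # cluster the vocabulary into 256 centroids
    quantize_to="float16",
    dimensionality=64,
)

# The same post-processing is available for a model that is already in memory.
smaller = quantize_model(model=model, quantize_to="float16", dimensionality=32)
vectors = smaller.encode(["an example sentence"])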

model2vec/quantization.py

Lines changed: 3 additions & 25 deletions

@@ -1,10 +1,12 @@
 from __future__ import annotations
 
+import logging
 from enum import Enum
-from typing import cast
 
 import numpy as np
 
+logger = logging.getLogger(__name__)
+
 
 class DType(str, Enum):
     Float16 = "float16"

@@ -62,27 +64,3 @@ def quantize_and_reduce_dim(
         embeddings = embeddings[:, :dimensionality]
 
     return embeddings
-
-
-def quantize_vocabulary(
-    n_clusters: int, weights: np.ndarray | None, embeddings: np.ndarray
-) -> tuple[np.ndarray, list[int], np.ndarray]:
-    """Quantize the vocabulary of embeddings using KMeans clustering."""
-    # If the model does not have weights, we assume the norm to be informative.
-    if weights is None:
-        weights = cast(np.ndarray, np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-32)
-    # Divide by the norm to normalize the embeddings, so we don't bias the clustering.
-    embeddings = embeddings / weights
-
-    # Quantize the vocabulary
-    from sklearn.cluster import KMeans
-
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-    kmeans.fit(embeddings)
-    # Create a mapping from the original token index to the cluster index
-    # Make sure to convert to list, otherwise we get np.int32 which is not jsonable.
-    token_mapping = cast(list[int], kmeans.predict(embeddings).tolist())
-    # The cluster centers are the new embeddings.
-    embeddings = kmeans.cluster_centers_
-
-    return embeddings, token_mapping, weights
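
With quantize_vocabulary moved to its own module, only dtype and dimensionality reduction remain here. A small sketch of calling the remaining helper directly, with an arbitrary random matrix standing in for real embeddings (the printed shape/dtype is what the code above implies, stated as an expectation rather than a guarantee):

import numpy as np
from model2vec.quantization import DType, quantize_and_reduce_dim

emb = np.random.rand(30_000, 256).astype(np.float32)
# Cast to float16 and keep only the first 64 dimensions.
small = quantize_and_reduce_dim(embeddings=emb, quantize_to=DType.Float16, dimensionality=64)
print(small.shape, small.dtype)  # expected: (30000, 64) float16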

model2vec/utils.py

Lines changed: 0 additions & 4 deletions

@@ -125,8 +125,4 @@ def load_local_model(folder: Path) -> tuple[np.ndarray, Tokenizer, dict[str, str
 
     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
 
-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
     return embeddings, tokenizer, config, weights

model2vec/vocabulary_quantization.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+import logging
+from typing import cast
+
+import numpy as np
+
+# Lazy import
+try:
+    from sklearn.cluster import KMeans
+except ImportError:
+    raise ImportError(
+        "scikit-learn is required for quantizing the vocabulary. "
+        "Please install model2vec with the quantization extra."
+    )
+
+
+logger = logging.getLogger(__name__)
+
+
+def quantize_vocabulary(
+    n_clusters: int, weights: np.ndarray | None, embeddings: np.ndarray
+) -> tuple[np.ndarray, list[int], np.ndarray]:
+    """Quantize the vocabulary of embeddings using KMeans clustering."""
+    logger.info(f"Quantizing vocabulary to {n_clusters} clusters.")
+    # If the model does not have weights, we assume the norm to be informative.
+    if weights is None:
+        weights = cast(np.ndarray, np.linalg.norm(embeddings, axis=1) + 1e-32)
+    # Divide by the norm to normalize the embeddings, so we don't bias the clustering.
+    embeddings = embeddings / weights[:, None]
+
+    # Ensure the embeddings are in float32 for KMeans
+    # Store the original dtype to restore it later
+    orig_dtype = embeddings.dtype
+
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, init="k-means++")
+    cast_embeddings = embeddings.astype(np.float32)
+    # Fit KMeans to the embeddings
+    kmeans.fit(cast_embeddings)
+    # Create a mapping from the original token index to the cluster index
+    # Make sure to convert to list, otherwise we get np.int32 which is not jsonable.
+    token_mapping = cast(list[int], kmeans.predict(cast_embeddings).tolist())
+    # The cluster centers are the new embeddings.
+    # Convert them back to the original dtype
+    embeddings = kmeans.cluster_centers_.astype(orig_dtype)
+
+    return embeddings, token_mapping, weights
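
A toy sketch of the new function's contract, with an arbitrary random matrix standing in for real token embeddings (shapes and the cluster count are illustrative; scikit-learn, i.e. the new quantization extra, must be installed):

import numpy as np
from model2vec.vocabulary_quantization import quantize_vocabulary

emb = np.random.rand(1_000, 32).astype(np.float16)  # 1_000 "tokens", 32 dims
centroids, token_mapping, weights = quantize_vocabulary(n_clusters=16, weights=None, embeddings=emb)

# centroids: (16, 32) cluster centers, cast back to the original dtype (float16 here)
# token_mapping: a plain Python list of 1_000 cluster indices, one per original token
# weights: (1_000,) norms, used later to re-scale each token's centroid at encode time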

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@ onnx = ["onnx", "torch"]
 train = ["torch", "lightning", "scikit-learn", "skops"]
 inference = ["scikit-learn", "skops"]
 tokenizer = ["transformers"]
+quantization = ["scikit-learn"]
 
 [project.urls]
 "Homepage" = "https://github.com/MinishLab"

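With the new optional dependency group, the scikit-learn requirement for vocabulary quantization should be installable as an extra; the spelling below assumes the standard pip extras syntax for this package:

pip install "model2vec[quantization]"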