Commit 7bf0bf0

feat: add vocabulary quantization (#271)
* remove multiword warning
* add superbpe tokenizers
* fix: pretokenize tokens before checking vocabulary
* feat: add quantization
* wip
* wip
* wip
* fixes
* fixes
* fix issue with mwe
* wip
* wip
* wip
* wip
* wip
* wip
* fixes
* fix: refactor quantization
* fix: refactor quantization
* wip
* wip
* typing
* fixes
* fix typing/linting
* add quantization helper to top
* change init to random
* fix: annotations import
* fix test import
* import Union for 3.9
* fix: union again
* store all relevant info in safetensors
* make weights float in training
1 parent 13095c9 commit 7bf0bf0

19 files changed: +425 −100 lines changed

model2vec/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-from model2vec.model import StaticModel
+from model2vec.model import StaticModel, quantize_model
 from model2vec.version import __version__
 
-__all__ = ["StaticModel", "__version__"]
+__all__ = ["StaticModel", "quantize_model", "__version__"]
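
The quantization helper is now importable directly from the package root. A minimal usage sketch, assuming `quantize_model` takes a loaded StaticModel and a target DType; the keyword name `quantize_to` is an assumption here, not something this diff shows, so check the helper's actual signature in model2vec/model.py:

from model2vec import StaticModel, quantize_model
from model2vec.quantization import DType

# Load an existing static model and quantize its embeddings in place or as a copy.
# NOTE: the `quantize_to` keyword is assumed; only the helper's existence is
# confirmed by this commit.
model = StaticModel.from_pretrained("minishlab/potion-base-8M")
quantized = quantize_model(model, quantize_to=DType.Float16)
quantized.save_pretrained("potion-base-8M-fp16")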

model2vec/distill/distillation.py

Lines changed: 22 additions & 5 deletions
@@ -6,14 +6,17 @@
 from typing import Optional, cast
 
 import numpy as np
-from huggingface_hub import model_info
-from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerFast
+from huggingface_hub.hf_api import model_info
+from transformers import AutoModel, AutoTokenizer
+from transformers.modeling_utils import PreTrainedModel
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 
 from model2vec.distill.inference import PCADimType, create_embeddings, post_process_embeddings
 from model2vec.distill.utils import select_optimal_device
 from model2vec.model import StaticModel
 from model2vec.quantization import DType, quantize_embeddings
 from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
+from model2vec.vocabulary_quantization import quantize_vocabulary
 
 logger = logging.getLogger(__name__)
 
@@ -29,6 +32,7 @@ def distill_from_model(
     token_remove_pattern: str | None = r"\[unused\d+\]",
     quantize_to: DType | str = DType.Float16,
     use_subword: bool | None = None,
+    vocabulary_quantization: int | None = None,
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -54,6 +58,7 @@ def distill_from_model(
         If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error.
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
+    :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
     :return: A StaticModel
     :raises: ValueError if the vocabulary is empty after preprocessing.
 
@@ -103,7 +108,6 @@ def distill_from_model(
 
     # Replace the vocabulary in the tokenizer with the new vocabulary.
     backend_tokenizer = replace_vocabulary(backend_tokenizer, all_tokens, unk_token=unk_token, pad_token=pad_token)
-
     logger.info(f"Creating embeddings for {len(all_tokens)} tokens")
     # Convert tokens to IDs
     token_ids = turn_tokens_into_ids(all_tokens, tokenizer, unk_token)
@@ -113,8 +117,16 @@ def distill_from_model(
         tokenized=token_ids, model=model, device=device, pad_token_id=tokenizer.get_vocab()[pad_token]
     )
 
-    # Post process the embeddings by applying PCA and Zipf weighting.
-    embeddings = post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
+    if vocabulary_quantization is not None:
+        _, weights = post_process_embeddings(np.asarray(embeddings), None, sif_coefficient=sif_coefficient)
+        embeddings, token_mapping, weights = quantize_vocabulary(
+            n_clusters=vocabulary_quantization, weights=weights, embeddings=np.asarray(embeddings)
+        )
+        embeddings, _ = post_process_embeddings(embeddings, pca_dims, sif_coefficient=sif_coefficient)
+    else:
+        # Post-process the embeddings.
+        embeddings, weights = post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
+        token_mapping = None
     # Quantize the embeddings.
     embeddings = quantize_embeddings(embeddings, quantize_to)
 
@@ -148,6 +160,8 @@ def distill_from_model(
 
     return StaticModel(
         vectors=embeddings,
+        weights=weights,
+        token_mapping=token_mapping,
         tokenizer=backend_tokenizer,
         config=config,
         base_model_name=model_name,
@@ -211,6 +225,7 @@ def distill(
     trust_remote_code: bool = False,
     quantize_to: DType | str = DType.Float16,
     use_subword: bool | None = None,
+    vocabulary_quantization: int | None = None,
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -235,6 +250,7 @@ def distill(
     :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components.
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
    :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
+    :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
     :return: A StaticModel
 
     """
@@ -255,4 +271,5 @@ def distill(
        sif_coefficient=sif_coefficient,
        quantize_to=quantize_to,
        use_subword=use_subword,
+       vocabulary_quantization=vocabulary_quantization,
    )
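
Taken together, the new argument threads from distill through distill_from_model into quantize_vocabulary. A minimal caller-side sketch; the model name and pca_dims value are only examples, and vocabulary_quantization=None (the default) skips clustering entirely:

from model2vec.distill import distill

# Distill a static model and cluster its vocabulary into 256 centroids.
m2v_model = distill(
    model_name="baai/bge-base-en-v1.5",
    pca_dims=256,
    vocabulary_quantization=256,
)
m2v_model.save_pretrained("m2v_model_quantized_vocab")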

model2vec/distill/inference.py

Lines changed: 8 additions & 6 deletions
@@ -11,8 +11,8 @@
 from sklearn.decomposition import PCA
 from torch.nn.utils.rnn import pad_sequence
 from tqdm import tqdm
-from transformers import PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
+from transformers.modeling_utils import PreTrainedModel
 
 logger = logging.getLogger(__name__)
 
@@ -46,7 +46,7 @@ def create_embeddings(
     :param pad_token_id: The pad token id. Used to pad sequences.
     :return: The output embeddings.
     """
-    model = model.to(device)  # type: ignore
+    model = model.to(device)  # type: ignore # Transformers error
 
     out_weights: np.ndarray
     intermediate_weights: list[np.ndarray] = []
@@ -98,7 +98,7 @@ def _encode_mean_using_model(model: PreTrainedModel, encodings: dict[str, torch.
     """
     encodings = {k: v.to(model.device) for k, v in encodings.items()}
     encoded: BaseModelOutputWithPoolingAndCrossAttentions = model(**encodings)
-    out: torch.Tensor = encoded.last_hidden_state.cpu()  # type: ignore # typing is wrong.
+    out: torch.Tensor = encoded.last_hidden_state.cpu()  # type: ignore # False positive
     # NOTE: If the dtype is bfloat 16, we convert to float32,
     # because numpy does not suport bfloat16
     # See here: https://github.com/numpy/numpy/issues/19808
@@ -116,7 +116,7 @@ def _encode_mean_using_model(model: PreTrainedModel, encodings: dict[str, torch.
 
 def post_process_embeddings(
     embeddings: np.ndarray, pca_dims: PCADimType, sif_coefficient: float | None = 1e-4
-) -> np.ndarray:
+) -> tuple[np.ndarray, np.ndarray]:
     """Post process embeddings by applying PCA and SIF weighting by estimating the frequencies through Zipf's law."""
     if pca_dims is not None:
         if pca_dims == "auto":
@@ -153,6 +153,8 @@ def post_process_embeddings(
         logger.info("Estimating word frequencies using Zipf's law, and then applying SIF.")
         inv_rank = 1 / (np.arange(2, embeddings.shape[0] + 2))
         proba = inv_rank / np.sum(inv_rank)
-        embeddings *= (sif_coefficient / (sif_coefficient + proba))[:, None]
+        weight = sif_coefficient / (sif_coefficient + proba)
+    else:
+        weight = np.ones(embeddings.shape[0])
 
-    return embeddings
+    return embeddings, weight
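
The change above splits the SIF weight out of the embedding matrix so it can be returned separately (and, with vocabulary quantization, re-applied after clustering). A standalone sketch of the same Zipf-based estimate, mirroring the lines in the hunk above; the helper name is just for illustration:

import numpy as np

def zipf_sif_weights(n_tokens: int, sif_coefficient: float = 1e-4) -> np.ndarray:
    """Estimate per-token SIF weights from rank alone, as in post_process_embeddings."""
    # Zipf's law: a token's probability is proportional to 1 / rank.
    inv_rank = 1 / np.arange(2, n_tokens + 2)
    proba = inv_rank / np.sum(inv_rank)
    # SIF downweights frequent (low-rank) tokens.
    return sif_coefficient / (sif_coefficient + proba)

weights = zipf_sif_weights(n_tokens=5)
print(weights)  # the most frequent (lowest-rank) tokens get the smallest weights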

model2vec/hf_utils.py

Lines changed: 26 additions & 11 deletions
@@ -25,6 +25,8 @@ def save_pretrained(
     config: dict[str, Any],
     create_model_card: bool = True,
     subfolder: str | None = None,
+    weights: np.ndarray | None = None,
+    mapping: np.ndarray | None = None,
     **kwargs: Any,
 ) -> None:
     """
@@ -36,11 +38,20 @@ def save_pretrained(
     :param config: A metadata config.
     :param create_model_card: Whether to create a model card.
     :param subfolder: The subfolder to save the model in.
+    :param weights: The weights of the model. If None, no weights are saved.
+    :param mapping: The token mapping of the model. If None, there is no token mapping.
     :param **kwargs: Any additional arguments.
     """
     folder_path = folder_path / subfolder if subfolder else folder_path
     folder_path.mkdir(exist_ok=True, parents=True)
-    save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
+
+    model_weights = {"embeddings": embeddings}
+    if weights is not None:
+        model_weights["weights"] = weights
+    if mapping is not None:
+        model_weights["mapping"] = mapping
+
+    save_file(model_weights, folder_path / "model.safetensors")
     tokenizer.save(str(folder_path / "tokenizer.json"), pretty=False)
     json.dump(config, open(folder_path / "config.json", "w"), indent=4)
 
@@ -101,7 +112,7 @@ def load_pretrained(
     token: str | None,
     from_sentence_transformers: bool,
    force_download: bool,
-) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
+) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any], np.ndarray | None, np.ndarray | None]:
     """
     Loads a pretrained model from a folder.
 
@@ -114,7 +125,7 @@ def load_pretrained(
     :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
         already present in the cache.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
-    :return: The embeddings, tokenizer, config, and metadata.
+    :return: The embeddings, tokenizer, config, metadata, weights and mapping.
 
     """
     if from_sentence_transformers:
@@ -176,8 +187,17 @@ def load_pretrained(
     )
 
     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embedding_key = "embedding.weight" if from_sentence_transformers else "embeddings"
-    embeddings = opened_tensor_file.get_tensor(embedding_key)
+    embedding_name = "embedding.weight" if from_sentence_transformers else "embeddings"
+    embeddings = opened_tensor_file.get_tensor(embedding_name)
+    try:
+        weights = opened_tensor_file.get_tensor("weights")
+    except Exception:
+        # Bare except because safetensors does not export its own errors.
+        weights = None
+    try:
+        mapping = opened_tensor_file.get_tensor("mapping")
+    except Exception:
+        mapping = None
 
     if readme_path.exists():
         metadata = _get_metadata_from_readme(readme_path)
@@ -187,12 +207,7 @@ def load_pretrained(
     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))
 
-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
-
-    return embeddings, tokenizer, config, metadata
+    return embeddings, tokenizer, config, metadata, weights, mapping
 
 
 def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
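
The safetensors file now carries up to three arrays, and the loader probes for the optional ones instead of assuming they exist. A small round-trip sketch with dummy data; the path and shapes are illustrative only:

import numpy as np
import safetensors
from safetensors.numpy import save_file

# Write the embeddings plus the optional per-token weights and token mapping.
tensors = {
    "embeddings": np.random.rand(10, 4).astype(np.float32),
    "weights": np.ones(10, dtype=np.float32),
    "mapping": np.arange(10),
}
save_file(tensors, "model.safetensors")

# Read them back; a missing optional tensor simply falls back to None.
with safetensors.safe_open("model.safetensors", framework="numpy") as f:
    embeddings = f.get_tensor("embeddings")
    try:
        weights = f.get_tensor("weights")
    except Exception:
        weights = None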

model2vec/inference/model.py

Lines changed: 15 additions & 9 deletions
@@ -3,7 +3,7 @@
 import re
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Sequence, TypeVar
+from typing import Sequence, TypeVar, Union, cast
 
 import huggingface_hub
 import numpy as np
@@ -273,14 +273,14 @@ def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> Non
     )
 
 
-def _is_multi_label_shaped(y: LabelType) -> bool:
+def _is_multi_label_shaped(y: list[int] | list[str] | list[list[int]] | list[list[str]]) -> bool:
     """Check if the labels are in a multi-label shape."""
     return isinstance(y, (list, tuple)) and len(y) > 0 and isinstance(y[0], (list, tuple, set))
 
 
 def evaluate_single_or_multi_label(
     predictions: np.ndarray,
-    y: LabelType,
+    y: list[int] | list[str] | list[list[int]] | list[list[str]],
     output_dict: bool = False,
 ) -> str | dict[str, dict[str, float]]:
     """
@@ -292,16 +292,22 @@
     :return: A classification report.
     """
     if _is_multi_label_shaped(y):
+        # Cast because the type checker doesn't understand that y is a list of lists.
+        y = cast(Union[list[list[str]], list[list[int]]], y)
         classes = sorted(set([label for labels in y for label in labels]))
         mlb = MultiLabelBinarizer(classes=classes)
-        y = mlb.fit_transform(y)
-        predictions = mlb.transform(predictions)
-    elif isinstance(y[0], (str, int)):
-        classes = sorted(set(y))
+        y_transformed = mlb.fit_transform(y)
+        predictions_transformed = mlb.transform(predictions)
+    else:
+        if all(isinstance(label, (str, int)) for label in y):
+            y = cast(Union[list[str], list[int]], y)
+            classes = sorted(set(y))
+            y_transformed = np.array(y)
+            predictions_transformed = np.array(predictions)
 
     report = classification_report(
-        y,
-        predictions,
+        y_transformed,
+        predictions_transformed,
        output_dict=output_dict,
        zero_division=0,
    )
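
For reference, the multi-label branch binarizes both the gold labels and the predictions over the same class list before scoring. A minimal sketch of that path with toy data, using the same scikit-learn calls as the hunk above:

import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Toy multi-label data: each sample can carry several labels.
y = [["cat", "dog"], ["dog"], ["cat"]]
predictions = [["cat"], ["dog"], ["cat", "dog"]]

classes = sorted({label for labels in y for label in labels})
mlb = MultiLabelBinarizer(classes=classes)
y_transformed = mlb.fit_transform(y)
predictions_transformed = mlb.transform(predictions)

# classification_report accepts the resulting binary indicator matrices.
print(classification_report(y_transformed, predictions_transformed, zero_division=0))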
