Commit dde95cd (merge)

2 parents: 3f39da4 + 7bf0bf0

20 files changed: +854 -510 lines

.github/workflows/ci.yaml

Lines changed: 1 addition & 22 deletions
@@ -9,17 +9,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: ["ubuntu-latest", "windows-latest"]
+        os: ["ubuntu-latest"]
         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-        exclude:
-          - os: windows-latest
-            python-version: "3.9"
-          - os: windows-latest
-            python-version: "3.11"
-          - os: windows-latest
-            python-version: "3.12"
-          - os: windows-latest
-            python-version: "3.13"
       fail-fast: false

     steps:
@@ -31,19 +22,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true

-      # Step for Windows: Create and activate a virtual environment
-      - name: Create and activate a virtual environment (Windows)
-        if: ${{ runner.os == 'Windows' }}
-        run: |
-          irm https://astral.sh/uv/install.ps1 | iex
-          $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path"
-          uv venv .venv
-          "VIRTUAL_ENV=.venv" | Out-File -FilePath $env:GITHUB_ENV -Append
-          "$PWD/.venv/Scripts" | Out-File -FilePath $env:GITHUB_PATH -Append
-
-      # Step for Unix: Create and activate a virtual environment
       - name: Create and activate a virtual environment (Unix)
-        if: ${{ runner.os != 'Windows' }}
         run: |
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv venv .venv

model2vec/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-from model2vec.model import StaticModel
+from model2vec.model import StaticModel, quantize_model
 from model2vec.version import __version__

-__all__ = ["StaticModel", "__version__"]
+__all__ = ["StaticModel", "quantize_model", "__version__"]

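quantize_model is now re-exported from the package root alongside StaticModel. The diff confirms only the name and import path, so the call below is a minimal sketch: the model id is illustrative, and the quantize_to keyword is an assumption mirroring the convention used elsewhere in this commit.

from model2vec import StaticModel, quantize_model

# Illustrative model id; any saved StaticModel would do.
model = StaticModel.from_pretrained("minishlab/potion-base-8M")
# Assumed call shape: reduce the embedding dtype to float16.
quantized = quantize_model(model, quantize_to="float16")
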
model2vec/distill/distillation.py

Lines changed: 18 additions & 2 deletions
@@ -18,6 +18,7 @@
 from model2vec.quantization import DType, quantize_embeddings
 from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids
 from model2vec.tokenizer.tokenizer import _patch_tokenizer
+from model2vec.vocabulary_quantization import quantize_vocabulary

 logger = logging.getLogger(__name__)

@@ -32,6 +33,7 @@ def distill_from_model(
     token_remove_pattern: str | None = r"\[unused\d+\]",
     quantize_to: DType | str = DType.Float16,
     lower_case: bool = True,
+    vocabulary_quantization: int | None = None,
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -56,6 +58,7 @@
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param lower_case: If this is set, all tokens in the model vocabulary will be converted to lowercase, and
         a lowercase normalizer will be inserted. This almost always improves performance.
+    :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
     :return: A StaticModel
     :raises: ValueError if the vocabulary is empty after preprocessing.

@@ -106,8 +109,16 @@
     pad_token_id = vocab[pad_token]
     embeddings = create_embeddings(tokenized=token_ids, model=model, device=device, pad_token_id=pad_token_id)

-    # Post process the embeddings by applying PCA and Zipf weighting.
-    embeddings = post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
+    if vocabulary_quantization is not None:
+        _, weights = post_process_embeddings(np.asarray(embeddings), None, sif_coefficient=sif_coefficient)
+        embeddings, token_mapping, weights = quantize_vocabulary(
+            n_clusters=vocabulary_quantization, weights=weights, embeddings=np.asarray(embeddings)
+        )
+        embeddings, _ = post_process_embeddings(embeddings, pca_dims, sif_coefficient=sif_coefficient)
+    else:
+        # Post-process the embeddings.
+        embeddings, weights = post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
+        token_mapping = None
     # Quantize the embeddings.
     embeddings = quantize_embeddings(embeddings, quantize_to)

@@ -140,6 +151,8 @@

     return StaticModel(
         vectors=embeddings,
+        weights=weights,
+        token_mapping=token_mapping,
         tokenizer=backend_tokenizer,
         config=config,
         base_model_name=model_name,
@@ -186,6 +199,7 @@ def distill(
     trust_remote_code: bool = False,
     quantize_to: DType | str = DType.Float16,
     lower_case: bool = True,
+    vocabulary_quantization: int | None = None,
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -209,6 +223,7 @@
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param lower_case: If this is set, all tokens in the model vocabulary will be converted to lowercase, and
         a lowercase normalizer will be inserted. This almost always improves performance.
+    :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
     :return: A StaticModel

     """
@@ -228,4 +243,5 @@ def distill(
         sif_coefficient=sif_coefficient,
         quantize_to=quantize_to,
         lower_case=lower_case,
+        vocabulary_quantization=vocabulary_quantization,
     )

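The new vocabulary_quantization option threads through the public distill entry point, so callers can cluster the token embeddings into a fixed number of centroids at distillation time. A minimal sketch using the signature from this diff; the base model id, PCA size, cluster count, and output path are illustrative, not taken from the commit.

from model2vec.distill import distill

# Cluster the vocabulary into 8192 centroids; tokens then share centroid
# vectors via the token_mapping produced by quantize_vocabulary. Passing
# None (the default) keeps one vector per token, as in the else-branch above.
m2v_model = distill(
    model_name="BAAI/bge-base-en-v1.5",
    pca_dims=256,
    vocabulary_quantization=8192,
)
m2v_model.save_pretrained("my-vocab-quantized-model")
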
model2vec/distill/inference.py

Lines changed: 7 additions & 5 deletions
@@ -11,8 +11,8 @@
 from sklearn.decomposition import PCA
 from torch.nn.utils.rnn import pad_sequence
 from tqdm import tqdm
-from transformers import PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
+from transformers.modeling_utils import PreTrainedModel

 logger = logging.getLogger(__name__)

@@ -46,7 +46,7 @@ def create_embeddings(
     :param pad_token_id: The pad token id. Used to pad sequences.
     :return: The output embeddings.
     """
-    model = model.to(device)  # type: ignore
+    model = model.to(device)  # type: ignore  # Transformers error

     out_weights: np.ndarray
     intermediate_weights: list[np.ndarray] = []
@@ -117,7 +117,7 @@ def _encode_mean_using_model(model: PreTrainedModel, encodings: dict[str, torch.

 def post_process_embeddings(
     embeddings: np.ndarray, pca_dims: PCADimType, sif_coefficient: float | None = 1e-4
-) -> np.ndarray:
+) -> tuple[np.ndarray, np.ndarray]:
     """Post process embeddings by applying PCA and SIF weighting by estimating the frequencies through Zipf's law."""
     if pca_dims is not None:
         if pca_dims == "auto":
@@ -154,6 +154,8 @@ def post_process_embeddings(
         logger.info("Estimating word frequencies using Zipf's law, and then applying SIF.")
         inv_rank = 1 / (np.arange(2, embeddings.shape[0] + 2))
         proba = inv_rank / np.sum(inv_rank)
-        embeddings *= (sif_coefficient / (sif_coefficient + proba))[:, None]
+        weight = sif_coefficient / (sif_coefficient + proba)
+    else:
+        weight = np.ones(embeddings.shape[0])

-    return embeddings
+    return embeddings, weight

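post_process_embeddings now returns the SIF weights instead of multiplying them into the embeddings, so callers such as distill_from_model can store them separately. The weighting itself is unchanged: token probabilities p are estimated from Zipf's law (p proportional to 1/rank) and each token receives weight a / (a + p), where a is the sif_coefficient (default 1e-4). A self-contained sketch of that computation, mirroring the hunk above:

import numpy as np

def zipf_sif_weights(vocab_size: int, sif_coefficient: float = 1e-4) -> np.ndarray:
    """Estimate per-token SIF weights via Zipf's law, as in the diff above."""
    # Ranks start at 2, mirroring np.arange(2, n + 2) in the hunk.
    inv_rank = 1 / np.arange(2, vocab_size + 2)
    proba = inv_rank / inv_rank.sum()  # Zipf-style probability estimate per rank
    return sif_coefficient / (sif_coefficient + proba)

weights = zipf_sif_weights(vocab_size=32_000)
print(weights[:3], weights[-3:])  # frequent (low-rank) tokens get the smallest weights
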
model2vec/hf_utils.py

Lines changed: 73 additions & 29 deletions
@@ -9,6 +9,7 @@
 import numpy as np
 import safetensors
 from huggingface_hub import ModelCard, ModelCardData
+from huggingface_hub.constants import HF_HUB_CACHE
 from safetensors.numpy import save_file
 from tokenizers import Tokenizer

@@ -24,6 +25,8 @@ def save_pretrained(
     config: dict[str, Any],
     create_model_card: bool = True,
     subfolder: str | None = None,
+    weights: np.ndarray | None = None,
+    mapping: np.ndarray | None = None,
     **kwargs: Any,
 ) -> None:
     """
@@ -35,11 +38,20 @@
     :param config: A metadata config.
     :param create_model_card: Whether to create a model card.
     :param subfolder: The subfolder to save the model in.
+    :param weights: The weights of the model. If None, no weights are saved.
+    :param mapping: The token mapping of the model. If None, there is no token mapping.
     :param **kwargs: Any additional arguments.
     """
     folder_path = folder_path / subfolder if subfolder else folder_path
     folder_path.mkdir(exist_ok=True, parents=True)
-    save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
+
+    model_weights = {"embeddings": embeddings}
+    if weights is not None:
+        model_weights["weights"] = weights
+    if mapping is not None:
+        model_weights["mapping"] = mapping
+
+    save_file(model_weights, folder_path / "model.safetensors")
     tokenizer.save(str(folder_path / "tokenizer.json"), pretty=False)
     json.dump(config, open(folder_path / "config.json", "w"), indent=4)

@@ -96,10 +108,11 @@

 def load_pretrained(
     folder_or_repo_path: str | Path,
-    subfolder: str | None = None,
-    token: str | None = None,
-    from_sentence_transformers: bool = False,
-) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
+    subfolder: str | None,
+    token: str | None,
+    from_sentence_transformers: bool,
+    force_download: bool,
+) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any], np.ndarray | None, np.ndarray | None]:
     """
     Loads a pretrained model from a folder.

@@ -109,8 +122,10 @@
     :param subfolder: The subfolder to load from.
     :param token: The huggingface token to use.
     :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
+    :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+        already present in the cache.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
-    :return: The embeddings, tokenizer, config, and metadata.
+    :return: The embeddings, tokenizer, config, metadata, weights and mapping.

     """
@@ -122,7 +137,13 @@
         tokenizer_file = "tokenizer.json"
         config_name = "config.json"

-    folder_or_repo_path = Path(folder_or_repo_path)
+    cached_folder = _get_latest_model_path(str(folder_or_repo_path))
+    if cached_folder and not force_download:
+        logger.info(f"Found cached model at {cached_folder}, loading from cache.")
+        folder_or_repo_path = cached_folder
+    else:
+        logger.info(f"No cached model found for {folder_or_repo_path}, loading from local or hub.")
+        folder_or_repo_path = Path(folder_or_repo_path)

     local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path

@@ -139,9 +160,7 @@
         if not tokenizer_path.exists():
             raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")

-        # README is optional, so this is a bit finicky.
         readme_path = local_folder / "README.md"
-        metadata = _get_metadata_from_readme(readme_path)

     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
@@ -150,18 +169,11 @@
                 folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
             )
         )
-
-        try:
-            readme_path = Path(
-                huggingface_hub.hf_hub_download(
-                    folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
-                )
+        readme_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
             )
-            metadata = _get_metadata_from_readme(Path(readme_path))
-        except Exception as e:
-            # NOTE: we don't want to raise an error here, since the README is optional.
-            logger.info(f"No README found in the model folder: {e} No model card loaded.")
-            metadata = {}
+        )

         config_path = Path(
             huggingface_hub.hf_hub_download(
@@ -175,20 +187,27 @@
     )

     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    if from_sentence_transformers:
-        embeddings = opened_tensor_file.get_tensor("embedding.weight")
+    embedding_name = "embedding.weight" if from_sentence_transformers else "embeddings"
+    embeddings = opened_tensor_file.get_tensor(embedding_name)
+    try:
+        weights = opened_tensor_file.get_tensor("weights")
+    except Exception:
+        # Bare except because safetensors does not export its own errors.
+        weights = None
+    try:
+        mapping = opened_tensor_file.get_tensor("mapping")
+    except Exception:
+        mapping = None
+
+    if readme_path.exists():
+        metadata = _get_metadata_from_readme(readme_path)
     else:
-        embeddings = opened_tensor_file.get_tensor("embeddings")
+        metadata = {}

     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))

-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
-
-    return embeddings, tokenizer, config, metadata
+    return embeddings, tokenizer, config, metadata, weights, mapping


 def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
@@ -223,3 +242,28 @@
     huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)

     logger.info(f"Pushed model to {repo_id}")
+
+
+def _get_latest_model_path(model_id: str) -> Path | None:
+    """
+    Gets the latest model path for a given identifier from the hugging face hub cache.
+
+    Returns None if there is no cached model. In this case, the model will be downloaded.
+    """
+    # Make path object
+    cache_dir = Path(HF_HUB_CACHE)
+    # This is specific to how HF stores the files.
+    normalized = model_id.replace("/", "--")
+    repo_dir = cache_dir / f"models--{normalized}" / "snapshots"
+
+    if not repo_dir.exists():
+        return None
+
+    # Find all directories.
+    snapshots = [p for p in repo_dir.iterdir() if p.is_dir()]
+    if not snapshots:
+        return None
+
+    # Get the latest directory by modification time.
+    latest_snapshot = max(snapshots, key=lambda p: p.stat().st_mtime)
+    return latest_snapshot

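The cache lookup added at the bottom of this file relies on the Hugging Face hub's on-disk layout, where each repo is cached under models--<org>--<name>/snapshots/<revision>. A short sketch of what _get_latest_model_path resolves; the model id is illustrative:

from pathlib import Path
from huggingface_hub.constants import HF_HUB_CACHE

model_id = "minishlab/potion-base-8M"  # illustrative id
repo_dir = Path(HF_HUB_CACHE) / f"models--{model_id.replace('/', '--')}" / "snapshots"
# Mirror the helper: newest snapshot directory by mtime, or None when not cached.
snapshots = [p for p in repo_dir.iterdir() if p.is_dir()] if repo_dir.exists() else []
print(max(snapshots, key=lambda p: p.stat().st_mtime) if snapshots else None)

Note that load_pretrained now prefers this cached snapshot unless force_download is set, which is why the function gained that new required argument.
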
model2vec/inference/model.py

Lines changed: 15 additions & 9 deletions
@@ -3,7 +3,7 @@
 import re
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Sequence, TypeVar
+from typing import Sequence, TypeVar, Union, cast

 import huggingface_hub
 import numpy as np
@@ -273,14 +273,14 @@ def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> Non
     )


-def _is_multi_label_shaped(y: LabelType) -> bool:
+def _is_multi_label_shaped(y: list[int] | list[str] | list[list[int]] | list[list[str]]) -> bool:
     """Check if the labels are in a multi-label shape."""
     return isinstance(y, (list, tuple)) and len(y) > 0 and isinstance(y[0], (list, tuple, set))


 def evaluate_single_or_multi_label(
     predictions: np.ndarray,
-    y: LabelType,
+    y: list[int] | list[str] | list[list[int]] | list[list[str]],
     output_dict: bool = False,
 ) -> str | dict[str, dict[str, float]]:
     """
@@ -292,16 +292,22 @@
     :return: A classification report.
     """
     if _is_multi_label_shaped(y):
+        # Cast because the type checker doesn't understand that y is a list of lists.
+        y = cast(Union[list[list[str]], list[list[int]]], y)
         classes = sorted(set([label for labels in y for label in labels]))
         mlb = MultiLabelBinarizer(classes=classes)
-        y = mlb.fit_transform(y)
-        predictions = mlb.transform(predictions)
-    elif isinstance(y[0], (str, int)):
-        classes = sorted(set(y))
+        y_transformed = mlb.fit_transform(y)
+        predictions_transformed = mlb.transform(predictions)
+    else:
+        if all(isinstance(label, (str, int)) for label in y):
+            y = cast(Union[list[str], list[int]], y)
+            classes = sorted(set(y))
+        y_transformed = np.array(y)
+        predictions_transformed = np.array(predictions)

     report = classification_report(
-        y,
-        predictions,
+        y_transformed,
+        predictions_transformed,
         output_dict=output_dict,
         zero_division=0,
     )

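For reference, the refactored evaluation helper accepts both label shapes. A hedged usage sketch, assuming the function is importable from model2vec.inference.model where this diff defines it; the labels and predictions are made up for illustration:

import numpy as np
from model2vec.inference.model import evaluate_single_or_multi_label

# Single-label: y is a flat list; predictions is a matching 1-d array.
print(evaluate_single_or_multi_label(np.array(["a", "b"]), y=["a", "a"]))

# Multi-label: y is a list of label lists; predictions must be shaped the same
# way, since both go through the same MultiLabelBinarizer.
preds = np.array([["a"], ["a", "b"]], dtype=object)
print(evaluate_single_or_multi_label(preds, y=[["a"], ["b"]]))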