Commit 1e3b58e

Added configurable pooling for distillation
1 parent e10118e commit 1e3b58e

File tree: 4 files changed (+161, −59 lines)

model2vec/distill/distillation.py

Lines changed: 12 additions & 2 deletions
@@ -11,7 +11,7 @@
 from transformers.modeling_utils import PreTrainedModel
 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 
-from model2vec.distill.inference import PCADimType, create_embeddings, post_process_embeddings
+from model2vec.distill.inference import PCADimType, PoolingType, create_embeddings, post_process_embeddings
 from model2vec.distill.utils import select_optimal_device
 from model2vec.model import StaticModel
 from model2vec.quantization import DType, quantize_embeddings
@@ -33,6 +33,7 @@ def distill_from_model(
     quantize_to: DType | str = DType.Float16,
     use_subword: bool | None = None,
     vocabulary_quantization: int | None = None,
+    pooling: PoolingType = PoolingType.MEAN,
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -59,6 +60,7 @@
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
     :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
+    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean", "last", or "cls".
     :return: A StaticModel
     :raises: ValueError if the vocabulary is empty after preprocessing.
@@ -114,7 +116,11 @@
 
     # Create the embeddings
     embeddings = create_embeddings(
-        tokenized=token_ids, model=model, device=device, pad_token_id=tokenizer.get_vocab()[pad_token]
+        tokenized=token_ids,
+        model=model,
+        device=device,
+        pad_token_id=tokenizer.get_vocab()[pad_token],
+        pooling=pooling,
     )
 
     if vocabulary_quantization is not None:
@@ -142,6 +148,7 @@
         "hidden_dim": embeddings.shape[1],
         "seq_length": 1000000,  # Set this to a high value since we don't have a sequence length limit.
         "normalize": True,
+        "pooling": pooling,
     }
 
     if os.path.exists(model_name):
@@ -226,6 +233,7 @@ def distill(
     quantize_to: DType | str = DType.Float16,
     use_subword: bool | None = None,
     vocabulary_quantization: int | None = None,
+    pooling: PoolingType = PoolingType.MEAN,
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -251,6 +259,7 @@
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
     :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
+    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean", "last", or "cls".
     :return: A StaticModel
 
     """
@@ -272,4 +281,5 @@
         quantize_to=quantize_to,
        use_subword=use_subword,
        vocabulary_quantization=vocabulary_quantization,
+        pooling=pooling,
     )
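
For orientation, a minimal usage sketch of the new keyword. This assumes distill is importable from model2vec.distill and that the returned StaticModel exposes save_pretrained; the checkpoint name and the model_name= spelling are placeholders, and only the pooling= argument and the PoolingType import come from this commit.

from model2vec.distill import distill
from model2vec.distill.inference import PoolingType

# Hypothetical call site: the checkpoint and surrounding arguments are illustrative;
# pooling= is the new argument added in this commit (default: PoolingType.MEAN).
m2v_model = distill(
    model_name="BAAI/bge-base-en-v1.5",  # placeholder checkpoint
    pooling=PoolingType.LAST,            # or PoolingType.MEAN / PoolingType.CLS
)
m2v_model.save_pretrained("bge-base-distilled-last")  # assumed StaticModel API

Because PoolingType subclasses str, the plain strings "mean", "last", and "cls" compare equal to the enum members, so passing a bare string should also work.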

model2vec/distill/inference.py

Lines changed: 104 additions & 39 deletions
@@ -3,8 +3,9 @@
 
 import inspect
 import logging
+from enum import Enum
 from pathlib import Path
-from typing import Literal, Protocol, Union
+from typing import Literal, Union
 
 import numpy as np
 import torch
@@ -16,23 +17,26 @@
 
 logger = logging.getLogger(__name__)
 
-
 PathLike = Union[Path, str]
 PCADimType = Union[int, None, float, Literal["auto"]]
 
-
 _DEFAULT_BATCH_SIZE = 256
 
 
-class ModulewithWeights(Protocol):
-    weight: torch.nn.Parameter
+class PoolingType(str, Enum):
+    """Pooling strategies for embedding creation."""
+
+    MEAN = "mean"
+    LAST = "last"
+    CLS = "cls"
 
 
 def create_embeddings(
     model: PreTrainedModel,
     tokenized: list[list[int]],
     device: str,
     pad_token_id: int,
+    pooling: PoolingType = PoolingType.MEAN,
 ) -> np.ndarray:
     """
     Create output embeddings for a bunch of tokens using a pretrained model.
@@ -44,9 +48,11 @@ def create_embeddings(
     :param tokenized: All tokenized tokens.
     :param device: The torch device to use.
     :param pad_token_id: The pad token id. Used to pad sequences.
+    :param pooling: The pooling strategy to use.
     :return: The output embeddings.
+    :raises ValueError: If the pooling strategy is unknown.
     """
-    model = model.to(device)  # type: ignore  # Transformers error
+    model = model.to(device).eval()  # type: ignore  # Transformers error
 
     out_weights: np.ndarray
     intermediate_weights: list[np.ndarray] = []
@@ -62,56 +68,123 @@
     pbar = tqdm(total=len(sorted_tokenized), desc="Encoding tokens", unit=" tokens")
 
     for batch_idx in range(0, len(sorted_tokenized), _DEFAULT_BATCH_SIZE):
-        batch = [torch.Tensor(x).long() for x in sorted_tokenized[batch_idx : batch_idx + _DEFAULT_BATCH_SIZE]]
+        batch_list = sorted_tokenized[batch_idx : batch_idx + _DEFAULT_BATCH_SIZE]
+        batch = [torch.tensor(x, dtype=torch.long) for x in batch_list]
 
         encoded = {}
         encoded["input_ids"] = pad_sequence(batch, batch_first=True, padding_value=pad_token_id)
-        encoded["attention_mask"] = encoded["input_ids"] != pad_token_id
+
+        if pooling == PoolingType.MEAN:
+            # For mean pooling, mask out padding tokens
+            encoded["attention_mask"] = encoded["input_ids"] != pad_token_id
+        else:
+            # For "last"/"cls": build mask directly from true lengths to ensure
+            # the last non-pad token and CLS positions are chosen correctly
+            seq_len = encoded["input_ids"].size(1)
+            batch_lengths = torch.tensor([len(x) for x in batch_list], device=encoded["input_ids"].device)
+            token_positions = torch.arange(seq_len, device=encoded["input_ids"].device)
+            encoded["attention_mask"] = token_positions.unsqueeze(0) < batch_lengths.unsqueeze(1)
 
         if add_token_type_ids:
+            # Add token_type_ids for models that support it
             encoded["token_type_ids"] = torch.zeros_like(encoded["input_ids"])
 
-        out = _encode_mean_using_model(model, encoded)
+        if pooling == PoolingType.MEAN:
+            out = _encode_mean_with_model(model, encoded)
+        elif pooling == PoolingType.LAST:
+            out = _encode_last_with_model(model, encoded)
+        elif pooling == PoolingType.CLS:
+            out = _encode_cls_with_model(model, encoded)
+        else:
+            raise ValueError(f"Unknown pooling: {pooling}")
+
         intermediate_weights.extend(out.numpy())
         pbar.update(len(batch))
 
     # Sort the output back to the original order
     intermediate_weights = [intermediate_weights[i] for i in np.argsort(sort_order)]
     out_weights = np.stack(intermediate_weights)
-
     out_weights = np.nan_to_num(out_weights)
 
     return out_weights
 
 
-@torch.no_grad()
-def _encode_mean_using_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor:
+def _encode_with_model(
+    model: PreTrainedModel, encodings: dict[str, torch.Tensor]
+) -> tuple[torch.Tensor, torch.Tensor | None, dict[str, torch.Tensor]]:
     """
-    Encode a batch of tokens using a model.
-
-    Note that if a token in the input batch does not have any embeddings, it will be output as a vector of zeros.
-    So detection of these is necessary.
+    Move inputs to the model device, run a forward pass, and standardize dtypes.
 
     :param model: The model to use.
     :param encodings: The encoded tokens to turn into features.
-    :return: The mean of the output for each token.
+    :return: a tuple consisting of:
+        - hidden: last_hidden_state
+        - pooler: pooler_output if present, else None
+        - encodings_on_device: the device-moved encodings (for masks)
     """
-    encodings = {k: v.to(model.device) for k, v in encodings.items()}
-    encoded: BaseModelOutputWithPoolingAndCrossAttentions = model(**encodings)
-    out: torch.Tensor = encoded.last_hidden_state.cpu()  # type: ignore  # False positive
+    encodings_on_device = {k: v.to(model.device) for k, v in encodings.items()}
+    outputs: BaseModelOutputWithPoolingAndCrossAttentions = model(**encodings_on_device)
+    hidden: torch.Tensor = outputs.last_hidden_state  # type: ignore  # False positive
     # NOTE: If the dtype is bfloat 16, we convert to float32,
     # because numpy does not suport bfloat16
     # See here: https://github.com/numpy/numpy/issues/19808
-    if out.dtype == torch.bfloat16:
-        out = out.float()
+    if hidden.dtype == torch.bfloat16:
+        hidden = hidden.float()
+    pooler = getattr(outputs, "pooler_output", None)
+    if pooler is not None and pooler.dtype == torch.bfloat16:
+        pooler = pooler.float()
+    return hidden, pooler, encodings_on_device
 
+
+@torch.inference_mode()
+def _encode_mean_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor:
+    """
+    Encode a batch of tokens using mean pooling.
+
+    :param model: The model to use.
+    :param encodings: The encoded tokens to turn into features.
+    :return: The mean of the output for each token.
+    """
+    hidden, _, encodings_on_device = _encode_with_model(model, encodings)
     # Take the mean by averaging over the attention mask.
-    mask = encodings["attention_mask"].cpu().float()
-    mask /= mask.sum(1)[:, None]
+    mask = encodings_on_device["attention_mask"].cpu().float()
+    lengths = mask.sum(1, keepdim=True).clamp_min_(1.0)
+    mask = mask / lengths
+    return torch.bmm(mask.to(hidden.device)[:, None, :], hidden).squeeze(1).cpu()
 
-    result = torch.bmm(mask[:, None, :].float(), out).squeeze(1)
 
-    return result
+@torch.inference_mode()
+def _encode_last_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor:
+    """
+    Encode a batch of tokens using last token pooling.
+
+    :param model: The model to use.
+    :param encodings: The encoded tokens to turn into features.
+    :return: The last hidden state for each token.
+    """
+    hidden, _, encodings_on_device = _encode_with_model(model, encodings)
+    # Get the last hidden state for each token
+    mask = encodings_on_device["attention_mask"].bool()
+    last_idx = (mask.sum(dim=1) - 1).clamp_min(0).long()
+    b = torch.arange(hidden.size(0), device=hidden.device)
+    return hidden[b, last_idx, :].cpu()
+
+
+@torch.inference_mode()
+def _encode_cls_with_model(model: PreTrainedModel, encodings: dict[str, torch.Tensor]) -> torch.Tensor:
+    """
+    Encode a batch of tokens using CLS pooling.
+
+    If the model has a pooler_output, use that, otherwise, use the first token's hidden state.
+
+    :param model: The model to use.
+    :param encodings: The encoded tokens to turn into features.
+    :return: The [CLS] token representation for each token.
+    """
+    hidden, pooler, _ = _encode_with_model(model, encodings)
+    if pooler is not None:
+        return pooler.cpu()
+    return hidden[:, 0, :].cpu()
 
 
 def post_process_embeddings(
@@ -124,30 +197,22 @@ def post_process_embeddings(
     if pca_dims > embeddings.shape[1]:
         logger.warning(
             f"PCA dimension ({pca_dims}) is larger than the number of dimensions in the embeddings ({embeddings.shape[1]}). "
-            "Applying PCA, but not reducing dimensionality. Is this is not desired, please set `pca_dims` to None. "
-            "Applying PCA will probably improve performance, so consider just leaving it."
+            "Applying PCA, but not reducing dimensionality. If this is not desired, set `pca_dims` to None."
         )
         pca_dims = embeddings.shape[1]
     if pca_dims >= embeddings.shape[0]:
         logger.warning(
             f"PCA dimension ({pca_dims}) is larger than the number of tokens in the vocabulary ({embeddings.shape[0]}). Not applying PCA."
         )
     elif pca_dims <= embeddings.shape[1]:
-        if isinstance(pca_dims, float):
-            logger.info(f"Applying PCA with {pca_dims} explained variance.")
-        else:
-            logger.info(f"Applying PCA with n_components {pca_dims}")
-
         orig_dims = embeddings.shape[1]
         p = PCA(n_components=pca_dims, svd_solver="full")
         embeddings = p.fit_transform(embeddings)
-
         if embeddings.shape[1] < orig_dims:
-            explained_variance_ratio = np.sum(p.explained_variance_ratio_)
-            explained_variance = np.sum(p.explained_variance_)
-            logger.info(f"Reduced dimensionality from {orig_dims} to {embeddings.shape[1]}.")
-            logger.info(f"Explained variance ratio: {explained_variance_ratio:.3f}.")
-            logger.info(f"Explained variance: {explained_variance:.3f}.")
+            logger.info(
+                f"Reduced dimensionality {orig_dims} -> {embeddings.shape[1]} "
+                f"(explained var ratio: {np.sum(p.explained_variance_ratio_):.3f})."
+            )
 
     if sif_coefficient is not None:
         logger.info("Estimating word frequencies using Zipf's law, and then applying SIF.")
tests/conftest.py

Lines changed: 16 additions & 15 deletions
@@ -59,29 +59,30 @@ def mock_transformer() -> PreTrainedModel:
     """Create a mock transformer model."""
 
     class MockPreTrainedModel:
-        def __init__(self) -> None:
+        def __init__(self, dim: int = 768, with_pooler: bool = True, pooler_value: float = 7.0) -> None:
             self.device = "cpu"
             self.name_or_path = "mock-model"
+            self.dim = dim
+            self.with_pooler = with_pooler
+            self.pooler_value = pooler_value
 
         def to(self, device: str) -> MockPreTrainedModel:
             self.device = device
             return self
 
+        def eval(self) -> MockPreTrainedModel:
+            return self
+
         def forward(self, *args: Any, **kwargs: Any) -> Any:
-            # Simulate a last_hidden_state output for a transformer model
-            batch_size, seq_length = kwargs["input_ids"].shape
-            # Return a tensor of shape (batch_size, seq_length, 768)
-            return type(
-                "BaseModelOutputWithPoolingAndCrossAttentions",
-                (object,),
-                {
-                    "last_hidden_state": torch.rand(batch_size, seq_length, 768)  # Simulate 768 hidden units
-                },
-            )
-
-        def __call__(self, *args: Any, **kwargs: Any) -> Any:
-            # Simply call the forward method to simulate the same behavior as transformers models
-            return self.forward(*args, **kwargs)
+            input_ids = kwargs["input_ids"]
+            B, T = input_ids.shape
+            hidden = torch.arange(T, dtype=torch.float32, device=self.device).repeat(B, self.dim, 1).transpose(1, 2)
+            out = {"last_hidden_state": hidden}
+            if self.with_pooler:
+                out["pooler_output"] = torch.full((B, self.dim), self.pooler_value, device=self.device)
+            return type("BaseModelOutputWithPoolingAndCrossAttentions", (object,), out)()
+
+        __call__ = forward
 
     return cast(PreTrainedModel, MockPreTrainedModel())
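
As a quick standalone check (replaying the mock's tensor construction, not part of the commit): the mock's hidden state at position t is simply t, which is what makes the expected values in the pooling test below easy to read off.

import torch

B, T, dim = 2, 3, 768
hidden = torch.arange(T, dtype=torch.float32).repeat(B, dim, 1).transpose(1, 2)
assert hidden.shape == (B, T, dim)
assert torch.equal(hidden[0, :, 0], torch.tensor([0.0, 1.0, 2.0]))  # position index per token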

tests/test_distillation.py

Lines changed: 29 additions & 3 deletions
@@ -16,6 +16,7 @@
     distill_from_model,
     post_process_embeddings,
 )
+from model2vec.distill.inference import PoolingType, create_embeddings
 from model2vec.model import StaticModel
 
 try:
@@ -251,9 +252,9 @@ def test__post_process_embeddings(
     sif_weights = (sif_coefficient / (sif_coefficient + proba))[:, None]
 
     expected_zipf_embeddings = original_embeddings * sif_weights
-    assert np.allclose(
-        processed_embeddings, expected_zipf_embeddings, rtol=1e-5
-    ), "Zipf weighting not applied correctly"
+    assert np.allclose(processed_embeddings, expected_zipf_embeddings, rtol=1e-5), (
+        "Zipf weighting not applied correctly"
+    )
 
 
 @pytest.mark.parametrize(
@@ -288,3 +289,28 @@ def test_clean_and_create_vocabulary(
     # Ensure the expected warnings contain expected keywords like 'Removed', 'duplicate', or 'empty'
     for expected_warning in expected_warnings:
         assert any(expected_warning in logged_warning for logged_warning in logged_warnings)
+
+
+@pytest.mark.parametrize(
+    "pooling,with_pooler,expected_rows",
+    [
+        (PoolingType.MEAN, False, [1.0, 0.0]),  # len=3: mean(0,1,2)=1; len=1: mean(0) = 0
+        (PoolingType.LAST, False, [2.0, 0.0]),  # last of 3: 2; last of 1: 0
+        (PoolingType.CLS, False, [0.0, 0.0]),  # first position: 0
+        (PoolingType.CLS, True, [7.0, 7.0]),  # pooler_output is used
+    ],
+)
+def test_pooling_strategies(mock_transformer, pooling, with_pooler, expected_rows) -> None:
+    """Test different pooling strategies."""
+    mock_transformer.with_pooler = with_pooler
+    tokenized = [[10, 11, 12], [20]]
+    out = create_embeddings(
+        model=mock_transformer,
+        tokenized=tokenized,
+        device="cpu",
+        pad_token_id=0,
+        pooling=pooling,
+    )
+    dim = out.shape[1]
+    expected = np.stack([np.full((dim,), v, dtype=np.float32) for v in expected_rows])
+    assert np.allclose(out, expected, rtol=1e-6, atol=0.0)
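
Assuming a standard pytest setup for this repository, the new cases can be run in isolation with "pytest tests/test_distillation.py -k test_pooling_strategies". The expected rows follow from the mock fixture above: its hidden state at position t equals t, so a length-3 input gives 1.0 under mean pooling, 2.0 under last-token pooling, and 0.0 under CLS pooling, while 7.0 appears whenever the mock's pooler_output is used.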
