Updated docstrings

Pringled · Pringled · commit 658791816a2f · 2025-10-03T12:28:18.000+02:00
diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py
@@ -60,8 +60,12 @@ def distill_from_model(
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
     :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
-    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean" (default), "last", "first", or "pooler".
-    :return: A StaticModel
+    :param pooling: The pooling strategy to use for creating embeddings. Can be one of:
+        'mean' (default): mean over all tokens. Robust and works well in most cases.
+        'last': use the last token's hidden state (often the [EOS] token). Common for decoder-style models.
+        'first': use the first token's hidden state ([CLS] token in BERT-style models).
+        'pooler': use the pooler output (if available). This is often a non-linear projection of the [CLS] token's hidden state.
+    :return: A StaticModel.
     :raises: ValueError if the vocabulary is empty after preprocessing.
 
     """
@@ -259,7 +263,11 @@ def distill(
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
     :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
-    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean" (default), "last", "first", or "pooler".
+    :param pooling: The pooling strategy to use for creating embeddings. Can be one of:
+        'mean' (default): mean over all tokens. Robust and works well in most cases.
+        'last': use the last token's hidden state (often the [EOS] token). Common for decoder-style models.
+        'first': use the first token's hidden state ([CLS] token in BERT-style models).
+        'pooler': use the pooler output (if available). This is often a non-linear projection of the [CLS] token's hidden state.
-    :return: A StaticModel
+    :return: A StaticModel.
 
     """