@@ -60,8 +60,12 @@ def distill_from_model(
6060 :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
6161 :param use_subword: DEPRECATED: This parameter has no effect. If it is not set to None, a warning is shown.
6262 :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
63- :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean" (default), "last", "first", or "pooler".
64- :return: A StaticModel
63+ :param pooling: The pooling strategy to use for creating embeddings. Can be one of:
64+ 'mean' (default): mean over all tokens. Robust and works well in most cases.
65+ 'last': use the last token's hidden state (often the [EOS] token). Common for decoder-style models.
66+ 'first': use the first token's hidden state ([CLS] token in BERT-style models).
67+ 'pooler': use the pooler output (if available). This is often a non-linear projection of the [CLS] token.
68+ :return: A StaticModel.
6569 :raises: ValueError if the vocabulary is empty after preprocessing.
6670
6771 """
@@ -259,7 +263,11 @@ def distill(
259263 :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
260264 :param use_subword: DEPRECATED: This parameter has no effect. If it is not set to None, a warning is shown.
261265 :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
262- :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean" (default), "last", "first", or "pooler".
266+ :param pooling: The pooling strategy to use for creating embeddings. Can be one of:
267+ 'mean' (default): mean over all tokens. Robust and works well in most cases.
268+ 'last': use the last token's hidden state (often the [EOS] token). Common for decoder-style models.
269+ 'first': use the first token's hidden state ([CLS] token in BERT-style models).
270+ 'pooler': use the pooler output (if available). This is often a non-linear projection of the [CLS] token.
263271 :return: A StaticModel.
264272
265273 """
0 commit comments