You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: model2vec/distill/distillation.py
+1-34Lines changed: 1 addition & 34 deletions
Original file line number
Diff line number
Diff line change
@@ -27,11 +27,9 @@ def distill_from_model(
27
27
vocabulary: list[str] |None=None,
28
28
device: str|None=None,
29
29
pca_dims: PCADimType=256,
30
-
apply_zipf: bool|None=None,
31
30
sif_coefficient: float|None=1e-4,
32
31
token_remove_pattern: str|None=r"\[unused\d+\]",
33
32
quantize_to: DType|str=DType.Float16,
34
-
use_subword: bool|None=None,
35
33
vocabulary_quantization: int|None=None,
36
34
pooling: PoolingType=PoolingType.MEAN,
37
35
) ->StaticModel:
@@ -51,14 +49,11 @@ def distill_from_model(
51
49
:param pca_dims: The number of components to use for PCA.
52
50
If this is None, we don't apply PCA.
53
51
If this is 'auto', we don't reduce dimensionality, but still apply PCA.
54
-
:param apply_zipf: DEPRECATED: This parameter used to control whether Zipf is applied.
55
-
Zipf weighting is now controlled by the sif_coefficient parameter. If this is set to None, no weighting is applied.
56
52
:param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied.
57
53
Should be a value > 0 and < 1.0. A value of 1e-4 is a good default.
58
54
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
59
55
If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error.
60
56
:param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
61
-
:param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
62
57
:param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
63
58
:param pooling: The pooling strategy to use for creating embeddings. Can be one of:
64
59
'mean' (default): mean over all tokens. Robust and works well in most cases.
@@ -69,13 +64,9 @@ def distill_from_model(
69
64
:raises: ValueError if the vocabulary is empty after preprocessing.
70
65
71
66
"""
72
-
ifuse_subwordisnotNone:
73
-
logger.warning(
74
-
"The `use_subword` parameter is deprecated and will be removed in the next release. It doesn't do anything."
"seq_length": 1000000, # Set this to a high value since we don't have a sequence length limit.
@@ -182,35 +172,19 @@ def distill_from_model(
182
172
183
173
184
174
def_validate_parameters(
185
-
apply_zipf: bool|None,
186
175
sif_coefficient: float|None,
187
176
token_remove_pattern: str|None,
188
177
) ->tuple[float|None, re.Pattern|None]:
189
178
"""
190
179
Validate the parameters passed to the distillation function.
191
180
192
-
:param apply_zipf: DEPRECATED: This parameter used to control whether Zipf is applied.
193
-
Zipf weighting is now controlled by the sif_coefficient parameter. If this is set to None, no weighting is applied.
194
181
:param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied.
195
182
Should be a value > 0 and < 1.0. A value of 1e-4 is a good default.
196
183
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
197
184
:return: A tuple of the SIF coefficient to use (or None) and the compiled token removal pattern (or None).
198
185
:raises: ValueError if the regex can't be compiled.
199
186
200
187
"""
201
-
ifapply_zipfisnotNone:
202
-
logger.warning(
203
-
"The `apply_zipf` parameter is deprecated and will be removed in the next release. "
204
-
"Zipf weighting is applied based on the sif_coefficient parameter. If this is set to None, "
205
-
"no weighting is applied."
206
-
)
207
-
ifapply_zipfandsif_coefficientisNone:
208
-
logger.warning("You set apply_zipf to True, but sif_coefficient is None. Setting sif_coefficient to 1e-4.")
209
-
sif_coefficient=1e-4
210
-
elifnotapply_zipf:
211
-
logger.warning("Because you set apply_zipf to False, we ignore the sif_coefficient parameter.")
212
-
sif_coefficient=None
213
-
214
188
ifsif_coefficientisnotNone:
215
189
ifnot0<sif_coefficient<1.0:
216
190
raiseValueError("SIF coefficient must be a value > 0 and < 1.0.")
@@ -230,12 +204,10 @@ def distill(
230
204
vocabulary: list[str] |None=None,
231
205
device: str|None=None,
232
206
pca_dims: PCADimType=256,
233
-
apply_zipf: bool|None=None,
234
207
sif_coefficient: float|None=1e-4,
235
208
token_remove_pattern: str|None=r"\[unused\d+\]",
236
209
trust_remote_code: bool=False,
237
210
quantize_to: DType|str=DType.Float16,
238
-
use_subword: bool|None=None,
239
211
vocabulary_quantization: int|None=None,
240
212
pooling: PoolingType=PoolingType.MEAN,
241
213
) ->StaticModel:
@@ -254,14 +226,11 @@ def distill(
254
226
:param pca_dims: The number of components to use for PCA.
255
227
If this is None, we don't apply PCA.
256
228
If this is 'auto', we don't reduce dimensionality, but still apply PCA.
257
-
:param apply_zipf: DEPRECATED: This parameter used to control whether Zipf is applied.
258
-
Zipf weighting is now controlled by the sif_coefficient parameter. If this is set to None, no weighting is applied.
259
229
:param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied.
260
230
Should be a value > 0 and < 1.0. A value of 1e-4 is a good default.
261
231
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
262
232
:param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components.
263
233
:param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
264
-
:param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
265
234
:param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
266
235
:param pooling: The pooling strategy to use for creating embeddings. Can be one of:
267
236
'mean' (default): mean over all tokens. Robust and works well in most cases.
0 commit comments