
Commit da79779

feat: Add SIF-like coef (#174)
* feat: token mean and sif weighting
* fix error
* rename
* fix tests
* feat: update config
* reviewer comments
* set normalize to True by default
* fix tests
* fix: bug and tests
1 parent af8ba05 commit da79779

2 files changed (+138, -46 lines changed)

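Before the diff, a minimal usage sketch of the new behavior, assuming the public `distill` entry point shown in the changes below; the model name is only an illustrative placeholder:

from model2vec.distill import distill

# SIF weighting is now on by default (sif_coefficient=1e-4); pass
# sif_coefficient=None to disable weighting. The distilled model also
# normalizes its output embeddings by default (config "normalize": True).
m2v_model = distill(
    model_name="BAAI/bge-base-en-v1.5",  # example model name (assumption)
    pca_dims=256,
    sif_coefficient=1e-4,
)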
model2vec/distill/distillation.py

Lines changed: 89 additions & 27 deletions
@@ -40,7 +40,8 @@ def distill_from_model(
     vocabulary: list[str] | None = None,
     device: str | None = None,
     pca_dims: PCADimType = 256,
-    apply_zipf: bool = True,
+    apply_zipf: bool | None = None,
+    sif_coefficient: float | None = 1e-4,
     use_subword: bool = True,
     token_remove_pattern: str | None = r"\[unused\d+\]",
 ) -> StaticModel:
@@ -60,30 +61,19 @@ def distill_from_model(
     :param pca_dims: The number of components to use for PCA.
         If this is None, we don't apply PCA.
         If this is 'auto', we don't reduce dimensionality, but still apply PCA.
-    :param apply_zipf: Whether to apply Zipf weighting to the embeddings.
+    :param apply_zipf: DEPRECATED: This parameter used to control whether Zipf is applied.
+        Zipf weighting is now controlled by the sif_coefficient parameter. If this is set to None, no weighting is applied.
+    :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied.
+        Should be a value > 0 and < 1.0. A value of 1e-4 is a good default.
     :param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
     :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
         If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error.
-    :raises: ValueError if the PCA dimension is larger than the number of dimensions in the embeddings.
-    :raises: ValueError if the vocabulary contains duplicate tokens.
-    :raises: ValueError if the regex can't be compiled.
-    :raises: ValueError if the vocabulary is empty after token removal.
     :return: A StaticModel
 
     """
-    device = select_optimal_device(device)
-    if not use_subword and vocabulary is None:
-        raise ValueError(
-            "You must pass a vocabulary if you don't use subword tokens. Either pass a vocabulary, or set use_subword to True."
-        )
-
-    if vocabulary and isinstance(tokenizer.backend_tokenizer.model, (BPE, Unigram)):
-        raise ValueError(
-            "You passed a vocabulary, but the model you are using does not use a WordPiece tokenizer. "
-            "This is not supported yet."
-            "Feel free to open an issue if this is a blocker: https://github.com/MinishLab/model2vec/issues"
-        )
+    sif_coefficient = _validate_parameters(tokenizer, vocabulary, apply_zipf, sif_coefficient, use_subword)
 
+    device = select_optimal_device(device)
     # Make a base list of tokens.
     tokens: list[str] = []
     if use_subword:
@@ -129,7 +119,7 @@ def distill_from_model(
         logger.warning("Didn't create any token embeddings as all tokens were duplicates or empty.")
 
     # Post process the embeddings by applying PCA and Zipf weighting.
-    embeddings = _post_process_embeddings(np.asarray(embeddings), pca_dims, apply_zipf)
+    embeddings = _post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
 
     model_name = getattr(model, "name_or_path", "")
 
@@ -139,8 +129,10 @@ def distill_from_model(
         "tokenizer_name": model_name,
         "apply_pca": pca_dims,
         "apply_zipf": apply_zipf,
+        "sif_coefficient": sif_coefficient,
         "hidden_dim": embeddings.shape[1],
         "seq_length": 1000000,  # Set this to a high value since we don't have a sequence length limit.
+        "normalize": True,
     }
 
     if os.path.exists(model_name):
@@ -157,10 +149,71 @@ def distill_from_model(
         language = None
 
     return StaticModel(
-        vectors=embeddings, tokenizer=new_tokenizer, config=config, base_model_name=model_name, language=language
+        vectors=embeddings,
+        tokenizer=new_tokenizer,
+        config=config,
+        base_model_name=model_name,
+        language=language,
+        normalize=True,
     )
 
 
+def _validate_parameters(
+    tokenizer: PreTrainedTokenizerFast,
+    vocabulary: list[str] | None,
+    apply_zipf: bool | None,
+    sif_coefficient: float | None,
+    use_subword: bool,
+) -> float | None:
+    """
+    Validate the parameters passed to the distillation function.
+
+    :param tokenizer: The tokenizer to use.
+    :param vocabulary: The vocabulary to use.
+    :param apply_zipf: DEPRECATED: This parameter used to control whether Zipf is applied.
+        Zipf weighting is now controlled by the sif_coefficient parameter. If this is set to None, no weighting is applied.
+    :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied.
+        Should be a value > 0 and < 1.0. A value of 1e-4 is a good default.
+    :param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
+    :return: The SIF coefficient to use.
+    :raises: ValueError if the PCA dimension is larger than the number of dimensions in the embeddings.
+    :raises: ValueError if the vocabulary contains duplicate tokens.
+    :raises: ValueError if the regex can't be compiled.
+    :raises: ValueError if the vocabulary is empty after token removal.
+
+    """
+    if apply_zipf is not None:
+        logger.warning(
+            "The `apply_zipf` parameter is deprecated and will be removed in the next release. "
+            "Zipf weighting is applied based on the sif_coefficient parameter. If this is set to None, "
+            "no weighting is applied."
+        )
+        if apply_zipf and sif_coefficient is None:
+            logger.warning("You set apply_zipf to True, but sif_coefficient is None. Setting sif_coefficient to 1e-4.")
+            sif_coefficient = 1e-4
+        elif not apply_zipf:
+            logger.warning("Because you set apply_zipf to False, we ignore the sif_coefficient parameter.")
+            sif_coefficient = None
 
+    if sif_coefficient is not None:
+        if not 0 < sif_coefficient < 1.0:
+            raise ValueError("SIF coefficient must be a value > 0 and < 1.0.")
 
+    if not use_subword and vocabulary is None:
+        raise ValueError(
+            "You must pass a vocabulary if you don't use subword tokens. Either pass a vocabulary, or set use_subword to True."
+        )
+
+    if vocabulary and isinstance(tokenizer.backend_tokenizer.model, (BPE, Unigram)):
+        raise ValueError(
+            "You passed a vocabulary, but the model you are using does not use a WordPiece tokenizer. "
+            "This is not supported yet."
+            "Feel free to open an issue if this is a blocker: https://github.com/MinishLab/model2vec/issues"
+        )
+
+    return sif_coefficient
+
+
 def _remove_tokens_and_embeddings(
     tokenizer: PreTrainedTokenizerFast, token_remove_pattern: str | None, tokens: list[str], embeddings: np.ndarray
 ) -> tuple[Tokenizer, np.ndarray]:
@@ -201,7 +254,8 @@ def distill(
     vocabulary: list[str] | None = None,
     device: str | None = None,
     pca_dims: PCADimType = 256,
-    apply_zipf: bool = True,
+    apply_zipf: bool | None = None,
+    sif_coefficient: float | None = 1e-4,
     use_subword: bool = True,
     token_remove_pattern: str | None = r"\[unused\d+\]",
     trust_remote_code: bool = False,
@@ -221,7 +275,10 @@ def distill(
     :param pca_dims: The number of components to use for PCA.
         If this is None, we don't apply PCA.
         If this is 'auto', we don't reduce dimensionality, but still apply PCA.
-    :param apply_zipf: Whether to apply Zipf weighting to the embeddings.
+    :param apply_zipf: DEPRECATED: This parameter used to control whether Zipf is applied.
+        Zipf weighting is now controlled by the sif_coefficient parameter. If this is set to None, no weighting is applied.
+    :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied.
+        Should be a value > 0 and < 1.0. A value of 1e-4 is a good default.
     :param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
     :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
     :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components.
@@ -240,11 +297,14 @@ def distill(
         apply_zipf=apply_zipf,
         use_subword=use_subword,
         token_remove_pattern=token_remove_pattern,
+        sif_coefficient=sif_coefficient,
     )
 
 
-def _post_process_embeddings(embeddings: np.ndarray, pca_dims: PCADimType, apply_zipf: bool) -> np.ndarray:
-    """Post process embeddings by applying PCA and Zipf weighting."""
+def _post_process_embeddings(
+    embeddings: np.ndarray, pca_dims: PCADimType, sif_coefficient: float | None = 1e-4
+) -> np.ndarray:
+    """Post process embeddings by applying PCA and SIF weighting, estimating token frequencies through Zipf's law."""
     if pca_dims is not None:
         if pca_dims == "auto":
             pca_dims = embeddings.shape[1]
@@ -276,9 +336,11 @@ def _post_process_embeddings(embeddings: np.ndarray, pca_dims: PCADimType, apply_zipf: bool) -> np.ndarray:
             logger.info(f"Explained variance ratio: {explained_variance_ratio:.3f}.")
             logger.info(f"Explained variance: {explained_variance:.3f}.")
 
-    if apply_zipf:
-        logger.info("Applying Zipf weighting")
-        embeddings *= np.log(1 + np.arange(embeddings.shape[0]))[:, None]
+    if sif_coefficient is not None:
+        logger.info("Estimating word frequencies using Zipf's law, and then applying SIF.")
+        inv_rank = 1 / (np.arange(2, embeddings.shape[0] + 2))
+        proba = inv_rank / np.sum(inv_rank)
+        embeddings *= (sif_coefficient / (sif_coefficient + proba))[:, None]
 
     return embeddings
 
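The weighting above replaces the old log-rank Zipf weights with SIF weights computed from Zipf-estimated frequencies: the probability of the token at rank r is taken proportional to 1/r (ranks start at 2), and each embedding row is scaled by a / (a + p_r) with a = sif_coefficient, the smoothed inverse frequency scheme of Arora et al. (2017). A standalone sketch of the same computation; the helper name is invented for illustration:

import numpy as np

def zipf_sif_weights(vocab_size: int, sif_coefficient: float = 1e-4) -> np.ndarray:
    """Sketch mirroring the weighting in _post_process_embeddings.

    Frequencies are estimated with Zipf's law (p_r proportional to 1/r,
    ranks starting at 2), then mapped to SIF weights a / (a + p_r).
    """
    inv_rank = 1 / np.arange(2, vocab_size + 2)
    proba = inv_rank / np.sum(inv_rank)
    return sif_coefficient / (sif_coefficient + proba)

weights = zipf_sif_weights(5)
# The most frequent (lowest-rank) tokens receive the smallest weights:
assert np.all(np.diff(weights) > 0)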
tests/test_distillation.py

Lines changed: 49 additions & 19 deletions
@@ -160,22 +160,29 @@ def test_distill_removal_pattern(
 
 
 @pytest.mark.parametrize(
-    "vocabulary, use_subword, pca_dims, apply_zipf, expected_shape",
+    "vocabulary, use_subword, pca_dims, apply_zipf, sif_coefficient, expected_shape",
     [
-        (None, True, 256, True, (29528, 256)),  # Output vocab with subwords, PCA applied
+        (None, True, 256, True, None, (29528, 256)),  # Output vocab with subwords, PCA applied
         (
             ["wordA", "wordB"],
             False,
             4,
             False,
+            None,
             (7, 4),
         ),  # Custom vocab without subword, PCA applied
-        (None, True, "auto", False, (29528, 768)),  # Subword, PCA set to 'auto'
-        (None, True, 1024, False, (29528, 768)),  # Subword, PCA set to high number.
-        (["wordA", "wordB"], True, 4, False, (29530, 4)),  # Custom vocab with subword, PCA applied
-        (None, True, None, True, (29528, 768)),  # No PCA applied
-        (["wordA", "wordB"], False, 4, True, (7, 4)),  # Custom vocab without subwords, PCA and Zipf applied
-        (None, False, 256, True, None),  # use_subword = False without passing a vocabulary should raise an error
+        (None, True, "auto", False, None, (29528, 768)),  # Subword, PCA set to 'auto'
+        (None, True, "auto", True, 1e-4, (29528, 768)),  # Subword, PCA set to 'auto'
+        (None, True, "auto", False, 1e-4, (29528, 768)),  # Subword, PCA set to 'auto'
+        (None, True, "auto", True, 0, None),  # SIF coefficient too low
+        (None, True, "auto", True, 1, None),  # SIF coefficient too high
+        (None, True, "auto", False, 0, (29528, 768)),  # SIF too low, but apply_zipf is False
+        (None, True, "auto", False, 1, (29528, 768)),  # SIF too high, but apply_zipf is False
+        (None, True, 1024, False, None, (29528, 768)),  # Subword, PCA set to high number.
+        (["wordA", "wordB"], True, 4, False, None, (29530, 4)),  # Custom vocab with subword, PCA applied
+        (None, True, None, True, None, (29528, 768)),  # No PCA applied
+        (["wordA", "wordB"], False, 4, True, None, (7, 4)),  # Custom vocab without subwords, PCA and Zipf applied
+        (None, False, 256, True, None, None),  # use_subword = False without passing a vocabulary should raise an error
     ],
 )
 @patch.object(import_module("model2vec.distill.distillation"), "model_info")
@@ -188,6 +195,7 @@ def test_distill(
     use_subword: bool,
     pca_dims: int | None,
     apply_zipf: bool,
+    sif_coefficient: float | None,
     expected_shape: tuple[int, int],
 ) -> None:
     """Test distill function with different parameters."""
@@ -208,7 +216,25 @@ def test_distill(
             pca_dims=pca_dims,
             apply_zipf=apply_zipf,
             use_subword=use_subword,
+            sif_coefficient=sif_coefficient,
         )
+    elif (
+        apply_zipf is not None
+        and apply_zipf
+        and sif_coefficient is not None
+        and (sif_coefficient <= 0 or sif_coefficient >= 1)
+    ):
+        with pytest.raises(ValueError):
+            static_model = distill(
+                model_name=model_name,
+                vocabulary=vocabulary,
+                device="cpu",
+                pca_dims=pca_dims,
+                apply_zipf=apply_zipf,
+                use_subword=use_subword,
+                sif_coefficient=sif_coefficient,
+            )
+
     else:
         # Call the distill function with the parametrized inputs
         static_model = distill(
@@ -218,6 +244,7 @@ def test_distill(
             pca_dims=pca_dims,
             apply_zipf=apply_zipf,
             use_subword=use_subword,
+            sif_coefficient=sif_coefficient,
         )
 
         # Assert the model is correctly generated
@@ -240,35 +267,38 @@ def test_missing_modelinfo(
 
 
 @pytest.mark.parametrize(
-    "embeddings, pca_dims, apply_zipf, expected_shape",
+    "embeddings, pca_dims, sif_coefficient, expected_shape",
     [
-        (rng.random((1000, 768)), 256, False, (1000, 256)),  # PCA applied correctly
-        (rng.random((1000, 768)), None, False, (1000, 768)),  # No PCA applied, dimensions remain unchanged
-        (rng.random((1000, 768)), 256, True, (1000, 256)),  # PCA and Zipf applied
-        (rng.random((10, 768)), 256, False, (10, 768)),  # PCA dims higher than vocab size, no PCA applied
+        (rng.random((1000, 768)), 256, None, (1000, 256)),  # PCA applied correctly
+        (rng.random((1000, 768)), None, None, (1000, 768)),  # No PCA applied, dimensions remain unchanged
+        (rng.random((1000, 768)), 256, 1e-4, (1000, 256)),  # PCA and Zipf applied
+        (rng.random((10, 768)), 256, 1e-4, (10, 768)),  # PCA dims higher than vocab size, no PCA applied
     ],
 )
 def test__post_process_embeddings(
-    embeddings: np.ndarray, pca_dims: int, apply_zipf: bool, expected_shape: tuple[int, int]
+    embeddings: np.ndarray, pca_dims: int, sif_coefficient: float | None, expected_shape: tuple[int, int]
 ) -> None:
     """Test the _post_process_embeddings function."""
     original_embeddings = embeddings.copy()  # Copy embeddings to compare later
 
     # Test that the function raises an error if the PCA dims are larger than the number of dimensions
     if pca_dims and pca_dims > embeddings.shape[1]:
         with pytest.raises(ValueError):
-            _post_process_embeddings(embeddings, pca_dims, False)
+            _post_process_embeddings(embeddings, pca_dims, None)
 
-    processed_embeddings = _post_process_embeddings(embeddings, pca_dims, apply_zipf)
+    processed_embeddings = _post_process_embeddings(embeddings, pca_dims, sif_coefficient)
 
     # Assert the shape is correct
     assert processed_embeddings.shape == expected_shape
 
     # If Zipf weighting is applied, compare the original and processed embeddings
     # and check the weights are applied correctly
-    if apply_zipf and pca_dims is None:
-        zipf_weights = np.log(1 + np.arange(embeddings.shape[0]))[:, None]
-        expected_zipf_embeddings = original_embeddings * zipf_weights
+    if sif_coefficient and pca_dims is None:
+        inv_rank = 1 / (np.arange(2, embeddings.shape[0] + 2))
+        proba = inv_rank / np.sum(inv_rank)
+        sif_weights = (sif_coefficient / (sif_coefficient + proba))[:, None]
+
+        expected_zipf_embeddings = original_embeddings * sif_weights
         assert np.allclose(
             processed_embeddings, expected_zipf_embeddings, rtol=1e-5
         ), "Zipf weighting not applied correctly"

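For clarity, the resolution order implemented by `_validate_parameters` in the first file, condensed into a sketch (the function name `resolve_sif` is invented for illustration; logging and the tokenizer/vocabulary checks are omitted):

# Sketch of the deprecated-flag resolution, mirroring the diff above.
def resolve_sif(apply_zipf: bool | None, sif_coefficient: float | None) -> float | None:
    if apply_zipf is not None:  # deprecated parameter was passed explicitly
        if apply_zipf and sif_coefficient is None:
            sif_coefficient = 1e-4  # True restores the default coefficient
        elif not apply_zipf:
            sif_coefficient = None  # False disables weighting outright
    if sif_coefficient is not None and not 0 < sif_coefficient < 1.0:
        raise ValueError("SIF coefficient must be a value > 0 and < 1.0.")
    return sif_coefficient

assert resolve_sif(None, 1e-4) == 1e-4   # new default path
assert resolve_sif(False, 1e-4) is None  # legacy False wins over the coefficient
assert resolve_sif(True, None) == 1e-4   # legacy True re-enables the default

This matches the test parametrization: an out-of-range coefficient raises only when apply_zipf is truthy, because apply_zipf=False resets the coefficient to None before the range check.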