Commit 2a20547

gabrielspmoreira, marcromeyn, edknv authored
Refactory/fix of sampled softmax on PopularityBasedSamplerV2 / ContrastiveOutput / Candidate (#1051)
Co-authored-by: Marc Romeyn <[email protected]>
Co-authored-by: edknv <[email protected]>
1 parent 78f2732 commit 2a20547

File tree

6 files changed: +157 -21 lines changed

merlin/models/tf/models/retrieval.py

Lines changed: 1 addition & 0 deletions
@@ -579,6 +579,7 @@ def YoutubeDNNRetrievalModelV2(
         negative_samplers=PopularityBasedSamplerV2(
             max_num_samples=num_sampled, max_id=num_classes - 1, min_id=min_sampled_id
         ),
+        logq_sampling_correction=True,
     )

     return RetrievalModelV2(query=query, output=outputs)
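With this flag enabled by default, the YouTube DNN retrieval model now trains with a logQ-corrected sampled softmax out of the box. A minimal usage sketch (the dataset name and hyperparameter values are assumptions for illustration, not part of this commit):

import merlin.models.tf as mm

# `train` is assumed to be a merlin.io.Dataset with item-id and user features
model = mm.YoutubeDNNRetrievalModelV2(schema=train.schema, num_sampled=100)
model.compile(optimizer="adam")
model.fit(train, batch_size=1024, epochs=1)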

merlin/models/tf/outputs/contrastive.py

Lines changed: 52 additions & 9 deletions
@@ -14,7 +14,8 @@
 # limitations under the License.
 #
 import logging
-from typing import List, Optional, Protocol, Union, runtime_checkable
+import warnings
+from typing import List, Optional, Protocol, Tuple, Union, runtime_checkable

 import tensorflow as tf
 from tensorflow.keras.layers import Layer
@@ -77,6 +78,20 @@ class ContrastiveOutput(ModelOutput):
     store_negative_ids: bool, optional
         Whether to store negative ids for post-processing
         by default False
+    logq_sampling_correction: bool, optional
+        The logQ correction is a standard technique for
+        sampled softmax and popularity-biased sampling.
+        It subtracts the log of the expected count/prob of the
+        positive and negative samples from the logits, so that
+        popular items are not over-penalized for being sampled
+        more often as negatives.
+        It can be enabled if a single negative sampler is provided
+        and if that sampler provides the sampling probabilities
+        (i.e. implements with_sampling_probs()).
+        An alternative way of performing the logQ correction is
+        ContrastiveOutput(..., post=PopularityLogitsCorrection(item_frequencies)),
+        where you need to provide the item frequency probability distribution (prior).
+        Default is False.

     References:
     ----------
@@ -132,6 +147,7 @@ def __init__(
         query_name: str = "query",
         candidate_name: str = "candidate",
         store_negative_ids: bool = False,
+        logq_sampling_correction: Optional[bool] = False,
         **kwargs,
     ):
         self.col_schema = None
@@ -168,6 +184,7 @@ def __init__(
         self.query_name = query_name
         self.candidate_name = candidate_name
         self.store_negative_ids = store_negative_ids
+        self.logq_sampling_correction = logq_sampling_correction

         self.target_name = kwargs.pop("target", target_name)
         super().__init__(
@@ -223,7 +240,9 @@ def call_contrastive(self, inputs, features, targets, training=False, testing=Fa
         positive = Candidate(id=positive_id, metadata={**features}).with_embedding(
             positive_embedding
         )
-        negative = self.sample_negatives(positive, features, training=training, testing=testing)
+        negative, positive = self.sample_negatives(
+            positive, features, training=training, testing=testing
+        )
         if self.has_candidate_weights and (
             positive.id.shape != negative.id.shape or positive != negative
         ):
@@ -264,6 +283,18 @@ def outputs(
             tf.multiply(query_embedding, positive.embedding), keepdims=True, axis=-1
         )

+        if self.logq_sampling_correction:
+            if positive.sampling_prob is None or negative.sampling_prob is None:
+                warnings.warn(
+                    "The logQ sampling correction is enabled, but sampling probs were not found "
+                    "for both positive and negative candidates",
+                    RuntimeWarning,
+                )
+
+            epsilon = 1e-16
+            positive_scores -= tf.math.log(positive.sampling_prob + epsilon)
+            negative_scores -= tf.math.log(tf.transpose(negative.sampling_prob + epsilon))
+
         if self.downscore_false_negatives:
             negative_scores, _ = tf_utils.rescore_false_negatives(
                 positive.id, negative.id, negative_scores, self.false_negative_score
@@ -295,7 +326,7 @@ def sample_negatives(
         features: TabularData,
         training=False,
         testing=False,
-    ) -> Candidate:
+    ) -> Tuple[Candidate, Candidate]:
         """Method to sample negatives from `self.negative_samplers`

         Parameters
@@ -311,16 +342,28 @@

         Returns
         -------
-        Items
-            Class containing the sampled negative ids
+        Tuple[Candidate, Candidate]
+            Tuple with the sampled negative candidates and the provided
+            positive candidates, both with the sampling probability set
         """
         sampling_kwargs = {"training": training, "testing": testing, "features": features}
         candidates: List[Candidate] = []
+
+        if self.logq_sampling_correction and len(self.negative_samplers) > 1:
+            raise ValueError(
+                "It is only possible to apply the logQ sampling correction "
+                "(logq_sampling_correction=True) when only one negative sampler is provided."
+            )
+
         for sampler in self.negative_samplers:
-            sampled: Candidate = tf_utils.call_layer(sampler, positive, **sampling_kwargs)
+            neg_samples: Candidate = tf_utils.call_layer(sampler, positive, **sampling_kwargs)
+
+            # Adds to the positive and negative candidates their sampling probs from the sampler
+            positive = sampler.with_sampling_probs(positive)
+            neg_samples = sampler.with_sampling_probs(neg_samples)

-            if sampled.id is not None:
-                candidates.append(sampled)
+            if neg_samples.id is not None:
+                candidates.append(neg_samples)
             else:
                 LOG.warn(
                     f"The sampler {type(sampler).__name__} returned no samples for this batch."
@@ -336,7 +379,7 @@
         for neg in candidates[1:]:
             negatives += neg

-        return negatives
+        return negatives, positive

     def embedding_lookup(self, ids: tf.Tensor):
         return self.to_call.embedding_lookup(tf.squeeze(ids))
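To make the new correction branch concrete, here is a minimal standalone sketch of the logits adjustment that outputs() now applies (shapes and values are invented for illustration, not taken from this commit):

import tensorflow as tf

batch_size, num_negatives = 4, 20
positive_scores = tf.random.uniform((batch_size, 1))        # dot(query, positive item)
negative_scores = tf.random.uniform((batch_size, num_negatives))
positive_probs = tf.random.uniform((batch_size, 1))         # sampler prob of each positive
negative_probs = tf.random.uniform((num_negatives, 1))      # sampler prob of each negative

epsilon = 1e-16  # guards against log(0)
# logQ correction: subtract the log sampling probability from the logits so that
# popular items, which are sampled as negatives more often, are not over-penalized.
positive_scores -= tf.math.log(positive_probs + epsilon)
negative_scores -= tf.math.log(tf.transpose(negative_probs + epsilon))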

merlin/models/tf/outputs/sampling/base.py

Lines changed: 14 additions & 3 deletions
@@ -30,13 +30,16 @@ class Candidate(NamedTuple):
     ----------
     id : tf.Tensor
         The tensor of item ids
+    sampling_prob : tf.Tensor
+        Useful for logQ correction, based on the sampling distribution
     metadata:
         dictionary of tensors containing meta information
         about items such as item embeddings and item category
     """

     id: tf.Tensor
     metadata: Dict[str, tf.Tensor]
+    sampling_prob: Optional[tf.Tensor] = None

     @property
     def embedding(self) -> tf.Tensor:
@@ -51,6 +54,9 @@ def with_embedding(self, embedding: tf.Tensor) -> "Candidate":

         return self

+    def with_sampling_prob(self, sampling_prob: tf.Tensor) -> "Candidate":
+        return Candidate(id=self.id, metadata=self.metadata, sampling_prob=sampling_prob)
+
     def __add__(self, other):
         metadata = {}
         for key in self.metadata:
@@ -68,12 +74,12 @@ def shape(self) -> "Candidate":
     def __repr__(self):
         metadata = {key: str(val) for key, val in self.metadata.items()}

-        return f"Candidate({self.id}, {metadata})"
+        return f"Candidate({self.id}, {self.sampling_prob}, {metadata})"

     def __str__(self):
         metadata = {key: str(val) for key, val in self.metadata.items()}

-        return f"Candidate({self.id}, {metadata})"
+        return f"Candidate({self.id}, {self.sampling_prob}, {metadata})"

     def __eq__(self, other) -> bool:
         if self.id.shape != other.id.shape:
@@ -84,15 +90,17 @@ def __eq__(self, other) -> bool:
     def get_config(self):
         return {
             "id": self.id,
+            "sampling_prob": self.sampling_prob,
             "metadata": self.metadata,
         }

     @classmethod
     def from_config(cls, config):
         ids = config["config"]["id"]
+        sampling_prob = config["config"]["sampling_prob"]
         metadata = config["config"]["metadata"]

-        return cls(ids, metadata)
+        return cls(ids, metadata, sampling_prob)


 negative_sampling_registry: Registry = Registry.class_registry("tf.negative_sampling")
@@ -139,6 +147,9 @@ def add(self, items: Candidate):
     def sample(self) -> Candidate:
         raise NotImplementedError()

+    def with_sampling_probs(self, items: Candidate) -> Candidate:
+        return items
+
     @property
     def max_num_samples(self) -> int:
         return self._max_num_samples
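A small illustration of the extended named tuple (ids and probabilities invented). Because NamedTuple fields are immutable, with_sampling_prob() returns a new Candidate rather than mutating in place:

import tensorflow as tf
from merlin.models.tf.outputs.sampling.base import Candidate

ids = tf.constant([[3], [7]], dtype=tf.int64)
cand = Candidate(id=ids, metadata={})    # sampling_prob defaults to None
probs = tf.constant([[0.21], [0.09]])
cand = cand.with_sampling_prob(probs)    # copy with sampling_prob set
print(cand.sampling_prob)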

merlin/models/tf/outputs/sampling/popularity.py

Lines changed: 85 additions & 6 deletions
@@ -23,9 +23,25 @@
 @tf.keras.utils.register_keras_serializable(package="merlin.models")
 class PopularityBasedSamplerV2(CandidateSampler):
     """
-    Provides a popularity-based negative sampling for the softmax layer
+    Provides popularity-based negative sampling for sampled softmax [1]_ [2]_
     to ensure training efficiency when the catalog of items is very large.
-    The capacity of the queue is fixed and is equal to the catalog size.
+    Items are sampled from the whole catalog. The sampler can also provide
+    the sampling probabilities of both positive and negative candidates,
+    which are required by the logQ correction of sampled softmax.
+    This class does not require the actual frequency of items. It assumes
+    that item ids are sorted by frequency and follow a long-tail distribution,
+    and uses tf.random.log_uniform_candidate_sampler() to sample the candidate ids.
+
+    References
+    ----------
+    .. [1] Yoshua Bengio and Jean-Sébastien Sénécal. 2003. Quick Training of Probabilistic
+       Neural Nets by Importance Sampling. In Proceedings of the Conference on Artificial
+       Intelligence and Statistics (AISTATS).
+
+    .. [2] Y. Bengio and J. S. Senecal. 2008. Adaptive Importance Sampling to Accelerate
+       Training of a Neural Probabilistic Language Model. Trans. Neur. Netw. 19, 4 (April
+       2008), 713–722. https://doi.org/10.1109/TNN.2007.912312
+

     Parameters
     ----------
@@ -38,6 +54,8 @@ class PopularityBasedSamplerV2(CandidateSampler):
         Defaults to 0.
     max_num_samples: int
         The number of unique negatives to sample at each batch.
+    unique: bool, optional
+        Whether to return unique candidate ids or allow repeated ones. Defaults to True.
     seed: int
         Fix the random values returned by the sampler to ensure reproducibility
         Defaults to None
@@ -48,13 +66,17 @@ def __init__(
         max_id: int,
         min_id: int = 0,
         max_num_samples: int = 10,
+        unique: Optional[bool] = True,
         seed: Optional[int] = None,
         **kwargs,
     ):
         super().__init__(max_num_samples=max_num_samples, **kwargs)
         self.max_id = max_id
         self.min_id = min_id
         self.seed = seed
+        self.unique = unique
+
+        self.sampling_dist = self.get_sampling_distribution()

         assert (
             self.max_num_samples <= self.max_id
@@ -91,22 +113,79 @@ def sample(self) -> Candidate:
         Items
             The negative items ids
         """
-        sampled_ids, _, _ = tf.random.log_uniform_candidate_sampler(
+        (
+            sampled_ids,
+            _,
+            _,
+        ) = tf.random.log_uniform_candidate_sampler(
+            # This is just a placeholder for true_classes.
+            # The positive ids should be provided here if we wanted
+            # the expected count probs returned.
+            # We rather make use of the CandidateSampler.with_sampling_probs()
+            # method to get the sampling probs of positives and negatives.
             true_classes=tf.ones((1, 1), dtype=tf.int64),
             num_true=1,
             num_sampled=self.max_num_samples,
-            unique=True,
+            unique=self.unique,
             range_max=self.max_id - self.min_id,
             seed=self.seed,
         )
-
         # Shifting the sampled ids to ignore the first ids (usually reserved for nulls, OOV)
         sampled_ids += self.min_id
-
         sampled_ids = tf.expand_dims(sampled_ids, -1)

+        sampled_ids = tf.stop_gradient(sampled_ids)
+
         return Candidate(id=sampled_ids, metadata={})

+    def get_sampling_distribution(self) -> tf.Tensor:
+        """Returns the approximated distribution used to sample items
+        by tf.random.log_uniform_candidate_sampler()
+
+        Returns
+        -------
+        tf.Tensor
+            Probability of each item to be sampled
+        """
+        log_indices = tf.math.log(tf.range(1.0, self.max_id - self.min_id + 2.0, 1.0))
+        sampling_probs = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
+
+        if self.unique:
+            # Below is a more numerically stable implementation of the probability
+            # of sampling an item at least once (suitable for sampling unique items):
+            # P(item is sampled at least once) = 1 - P(item is not sampled)^num_trials
+            # where P(item is not sampled) = 1-p and p is the probability
+            # of the item being sampled in a single trial
+            sampling_probs = -tf.math.expm1(self.max_num_samples * tf.math.log1p(-sampling_probs))
+
+        # Shifting probs if the first values of the item id mapping table are reserved
+        if self.min_id > 0:
+            sampling_probs = tf.concat(
+                [tf.zeros([self.min_id], dtype=sampling_probs.dtype), sampling_probs], axis=0
+            )
+
+        sampling_probs = tf.stop_gradient(sampling_probs)
+
+        return sampling_probs
+
+    def with_sampling_probs(self, items: Candidate) -> Candidate:
+        """Returns a copy of the Candidate named tuple with
+        the sampling_prob set.
+
+        Parameters
+        ----------
+        items : Candidate
+            Positive or negative candidate items
+
+        Returns
+        -------
+        Candidate
+            Candidate items with the sampling probability set
+        """
+        sampling_probs = tf.gather(self.sampling_dist, items.id)
+        items_with_sampling_prob = items.with_sampling_prob(sampling_probs)
+        return items_with_sampling_prob
+
     def get_config(self):
         config = super().get_config()
         config["max_id"] = self.max_id

merlin/models/tf/transforms/bias.py

Lines changed: 2 additions & 1 deletion
@@ -82,7 +82,8 @@ class PopularityLogitsCorrection(Block):
         where `item_prob = item_freq_count / sum(item_freq_count)` is
         a probability distribution of the item frequency. In a nutshell,
         the logQ correction aims to increase the prediction scores (logits)
-        for infrequent items and decrease the ones for frequent items.
+        for infrequent items and decrease the ones for frequent items, so
+        that frequent items are not over-penalized for being sampled more often.

     References
     ----------
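This block is the alternative logQ path mentioned in the ContrastiveOutput docstring above, for when an item frequency prior is available. A usage sketch (the schema column and frequency values are invented; the constructor call follows the pattern quoted in that docstring):

import tensorflow as tf
import merlin.models.tf as mm
from merlin.models.tf.outputs.sampling.popularity import PopularityBasedSamplerV2
from merlin.models.tf.transforms.bias import PopularityLogitsCorrection

# Hypothetical frequency prior over item ids (index = item id)
item_freq_probs = tf.constant([0.0, 0.5, 0.3, 0.15, 0.05])

output = mm.ContrastiveOutput(
    schema["item_id"],  # assumed: the item-id column of your dataset schema
    negative_samplers=PopularityBasedSamplerV2(max_id=4, max_num_samples=2, min_id=1),
    post=PopularityLogitsCorrection(item_freq_probs),
)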

tests/unit/tf/outputs/test_contrastive.py

Lines changed: 3 additions & 2 deletions
@@ -101,7 +101,7 @@ def test_two_tower_constrastive_with_logq_correction(ecommerce_data: Dataset):


 @pytest.mark.parametrize("run_eagerly", [True, False])
-def test_contrastive_output(ecommerce_data: Dataset, run_eagerly):
+def test_contrastive_output_with_sampled_softmax(ecommerce_data: Dataset, run_eagerly):
     schema = ecommerce_data.schema
     schema["item_category"] = schema["item_category"].with_tags(
         schema["item_category"].tags + "target"
@@ -112,7 +112,8 @@ def test_contrastive_output(ecommerce_data: Dataset, run_eagerly):
         mm.MLPBlock([8]),
         mm.ContrastiveOutput(
             schema["item_category"],
-            negative_samplers=PopularityBasedSamplerV2(max_id=100, max_num_samples=20),
+            negative_samplers=PopularityBasedSamplerV2(max_id=100, max_num_samples=20, min_id=1),
+            logq_sampling_correction=True,
         ),
     )
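To run just this updated test locally, a standard pytest invocation with a keyword filter works:

pytest tests/unit/tf/outputs/test_contrastive.py -k sampled_softmax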
