Add support to sample more negatives

stes · stes · commit 723bcfbafae1 · 2025-08-02T15:29:18.000+02:00
diff --git a/cebra/data/base.py b/cebra/data/base.py
@@ -239,6 +239,12 @@ class Loader(abc.ABC, cebra.io.HasDevice):
     batch_size: int = dataclasses.field(default=None,
                                         doc="""The total batch size.""")
 
+    num_negatives: int = dataclasses.field(
+        default=None,
+        doc="""The number of negative samples to draw for each reference.
+                                           If not specified, the batch size is used."""
+    )
+
     def __post_init__(self):
         if self.num_steps is None or self.num_steps <= 0:
             raise ValueError(
@@ -255,11 +261,12 @@ def __len__(self):
 
     def __iter__(self) -> Batch:
         for _ in range(len(self)):
-            index = self.get_indices(num_samples=self.batch_size)
+            index = self.get_indices(num_samples=self.batch_size,
+                                     num_negatives=self.num_negatives)
             yield self.dataset.load_batch(index)
 
     @abc.abstractmethod
-    def get_indices(self, num_samples: int):
+    def get_indices(self, num_samples: int, num_negatives: int = None):
         """Sample and return the specified number of indices.
 
         The elements of the returned `BatchIndex` will be used to index the
@@ -271,5 +278,10 @@ def get_indices(self, num_samples: int):
 
         Returns:
             batch indices for the reference, positive and negative sample.
+
+
+        Note:
+            The ``num_negatives`` parameter was added in version 0.7.0 to allow
+            sampling a number of negatives that differs from the batch size.
         """
         raise NotImplementedError()
diff --git a/cebra/data/multi_session.py b/cebra/data/multi_session.py
@@ -155,10 +155,14 @@ def __post_init__(self):
         super().__post_init__()
         self.sampler = cebra.distributions.MultisessionSampler(
             self.dataset, self.time_offset)
+        if self.num_negatives is None:
+            self.num_negatives = self.batch_size
 
-    def get_indices(self, num_samples: int) -> List[BatchIndex]:
+    # NOTE(stes): In the long run, we need to unify the API here; the num_samples argument
+    # is not used in the multi-session case, which differs from the single-session case.
+    def get_indices(self, num_samples) -> List[BatchIndex]:
         ref_idx = self.sampler.sample_prior(self.batch_size)
-        neg_idx = self.sampler.sample_prior(self.batch_size)
+        neg_idx = self.sampler.sample_prior(self.num_negatives)
         pos_idx, idx, idx_rev = self.sampler.sample_conditional(ref_idx)
 
         ref_idx = torch.from_numpy(ref_idx)
@@ -251,7 +255,7 @@ def get_indices(self, num_samples: int) -> BatchIndex:
             Batch indices for the reference, positive and negative samples.
         """
         ref_idx = self.sampler.sample_prior(self.batch_size)
-        neg_idx = self.sampler.sample_prior(self.batch_size)
+        neg_idx = self.sampler.sample_prior(self.num_negatives)
 
         pos_idx = self.sampler.sample_conditional(ref_idx)
 
diff --git a/cebra/data/multiobjective.py b/cebra/data/multiobjective.py
@@ -71,7 +71,7 @@ def __post_init__(self):
     def add_config(self, config):
         self.labels.append(config['label'])
 
-    def get_indices(self, num_samples: int):
+    def get_indices(self, num_samples: int, num_negatives: int = None):
         if self.sampling_mode_supervised == "ref_shared":
             reference_idx = self.prior.sample_prior(num_samples)
         else:
@@ -142,11 +142,14 @@ def add_config(self, config):
 
         self.distributions.append(distribution)
 
-    def get_indices(self, num_samples: int):
+    def get_indices(self, num_samples: int, num_negatives: int = None):
         """Sample and return the specified number of indices."""
 
+        if num_negatives is None:
+            num_negatives = num_samples
+
         if self.sampling_mode_contrastive == "refneg_shared":
-            ref_and_neg = self.prior.sample_prior(num_samples * 2)
+            ref_and_neg = self.prior.sample_prior(num_samples + num_negatives)
             reference_idx = ref_and_neg[:num_samples]
             negative_idx = ref_and_neg[num_samples:]
 
@@ -169,5 +172,6 @@ def get_indices(self, num_samples: int):
 
     def __iter__(self):
         for _ in range(len(self)):
-            index = self.get_indices(num_samples=self.batch_size)
+            index = self.get_indices(num_samples=self.batch_size,
+                                     num_negatives=self.num_negatives)
             yield self.dataset.load_batch_contrastive(index)
diff --git a/cebra/data/single_session.py b/cebra/data/single_session.py
@@ -138,7 +138,9 @@ def _init_distribution(self):
                 f"Invalid choice of prior distribution. Got '{self.prior}', but "
                 f"only accept 'uniform' or 'empirical' as potential values.")
 
-    def get_indices(self, num_samples: int) -> BatchIndex:
+    def get_indices(self,
+                    num_samples: int,
+                    num_negatives: int = None) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference samples will be sampled from the empirical or uniform prior
@@ -154,11 +156,16 @@ def get_indices(self, num_samples: int) -> BatchIndex:
         Args:
             num_samples: The number of samples (batch size) of the returned
                 :py:class:`cebra.data.datatypes.BatchIndex`.
+            num_negatives: The number of negative samples. If None, defaults to num_samples.
 
         Returns:
             Indices for reference, positive and negatives samples.
         """
-        reference_idx = self.distribution.sample_prior(num_samples * 2)
+        if num_negatives is None:
+            num_negatives = num_samples
+
+        reference_idx = self.distribution.sample_prior(num_samples +
+                                                       num_negatives)
         negative_idx = reference_idx[num_samples:]
         reference_idx = reference_idx[:num_samples]
         reference = self.index[reference_idx]
@@ -246,7 +253,9 @@ def _init_distribution(self):
             else:
                 raise ValueError(self.conditional)
 
-    def get_indices(self, num_samples: int) -> BatchIndex:
+    def get_indices(self,
+                    num_samples: int,
+                    num_negatives: int = None) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference and negative samples will be sampled uniformly from
@@ -262,7 +271,11 @@ def get_indices(self, num_samples: int) -> BatchIndex:
         Returns:
             Indices for reference, positive and negatives samples.
         """
-        reference_idx = self.distribution.sample_prior(num_samples * 2)
+        if num_negatives is None:
+            num_negatives = num_samples
+
+        reference_idx = self.distribution.sample_prior(num_samples +
+                                                       num_negatives)
         negative_idx = reference_idx[num_samples:]
         reference_idx = reference_idx[:num_samples]
         positive_idx = self.distribution.sample_conditional(reference_idx)
@@ -305,7 +318,9 @@ def __post_init__(self):
             continuous=self.cindex,
             time_delta=self.time_offset)
 
-    def get_indices(self, num_samples: int) -> BatchIndex:
+    def get_indices(self,
+                    num_samples: int,
+                    num_negatives: int = None) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference and negative samples will be sampled uniformly from
@@ -319,6 +334,7 @@ def get_indices(self, num_samples: int) -> BatchIndex:
         Args:
             num_samples: The number of samples (batch size) of the returned
                 :py:class:`cebra.data.datatypes.BatchIndex`.
+            num_negatives: The number of negative samples. If None, defaults to num_samples.
 
         Returns:
             Indices for reference, positive and negatives samples.
@@ -328,10 +344,16 @@ def get_indices(self, num_samples: int) -> BatchIndex:
               class.
             - Sample the negatives with matching discrete variable
         """
-        reference_idx = self.distribution.sample_prior(num_samples)
+        if num_negatives is None:
+            num_negatives = num_samples
+
+        reference_idx = self.distribution.sample_prior(num_samples +
+                                                       num_negatives)
+        negative_idx = reference_idx[num_samples:]
+        reference_idx = reference_idx[:num_samples]
         return BatchIndex(
             reference=reference_idx,
-            negative=self.distribution.sample_prior(num_samples),
+            negative=negative_idx,
             positive=self.distribution.sample_conditional(reference_idx),
         )
 
@@ -421,11 +443,13 @@ def _init_time_distribution(self):
         else:
             raise ValueError
 
-    def get_indices(self, num_samples: int) -> BatchIndex:
+    def get_indices(self,
+                    num_samples: int,
+                    num_negatives: int = None) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference and negative samples will be sampled uniformly from
-        all available time steps, and a total of ``2*num_samples`` will be
+        all available time steps, and a total of ``num_samples + num_negatives`` will be
         returned for both.
 
         For the positive samples, ``num_samples`` are sampled according to the
@@ -436,6 +460,7 @@ def get_indices(self, num_samples: int) -> BatchIndex:
         Args:
             num_samples: The number of samples (batch size) of the returned
                 :py:class:`cebra.data.datatypes.BatchIndex`.
+            num_negatives: The number of negative samples. If None, defaults to num_samples.
 
         Returns:
             Indices for reference, positive and negatives samples.
@@ -444,7 +469,11 @@ def get_indices(self, num_samples: int) -> BatchIndex:
             Add the ``empirical`` vs. ``discrete`` sampling modes to this
             class.
         """
-        reference_idx = self.time_distribution.sample_prior(num_samples * 2)
+        if num_negatives is None:
+            num_negatives = num_samples
+
+        reference_idx = self.time_distribution.sample_prior(num_samples +
+                                                            num_negatives)
         negative_idx = reference_idx[num_samples:]
         reference_idx = reference_idx[:num_samples]
         behavior_positive_idx = self.behavior_distribution.sample_conditional(
@@ -470,7 +499,7 @@ def __post_init__(self):
     def offset(self):
         return self.dataset.offset
 
-    def get_indices(self, num_samples=None) -> BatchIndex:
+    def get_indices(self, num_samples=None, num_negatives=None) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference indices are all available (valid, according to the
@@ -491,6 +520,7 @@ def get_indices(self, num_samples=None) -> BatchIndex:
             class.
         """
         assert num_samples is None
+        assert num_negatives is None
 
         reference_idx = torch.arange(
             self.offset.left,