Improve sampling API

stes · stes · commit 07212f2e8f52 · 2025-08-02T16:15:29.000+02:00
diff --git a/cebra/data/base.py b/cebra/data/base.py
@@ -22,6 +22,7 @@
 """Base classes for datasets and loaders."""
 
 import abc
+from typing import Iterator
 
 import literate_dataclasses as dataclasses
 import torch
@@ -254,19 +255,25 @@ def __post_init__(self):
             raise ValueError(
                 f"Batch size has to be None, or a non-negative value. Got {self.batch_size}."
             )
+        if self.num_negatives is not None and self.num_negatives <= 0:
+            raise ValueError(
+                f"Number of negatives has to be None, or a positive value. Got {self.num_negatives}."
+            )
+
+        if self.num_negatives is None:
+            self.num_negatives = self.batch_size
 
     def __len__(self):
         """The number of batches returned when calling as an iterator."""
         return self.num_steps
 
-    def __iter__(self) -> Batch:
+    def __iter__(self) -> Iterator[Batch]:
         for _ in range(len(self)):
-            index = self.get_indices(num_samples=self.batch_size,
-                                     num_negatives=self.num_negatives)
+            index = self.get_indices()
             yield self.dataset.load_batch(index)
 
     @abc.abstractmethod
-    def get_indices(self, num_samples: int, num_negatives: int = None):
+    def get_indices(self):
         """Sample and return the specified number of indices.
 
         The elements of the returned `BatchIndex` will be used to index the
diff --git a/cebra/data/multi_session.py b/cebra/data/multi_session.py
@@ -160,9 +160,7 @@ def __post_init__(self):
 
     # NOTE(stes): In the longer run, we need to unify the API here; the num_samples argument
     # is not used in the multi-session case, which is different to the single session samples.
-    def get_indices(self,
-                    num_samples: int,
-                    num_negatives: int = None) -> List[BatchIndex]:
+    def get_indices(self) -> List[BatchIndex]:
         ref_idx = self.sampler.sample_prior(self.batch_size)
         neg_idx = self.sampler.sample_prior(self.num_negatives)
         pos_idx, idx, idx_rev = self.sampler.sample_conditional(ref_idx)
@@ -238,9 +236,14 @@ def __post_init__(self):
         self.sampler = cebra.distributions.UnifiedSampler(
             self.dataset, self.time_offset)
 
-    def get_indices(self,
-                    num_samples: int,
-                    num_negatives: int = None) -> BatchIndex:
+        if self.batch_size < 2:
+            raise ValueError("UnifiedLoader does not support batch_size < 2.")
+
+        if self.num_negatives < 2:
+            raise ValueError(
+                "UnifiedLoader does not support num_negatives < 2.")
+
+    def get_indices(self) -> BatchIndex:
         """Sample and return the specified number of indices.
 
         The elements of the returned ``BatchIndex`` will be used to index the
diff --git a/cebra/data/multiobjective.py b/cebra/data/multiobjective.py
@@ -20,10 +20,13 @@
 # limitations under the License.
 #
 
+from typing import Iterator
+
 import literate_dataclasses as dataclasses
 
 import cebra.data as cebra_data
 import cebra.distributions
+from cebra.data.datatypes import Batch
 from cebra.data.datatypes import BatchIndex
 from cebra.distributions.continuous import Prior
 
@@ -71,9 +74,9 @@ def __post_init__(self):
     def add_config(self, config):
         self.labels.append(config['label'])
 
-    def get_indices(self, num_samples: int, num_negatives: int = None):
+    def get_indices(self) -> BatchIndex:
         if self.sampling_mode_supervised == "ref_shared":
-            reference_idx = self.prior.sample_prior(num_samples)
+            reference_idx = self.prior.sample_prior(self.batch_size)
         else:
             raise ValueError(
                 f"Sampling mode {self.sampling_mode_supervised} is not implemented."
@@ -87,9 +90,9 @@ def get_indices(self, num_samples: int, num_negatives: int = None):
 
         return batch_index
 
-    def __iter__(self):
+    def __iter__(self) -> Iterator[Batch]:
         for _ in range(len(self)):
-            index = self.get_indices(num_samples=self.batch_size)
+            index = self.get_indices()
             yield self.dataset.load_batch_supervised(index, self.labels)
 
 
@@ -142,16 +145,14 @@ def add_config(self, config):
 
         self.distributions.append(distribution)
 
-    def get_indices(self, num_samples: int, num_negatives: int = None):
+    def get_indices(self) -> BatchIndex:
         """Sample and return the specified number of indices."""
 
-        if num_negatives is None:
-            num_negatives = num_samples
-
         if self.sampling_mode_contrastive == "refneg_shared":
-            ref_and_neg = self.prior.sample_prior(num_samples + num_negatives)
-            reference_idx = ref_and_neg[:num_samples]
-            negative_idx = ref_and_neg[num_samples:]
+            ref_and_neg = self.prior.sample_prior(self.batch_size +
+                                                  self.num_negatives)
+            reference_idx = ref_and_neg[:self.batch_size]
+            negative_idx = ref_and_neg[self.batch_size:]
 
             positives_idx = []
             for distribution in self.distributions:
@@ -172,6 +173,5 @@ def get_indices(self, num_samples: int, num_negatives: int = None):
 
     def __iter__(self):
         for _ in range(len(self)):
-            index = self.get_indices(num_samples=self.batch_size,
-                                     num_negatives=self.num_negatives)
+            index = self.get_indices()
             yield self.dataset.load_batch_contrastive(index)
diff --git a/cebra/data/single_session.py b/cebra/data/single_session.py
@@ -27,6 +27,7 @@
 
 import abc
 import warnings
+from typing import Iterator
 
 import literate_dataclasses as dataclasses
 import torch
@@ -138,9 +139,7 @@ def _init_distribution(self):
                 f"Invalid choice of prior distribution. Got '{self.prior}', but "
                 f"only accept 'uniform' or 'empirical' as potential values.")
 
-    def get_indices(self,
-                    num_samples: int,
-                    num_negatives: int = None) -> BatchIndex:
+    def get_indices(self) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference samples will be sampled from the empirical or uniform prior
@@ -161,13 +160,10 @@ def get_indices(self,
         Returns:
             Indices for reference, positive and negatives samples.
         """
-        if num_negatives is None:
-            num_negatives = num_samples
-
-        reference_idx = self.distribution.sample_prior(num_samples +
-                                                       num_negatives)
-        negative_idx = reference_idx[num_samples:]
-        reference_idx = reference_idx[:num_samples]
+        reference_idx = self.distribution.sample_prior(self.batch_size +
+                                                       self.num_negatives)
+        negative_idx = reference_idx[self.batch_size:]
+        reference_idx = reference_idx[:self.batch_size]
         reference = self.index[reference_idx]
         positive_idx = self.distribution.sample_conditional(reference)
         return BatchIndex(reference=reference_idx,
@@ -253,9 +249,7 @@ def _init_distribution(self):
             else:
                 raise ValueError(self.conditional)
 
-    def get_indices(self,
-                    num_samples: int,
-                    num_negatives: int = None) -> BatchIndex:
+    def get_indices(self) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference and negative samples will be sampled uniformly from
@@ -271,13 +265,10 @@ def get_indices(self,
         Returns:
             Indices for reference, positive and negatives samples.
         """
-        if num_negatives is None:
-            num_negatives = num_samples
-
-        reference_idx = self.distribution.sample_prior(num_samples +
-                                                       num_negatives)
-        negative_idx = reference_idx[num_samples:]
-        reference_idx = reference_idx[:num_samples]
+        reference_idx = self.distribution.sample_prior(self.batch_size +
+                                                       self.num_negatives)
+        negative_idx = reference_idx[self.batch_size:]
+        reference_idx = reference_idx[:self.batch_size]
         positive_idx = self.distribution.sample_conditional(reference_idx)
         return BatchIndex(reference=reference_idx,
                           positive=positive_idx,
@@ -318,9 +309,7 @@ def __post_init__(self):
             continuous=self.cindex,
             time_delta=self.time_offset)
 
-    def get_indices(self,
-                    num_samples: int,
-                    num_negatives: int = None) -> BatchIndex:
+    def get_indices(self) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference and negative samples will be sampled uniformly from
@@ -344,13 +333,10 @@ def get_indices(self,
               class.
             - Sample the negatives with matching discrete variable
         """
-        if num_negatives is None:
-            num_negatives = num_samples
-
-        reference_idx = self.distribution.sample_prior(num_samples +
-                                                       num_negatives)
-        negative_idx = reference_idx[num_samples:]
-        reference_idx = reference_idx[:num_samples]
+        reference_idx = self.distribution.sample_prior(self.batch_size +
+                                                       self.num_negatives)
+        negative_idx = reference_idx[self.batch_size:]
+        reference_idx = reference_idx[:self.batch_size]
         return BatchIndex(
             reference=reference_idx,
             negative=negative_idx,
@@ -443,9 +429,7 @@ def _init_time_distribution(self):
         else:
             raise ValueError
 
-    def get_indices(self,
-                    num_samples: int,
-                    num_negatives: int = None) -> BatchIndex:
+    def get_indices(self) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference and negative samples will be sampled uniformly from
@@ -469,13 +453,10 @@ def get_indices(self,
             Add the ``empirical`` vs. ``discrete`` sampling modes to this
             class.
         """
-        if num_negatives is None:
-            num_negatives = num_samples
-
-        reference_idx = self.time_distribution.sample_prior(num_samples +
-                                                            num_negatives)
-        negative_idx = reference_idx[num_samples:]
-        reference_idx = reference_idx[:num_samples]
+        reference_idx = self.time_distribution.sample_prior(self.batch_size +
+                                                            self.num_negatives)
+        negative_idx = reference_idx[self.batch_size:]
+        reference_idx = reference_idx[:self.batch_size]
         behavior_positive_idx = self.behavior_distribution.sample_conditional(
             reference_idx)
         time_positive_idx = self.time_distribution.sample_conditional(
@@ -493,13 +474,18 @@ class FullDataLoader(ContinuousDataLoader):
 
     def __post_init__(self):
         super().__post_init__()
-        self.batch_size = None
+
+        if self.batch_size is not None:
+            raise ValueError("Batch size cannot be set for FullDataLoader.")
+        if self.num_negatives is not None:
+            raise ValueError(
+                "Number of negatives cannot be set for FullDataLoader.")
 
     @property
     def offset(self):
         return self.dataset.offset
 
-    def get_indices(self, num_samples=None, num_negatives=None) -> BatchIndex:
+    def get_indices(self) -> BatchIndex:
         """Samples indices for reference, positive and negative examples.
 
         The reference indices are all available (valid, according to the
@@ -519,8 +505,6 @@ def get_indices(self, num_samples=None, num_negatives=None) -> BatchIndex:
             Add the ``empirical`` vs. ``discrete`` sampling modes to this
             class.
         """
-        assert num_samples is None
-        assert num_negatives is None
 
         reference_idx = torch.arange(
             self.offset.left,
@@ -534,7 +518,6 @@ def get_indices(self, num_samples=None, num_negatives=None) -> BatchIndex:
                                      positive=positive_idx,
                                      negative=negative_idx)
 
-    def __iter__(self):
+    def __iter__(self) -> Iterator[BatchIndex]:
         for _ in range(len(self)):
-            index = self.get_indices(num_samples=self.batch_size)
-            yield index
+            yield self.get_indices()
diff --git a/tests/test_loader.py b/tests/test_loader.py