Commit 1d0c498

Implement review comments

1 parent 65fc455 commit 1d0c498

10 files changed: +240 −243 lines

cebra/data/base.py

Lines changed: 2 additions & 3 deletions

@@ -193,17 +193,16 @@ def load_batch(self, index: BatchIndex) -> Batch:
         """
         raise NotImplementedError()

-    @abc.abstractmethod
     def configure_for(self, model: "cebra.models.Model"):
         """Configure the dataset offset for the provided model.

         Call this function before indexing the dataset. This sets the
-        :py:attr:`offset` attribute of the dataset.
+        ``offset`` attribute of the dataset.

         Args:
             model: The model to configure the dataset for.
         """
-        raise NotImplementedError
+        self.offset = model.get_offset()


 @dataclasses.dataclass
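
With this change, `configure_for` has a concrete default implementation on the base dataset class instead of being abstract, so subclasses (see cebra/data/single_session.py below) no longer need their own copy. Below is a minimal usage sketch, not taken from the diff; the toy data, model name, and sizes are placeholders chosen for illustration:

    import numpy as np
    import cebra

    # Hypothetical toy data: 1000 samples, 30 neurons, 2 continuous labels.
    neural = np.random.uniform(size=(1000, 30)).astype("float32")
    labels = np.random.uniform(size=(1000, 2)).astype("float32")

    dataset = cebra.data.TensorDataset(neural, continuous=labels)
    model = cebra.models.init("offset10-model",
                              num_neurons=30,
                              num_units=32,
                              num_output=8)

    # After this commit the base class implements this call as
    # `self.offset = model.get_offset()` instead of raising NotImplementedError.
    dataset.configure_for(model)
    print(dataset.offset)  # the model's receptive-field offset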

cebra/data/datasets.py

Lines changed: 1 addition & 1 deletion

@@ -353,7 +353,7 @@ def configure_for(self, model: "Model"):
         """Configure the dataset offset for the provided model.

         Call this function before indexing the dataset. This sets the
-        :py:attr:`offset` attribute of the dataset.
+        ``offset`` attribute of the dataset.

         Args:
             model: The model to configure the dataset for.

cebra/data/multi_session.py

Lines changed: 1 addition & 1 deletion

@@ -108,7 +108,7 @@ def configure_for(self, model: "cebra.models.Model"):
         """Configure the dataset offset for the provided model.

         Call this function before indexing the dataset. This sets the
-        :py:attr:`cebra_data.Dataset.offset` attribute of the dataset.
+        ``offset`` attribute of the dataset.

         Args:
             model: The model to configure the dataset for.

cebra/data/single_session.py

Lines changed: 0 additions & 11 deletions

@@ -69,17 +69,6 @@ def load_batch(self, index: BatchIndex) -> Batch:
             reference=self[index.reference],
         )

-    def configure_for(self, model: "cebra.models.Model"):
-        """Configure the dataset offset for the provided model.
-
-        Call this function before indexing the dataset. This sets the
-        :py:attr:`cebra_data.Dataset.offset` attribute of the dataset.
-
-        Args:
-            model: The model to configure the dataset for.
-        """
-        self.offset = model.get_offset()
-

 @dataclasses.dataclass
 class DiscreteDataLoader(cebra_data.Loader):

cebra/integrations/sklearn/cebra.py

Lines changed: 19 additions & 11 deletions

@@ -22,6 +22,7 @@
 """Define the CEBRA model."""

 import itertools
+import warnings
 from typing import (Callable, Dict, Iterable, List, Literal, Optional, Tuple,
                     Union)

@@ -129,7 +130,7 @@ def _init_loader(
         (not is_cont, not is_disc, is_multi),
     ]
     if any(all(combination) for combination in incompatible_combinations):
-        raise ValueError(f"Invalid index combination.\n"
+        raise ValueError("Invalid index combination.\n"
                          f"Continuous: {is_cont},\n"
                          f"Discrete: {is_disc},\n"
                          f"Hybrid training: {is_hybrid},\n"
@@ -293,7 +294,7 @@ def _require_arg(key):
         "single-session",
     )

-    error_message = (f"Invalid index combination.\n"
+    error_message = ("Invalid index combination.\n"
                      f"Continuous: {is_cont},\n"
                      f"Discrete: {is_disc},\n"
                      f"Hybrid training: {is_hybrid},\n"
@@ -340,7 +341,7 @@ def _load_cebra_with_sklearn_backend(cebra_info: Dict) -> "CEBRA":
    if missing_keys:
        raise ValueError(
            f"Missing keys in data dictionary: {', '.join(missing_keys)}. "
-            f"You can try loading the CEBRA model with the torch backend.")
+            "You can try loading the CEBRA model with the torch backend.")

    args, state, state_dict = cebra_info['args'], cebra_info[
        'state'], cebra_info['state_dict']
@@ -656,12 +657,12 @@ def _get_dataset_multi(X: List[Iterable], y: List[Iterable]):
        # TODO(celia): to make it work for multiple set of index. For now, y should be a tuple of one list only
        if isinstance(y, tuple) and len(y) > 1:
            raise NotImplementedError(
-                f"Support for multiple set of index is not implemented in multissesion training, "
+                "Support for multiple set of index is not implemented in multissesion training, "
                f"got {len(y)} sets of indexes.")

        if not _are_sessions_equal(X, y):
            raise ValueError(
-                f"Invalid number of sessions: number of sessions in X and y need to match, "
+                "Invalid number of sessions: number of sessions in X and y need to match, "
                f"got X:{len(X)} and y:{[len(y_i) for y_i in y]}.")

        for session in range(len(X)):
@@ -685,8 +686,8 @@ def _get_dataset_multi(X: List[Iterable], y: List[Iterable]):
    else:
        if not _are_sessions_equal(X, y):
            raise ValueError(
-                f"Invalid number of samples or labels sessions: provide one session for single-session training, "
-                f"and make sure the number of samples in X and y need match, "
+                "Invalid number of samples or labels sessions: provide one session for single-session training, "
+                "and make sure the number of samples in X and y need match, "
                f"got {len(X)} and {[len(y_i) for y_i in y]}.")
        is_multisession = False
        dataset = _get_dataset(X, y)
@@ -848,7 +849,7 @@ def _check_labels_types(self, y: tuple, session_id: Optional[int] = None):
        # Check that same number of index
        if len(self.label_types_) != n_idx:
            raise ValueError(
-                f"Number of index invalid: labels must have the same number of index as for fitting,"
+                "Number of index invalid: labels must have the same number of index as for fitting,"
                f"expects {len(self.label_types_)}, got {n_idx} idx.")

        for i in range(len(self.label_types_)):  # for each index
@@ -861,12 +862,12 @@ def _check_labels_types(self, y: tuple, session_id: Optional[int] = None):
                    > 1):  # is there more than one feature in the index
                if label_types_idx[1][1] != y[i].shape[1]:
                    raise ValueError(
-                        f"Labels invalid: must have the same number of features as the ones used for fitting,"
+                        "Labels invalid: must have the same number of features as the ones used for fitting,"
                        f"expects {label_types_idx[1]}, got {y[i].shape}.")

                if label_types_idx[0] != y[i].dtype:
                    raise ValueError(
-                        f"Labels invalid: must have the same type of features as the ones used for fitting,"
+                        "Labels invalid: must have the same type of features as the ones used for fitting,"
                        f"expects {label_types_idx[0]}, got {y[i].dtype}.")

    def _prepare_fit(
@@ -1254,7 +1255,8 @@ def transform(self,

        return output.detach().cpu().numpy()

-    #NOTE: Deprecated: transform is now handled in the solver but kept for testing.
+    #NOTE: Deprecated: transform is now handled in the solver but the original
+    # method is kept here for testing.
    def transform_deprecated(self,
                             X: Union[npt.NDArray, torch.Tensor],
                             session_id: Optional[int] = None) -> npt.NDArray:
@@ -1279,6 +1281,12 @@ def transform_deprecated(self,
        >>> embedding = cebra_model.transform(dataset)

        """
+        warnings.warn(
+            "The method `transform_deprecated` is deprecated "
+            "but kept for testing puroposes."
+            "We recommend using `transform` instead.",
+            DeprecationWarning,
+            stacklevel=2)

        sklearn_utils_validation.check_is_fitted(self, "n_features_")
        model, offset = self._select_model(X, session_id)
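
Because `transform_deprecated` now emits a `DeprecationWarning` with `stacklevel=2`, the warning is attributed to the caller's call site. A short sketch of how a test could assert this behaviour, using pytest, which the repository's test suite already relies on; `fitted_cebra_model` and `X` below are hypothetical placeholders, not objects defined in this commit:

    import pytest

    def test_transform_deprecated_warns(fitted_cebra_model, X):
        # `fitted_cebra_model` is assumed to be a fitted CEBRA estimator and
        # `X` matching input data; the warning message contains the method
        # name, so `match` finds it.
        with pytest.warns(DeprecationWarning, match="transform_deprecated"):
            fitted_cebra_model.transform_deprecated(X)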

cebra/solver/base.py

Lines changed: 16 additions & 9 deletions

@@ -84,7 +84,7 @@ def _check_indices(batch_start_idx: int, batch_end_idx: int,
        raise ValueError(
            f"The batch has length {batch_size_length} which "
            f"is smaller or equal than the required offset length {len(offset)}."
-            f"Either choose a model with smaller offset or the batch should contain more samples."
+            f"Either choose a model with smaller offset or the batch should contain 3 times more samples."
        )


@@ -127,7 +127,7 @@ def _get_batch(inputs: torch.Tensor, offset: Optional[cebra.data.Offset],
        inputs: Input data.
        offset: Model offset.
        batch_start_idx: Index of the first sample in the batch.
-        batch_end_idx: Index of the first sample in the batch.
+        batch_end_idx: Index of the last sample in the batch.
        pad_before_transform: If True zero-pad the batched data.

    Returns:
@@ -237,8 +237,8 @@ def __getitem__(self, idx):

    if len(index_dataloader) < 2:
        raise ValueError(
-            f"Number of batches must be greater than 1, you can use transform without batching instead, got {len(index_dataloader)}."
-        )
+            f"Number of batches must be greater than 1, you can use transform "
+            f"without batching instead, got {len(index_dataloader)}.")

    output = []
    for batch_idx, index_batch in enumerate(index_dataloader):
@@ -253,7 +253,11 @@ def __getitem__(self, idx):
        if batch_idx == (len(index_dataloader) - 1):
            # last batch, incomplete
            index_batch = torch.cat((last_batch, index_batch), dim=0)
+            assert index_batch[-1] + 1 == len(inputs), (
+                f"Last batch index {index_batch[-1]} + 1 should be equal to the length of inputs {len(inputs)}."
+            )

+        # Batch start and end so that `batch_size` size with the last batch including 2 batches
        batch_start_idx, batch_end_idx = index_batch[0], index_batch[-1] + 1
        batched_data = _get_batch(inputs=inputs,
                                  offset=offset,
@@ -264,7 +268,7 @@ def __getitem__(self, idx):
        output_batch = _inference_transform(model, batched_data)
        output.append(output_batch)

-    output = torch.cat(output)
+    output = torch.cat(output, dim=0)
    return output


@@ -608,7 +612,7 @@ def transform(self,
        of the given model, after switching it into eval mode.

        Args:
-            inputs: The input signal
+            inputs: The input signal (T, N).
            pad_before_transform: If ``False``, no padding is applied to the input
                sequence and the output sequence will be smaller than the input
                sequence due to the receptive field of the model. If the
@@ -635,11 +639,14 @@ def transform(self,

        model, offset = self._select_model(inputs, session_id)

-        if len(offset) < 2 and pad_before_transform:
-            pad_before_transform = False
+        #if len(offset) < 2 and pad_before_transform:
+        #    pad_before_transform = False

        model.eval()
-        if batch_size is not None and inputs.shape[0] > int(batch_size * 2):
+        if batch_size is not None and inputs.shape[0] > int(
+                batch_size * 2) and not isinstance(
+                    self.model, cebra.models.ResampleModelMixin):
+            # NOTE: resampling models are not supported for batched inference.
            output = _batched_transform(
                model=model,
                inputs=inputs,
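
The batched path splits the input into `batch_size`-sized index chunks and merges the final, incomplete chunk into the preceding one, so the last processed batch can span close to two chunk lengths (hence the added assert that the merged batch ends exactly at `len(inputs)`). The standalone sketch below illustrates that index arithmetic; it is an approximation of the idea, not the solver's actual code:

    def batch_bounds(n_samples: int, batch_size: int):
        """Return (start, end) pairs covering n_samples; an incomplete final
        chunk is folded into the previous one instead of standing alone."""
        starts = list(range(0, n_samples, batch_size))
        ends = [min(s + batch_size, n_samples) for s in starts]
        if len(starts) > 1 and ends[-1] - starts[-1] < batch_size:
            starts.pop()      # drop the incomplete chunk's start ...
            ends.pop(-2)      # ... and let the previous chunk run to the end
        return list(zip(starts, ends))

    print(batch_bounds(n_samples=1050, batch_size=512))
    # [(0, 512), (512, 1050)]: the last batch absorbs the 26 leftover samples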

cebra/solver/multiobjective.py

Lines changed: 20 additions & 1 deletion

@@ -155,7 +155,7 @@ def finalize(self):
        if len(set(self.feature_ranges_tuple)) != len(
                self.feature_ranges_tuple):
            raise RuntimeError(
-                f"Feature ranges are not unique. Please check again and remove the duplicates. "
+                "Feature ranges are not unique. Please check again and remove the duplicates. "
                f"Feature ranges: {self.feature_ranges_tuple}")

        print("Creating MultiCriterion")
@@ -456,8 +456,27 @@ def validation(
        self.log.setdefault(("sum_loss_val",), []).append(sum_loss_valid)
        return stats_val

+    # NOTE: Deprecated: batched transform can now be performed (more memory efficient)
+    # using the transform method of the model, and handling padding is implemented
+    # directly in the base Solver. This method is kept for testing purposes.
    @torch.no_grad()
    def transform_deprecated(self, inputs: torch.Tensor) -> torch.Tensor:
+        """Transform the input data using the model.
+
+        Args:
+            inputs: The input data to transform.
+
+        Returns:
+            The transformed data.
+        """
+
+        warnings.warn(
+            "The method `transform_deprecated` is deprecated "
+            "but kept for testing puroposes."
+            "We recommend using `transform` instead.",
+            DeprecationWarning,
+            stacklevel=2)
+
        offset = self.model.get_offset()
        self.model.eval()
        X = inputs.cpu().numpy()

tests/test_integration_xcebra.py

Lines changed: 23 additions & 4 deletions

@@ -158,13 +158,32 @@ def test_synthetic_data_training(synthetic_data, device):
    assert transform_embedding.shape[
        1] == n_latents, "Incorrect embedding dimension"
    assert not torch.isnan(transform_embedding).any(), "NaN values in embedding"
-    assert np.allclose(embedding, transform_embedding, rtol=1e-02)
+    assert np.allclose(embedding, transform_embedding, rtol=1e-4, atol=1e-4)

    # Test the transform with batching
    batched_embedding = solver.transform(data.neural.to(device), batch_size=512)
    assert batched_embedding.shape[
        1] == n_latents, "Incorrect embedding dimension"
    assert not torch.isnan(batched_embedding).any(), "NaN values in embedding"
-    assert np.allclose(embedding, batched_embedding, rtol=1e-02)
-
-    assert np.allclose(transform_embedding, batched_embedding, rtol=1e-02)
+    assert np.allclose(embedding, batched_embedding, rtol=1e-4, atol=1e-4)
+
+    assert np.allclose(transform_embedding,
+                       batched_embedding,
+                       rtol=1e-4,
+                       atol=1e-4)
+
+    # Test and compare the previous transform (transform_deprecated)
+    deprecated_transform_embedding = solver.transform_deprecated(
+        data.neural.to(device))
+    assert np.allclose(embedding,
+                       deprecated_transform_embedding,
+                       rtol=1e-4,
+                       atol=1e-4)
+    assert np.allclose(transform_embedding,
+                       deprecated_transform_embedding,
+                       rtol=1e-4,
+                       atol=1e-4)
+    assert np.allclose(batched_embedding,
+                       deprecated_transform_embedding,
+                       rtol=1e-4,
+                       atol=1e-4)
