Commit b6635d4

Resolve merge conflicts for docstring standardization
- Fixed conflicts in data modules (combined.py, triplet.py, hcs.py)
- Resolved classification.py parameter conflicts
- Updated transforms __init__.py imports
- Cleaned up _transforms.py duplicate classes
1 parent: cf23055

9 files changed: +435 −271 lines
viscy/data/cell_classification.py

Lines changed: 18 additions & 5 deletions
```diff
@@ -46,6 +46,7 @@ def __init__(
         transform: Callable | None,
         initial_yx_patch_size: tuple[int, int],
         return_indices: bool = False,
+        label_column: str = "infection_state",
     ):
         self.plate = plate
         self.z_range = z_range
@@ -65,6 +66,7 @@ def __init__(
             annotation["y"].between(*y_range, inclusive="neither")
             & annotation["x"].between(*x_range, inclusive="neither")
         ]
+        self.label_column = label_column

     def __len__(self):
         """Return the number of samples in the dataset."""
@@ -103,7 +105,7 @@ def __getitem__(
         img = (image - norm_meta["mean"]) / norm_meta["std"]
         if self.transform is not None:
             img = self.transform(img)
-        label = torch.tensor(row["infection_state"]).float()[None]
+        label = torch.tensor(row[self.label_column]).float()[None]
         if self.return_indices:
             return img, label, row[INDEX_COLUMNS].to_dict()
         else:
```
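
The `label_column` addition replaces the hard-coded `"infection_state"` lookup in `__getitem__`, so the same dataset class can train against any annotation column. A minimal sketch of the new lookup, assuming a pandas row with a numeric label (the `division_state` column name is hypothetical):

```python
import pandas as pd
import torch

# One annotation row; "division_state" stands in for any label column.
row = pd.Series({"fov_name": "/A/1/0", "division_state": 1})
label_column = "division_state"  # previously hard-coded to "infection_state"

# Same expression as in the diff: scalar -> float tensor of shape (1,)
label = torch.tensor(row[label_column]).float()[None]
print(label)  # tensor([1.])
```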
```diff
@@ -149,25 +151,27 @@ def __init__(
         val_fovs: list[str] | None,
         channel_name: str,
         z_range: tuple[int, int],
-        train_exlude_timepoints: list[int],
+        train_exclude_timepoints: list[int],
         train_transforms: list[Callable] | None,
         val_transforms: list[Callable] | None,
         initial_yx_patch_size: tuple[int, int],
         batch_size: int,
         num_workers: int,
+        label_column: str = "infection_state",
     ):
         super().__init__()
         self.image_path = image_path
         self.annotation_path = annotation_path
         self.val_fovs = val_fovs
         self.channel_name = channel_name
         self.z_range = z_range
-        self.train_exlude_timepoints = train_exlude_timepoints
+        self.train_exclude_timepoints = train_exclude_timepoints
         self.train_transform = Compose(train_transforms)
         self.val_transform = Compose(val_transforms)
         self.initial_yx_patch_size = initial_yx_patch_size
         self.batch_size = batch_size
         self.num_workers = num_workers
+        self.label_column = label_column

     def _subset(
         self,
@@ -189,6 +193,7 @@ def _subset(
             transform=transform,
             initial_yx_patch_size=self.initial_yx_patch_size,
             return_indices=return_indices,
+            label_column=self.label_column,
         )

     def setup(self, stage=None) -> None:
@@ -208,8 +213,16 @@ def setup(self, stage=None) -> None:
             If stage is unknown.
         """
         plate = open_ome_zarr(self.image_path)
-        all_fovs = ["/" + name for (name, _) in plate.positions()]
         annotation = pd.read_csv(self.annotation_path)
+        all_fovs = [name for (name, _) in plate.positions()]
+        if annotation["fov_name"].iloc[0].startswith("/"):
+            all_fovs = ["/" + name for name in all_fovs]
+        if all_fovs[0].startswith("/"):
+            if not self.val_fovs[0].startswith("/"):
+                self.val_fovs = ["/" + name for name in self.val_fovs]
+        else:
+            if self.val_fovs[0].startswith("/"):
+                self.val_fovs = [name[1:] for name in self.val_fovs]
         for column in ("t", "y", "x"):
             annotation[column] = annotation[column].astype(int)
         if stage in (None, "fit", "validate"):
```
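
The rewritten `setup()` no longer assumes FOV names carry a leading slash; it mirrors whatever convention the annotation CSV uses and then normalizes `val_fovs` to match. A standalone sketch of that normalization with illustrative values (the helper function itself is not part of the commit):

```python
def normalize_fov_names(
    all_fovs: list[str], val_fovs: list[str], annotated_fov: str
) -> tuple[list[str], list[str]]:
    """Make plate FOVs and val FOVs follow the annotation's slash convention."""
    if annotated_fov.startswith("/"):
        all_fovs = ["/" + name for name in all_fovs]
    if all_fovs[0].startswith("/"):
        if not val_fovs[0].startswith("/"):
            val_fovs = ["/" + name for name in val_fovs]
    elif val_fovs[0].startswith("/"):
        val_fovs = [name[1:] for name in val_fovs]
    return all_fovs, val_fovs

print(normalize_fov_names(["A/1/0", "B/1/0"], ["B/1/0"], "/A/1/0"))
# (['/A/1/0', '/B/1/0'], ['/B/1/0'])
```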
```diff
@@ -219,7 +232,7 @@ def setup(self, stage=None) -> None:
             annotation,
             train_fovs,
             transform=self.train_transform,
-            exclude_timepoints=self.train_exlude_timepoints,
+            exclude_timepoints=self.train_exclude_timepoints,
         )
         self.val_dataset = self._subset(
             plate,
```

viscy/data/combined.py

Lines changed: 42 additions & 6 deletions
```diff
@@ -189,7 +189,7 @@ def _get_sample_indices(self, idx: int) -> tuple[int, int]:
         sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
         return dataset_idx, sample_idx

-    def __getitems__(self, indices: list[int]) -> list:
+    def __getitems__(self, indices: list[int]) -> list[dict[str, torch.Tensor]]:
         """Retrieve multiple items by indices with batched dataset access.

         Groups indices by source dataset and performs batched retrieval
@@ -202,19 +202,22 @@ def __getitems__(self, indices: list[int]) -> list:

         Returns
         -------
-        list
+        list[dict[str, torch.Tensor]]
             Samples from all requested indices, maintaining order.
         """
         grouped_indices = defaultdict(list)
         for idx in indices:
             dataset_idx, sample_indices = self._get_sample_indices(idx)
             grouped_indices[dataset_idx].append(sample_indices)
         _logger.debug(f"Grouped indices: {grouped_indices}")
-        sub_batches = []
+
+        micro_batches = []
         for dataset_idx, sample_indices in grouped_indices.items():
-            sub_batch = self.datasets[dataset_idx].__getitems__(sample_indices)
-            sub_batches.extend(sub_batch)
-        return sub_batches
+            micro_batch = self.datasets[dataset_idx].__getitems__(sample_indices)
+            micro_batch["_dataset_idx"] = dataset_idx
+            micro_batches.append(micro_batch)
+
+        return micro_batches


 class ConcatDataModule(LightningDataModule):
```
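
`__getitems__` now returns one already-collated micro-batch per source dataset, each tagged with a `_dataset_idx` key, instead of a flat list of samples. A toy illustration of the index grouping, assuming `_get_sample_indices` bisects over cumulative dataset sizes (the `dataset_idx == 0` branch is not visible in the hunk and is an assumption here):

```python
from bisect import bisect_right
from collections import defaultdict

cumulative_sizes = [10, 25, 40]  # e.g. three datasets of sizes 10, 15, 15

def get_sample_indices(idx: int) -> tuple[int, int]:
    """Map a global index to (dataset index, local sample index)."""
    dataset_idx = bisect_right(cumulative_sizes, idx)
    sample_idx = idx if dataset_idx == 0 else idx - cumulative_sizes[dataset_idx - 1]
    return dataset_idx, sample_idx

grouped_indices = defaultdict(list)
for idx in [3, 12, 14, 30]:
    dataset_idx, sample_idx = get_sample_indices(idx)
    grouped_indices[dataset_idx].append(sample_idx)
print(dict(grouped_indices))  # {0: [3], 1: [2, 4], 2: [5]}
```

Each group is then fetched in one batched call per dataset, and the resulting micro-batch dict is tagged with its `_dataset_idx` before being returned.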
```diff
@@ -369,6 +372,7 @@ def train_dataloader(self) -> ThreadDataLoader:
             batch_size=self.batch_size,
             shuffle=True,
             drop_last=True,
+            collate_fn=lambda x: x,
             **self._dataloader_kwargs(),
         )
```
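
The identity `collate_fn` is what keeps those micro-batches intact: `__getitems__` already returns collated dicts, so the loader must pass the list through rather than let `default_collate` try to re-stack it. A self-contained sketch with a plain PyTorch `DataLoader` (which MONAI's `ThreadDataLoader` subclasses); it assumes a PyTorch version whose map-style fetcher dispatches batched reads to `__getitems__`:

```python
from torch.utils.data import DataLoader, Dataset

class ToyConcat(Dataset):
    """Stands in for the concatenated dataset; returns micro-batch dicts."""

    def __len__(self) -> int:
        return 4

    def __getitem__(self, idx: int) -> int:
        return idx  # unused when __getitems__ is defined

    def __getitems__(self, indices: list[int]) -> list[dict]:
        return [{"index": indices, "_dataset_idx": 0}]

loader = DataLoader(ToyConcat(), batch_size=2, collate_fn=lambda x: x)
print(next(iter(loader)))  # [{'index': [0, 1], '_dataset_idx': 0}]
```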

```diff
@@ -387,9 +391,41 @@ def val_dataloader(self) -> ThreadDataLoader:
             batch_size=self.batch_size,
             shuffle=False,
             drop_last=False,
+            collate_fn=lambda x: x,
             **self._dataloader_kwargs(),
         )

+    def on_after_batch_transfer(self, batch, dataloader_idx: int):
+        """Apply GPU transforms from constituent data modules to micro-batches."""
+        processed_micro_batches = []
+        for micro_batch in batch:
+            dataset_idx = micro_batch.pop("_dataset_idx")
+            dm = self.data_modules[dataset_idx]
+            if hasattr(dm, "on_after_batch_transfer"):
+                processed_micro_batch = dm.on_after_batch_transfer(
+                    micro_batch, dataloader_idx
+                )
+            else:
+                processed_micro_batch = micro_batch
+            processed_micro_batches.append(processed_micro_batch)
+        combined_batch = {}
+        for key in processed_micro_batches[0].keys():
+            if isinstance(processed_micro_batches[0][key], list):
+                combined_batch[key] = []
+                for micro_batch in processed_micro_batches:
+                    if key in micro_batch:
+                        combined_batch[key].extend(micro_batch[key])
+            else:
+                tensors_to_concat = [
+                    micro_batch[key]
+                    for micro_batch in processed_micro_batches
+                    if key in micro_batch
+                ]
+                if tensors_to_concat:
+                    combined_batch[key] = torch.cat(tensors_to_concat, dim=0)
+
+        return combined_batch
+

 class CachedConcatDataModule(LightningDataModule):
     """Cached concatenated data module for distributed training.
```

viscy/data/hcs.py

Lines changed: 12 additions & 10 deletions
```diff
@@ -735,22 +735,24 @@ def _fit_transform(self) -> tuple[Compose, Compose]:
             Training and validation transform compositions
         """
         # TODO: These have a fixed order for now... ()
-        final_crop = [
-            CenterSpatialCropd(
-                keys=self.source_channel + self.target_channel,
-                roi_size=(
-                    self.z_window_size,
-                    self.yx_patch_size[0],
-                    self.yx_patch_size[1],
-                ),
-            )
-        ]
+        final_crop = [self._final_crop()]
         train_transform = Compose(
             self.normalizations + self._train_transform() + final_crop
         )
         val_transform = Compose(self.normalizations + final_crop)
         return train_transform, val_transform

+    def _final_crop(self) -> CenterSpatialCropd:
+        """Setup final cropping: center crop to the target size."""
+        return CenterSpatialCropd(
+            keys=self.source_channel + self.target_channel,
+            roi_size=(
+                self.z_window_size,
+                self.yx_patch_size[0],
+                self.yx_patch_size[1],
+            ),
+        )
+
     def _train_transform(self) -> list[Callable]:
         """Set up training augmentations.
```
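
This is a behavior-preserving refactor: the center crop is extracted into a `_final_crop()` helper shared by the train and validation compositions. A minimal sketch of the same MONAI transform applied to a toy sample (channel name and sizes are illustrative):

```python
import torch
from monai.transforms import CenterSpatialCropd

# Center-crop the (Z, Y, X) dims of every listed channel to the target size.
crop = CenterSpatialCropd(keys=["Phase3D"], roi_size=(5, 256, 256))
sample = {"Phase3D": torch.zeros(1, 9, 512, 512)}  # (C, Z, Y, X)
print(crop(sample)["Phase3D"].shape)  # torch.Size([1, 5, 256, 256])
```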
