
Commit 6e77940

Add support for outputs smaller than inputs in aggregator (#1394)
1 parent 2785cd3 commit 6e77940

4 files changed: +63 −11 lines


src/torchio/data/inference/aggregator.py

Lines changed: 21 additions & 6 deletions
```diff
@@ -24,6 +24,12 @@ class GridAggregator:
             in the overlapping areas will be weighted with a Hann window
             function. See the `grid aggregator tests`_ for a raw visualization
             of the three modes.
+        downsampling_factor: Factor by which the output volume is expected to
+            be smaller than the input volume in each spatial dimension. This is
+            useful when the model downsamples the input (e.g., with strided
+            convolutions or pooling layers). Currently, only a single integer
+            is supported, which applies the same downsampling factor to all
+            spatial dimensions.
 
     .. _grid aggregator tests: https://github.com/TorchIO-project/torchio/blob/main/tests/data/inference/test_aggregator.py
 
@@ -32,7 +38,12 @@ class GridAggregator:
         information about patch-based sampling.
     """
 
-    def __init__(self, sampler: GridSampler, overlap_mode: str = 'crop'):
+    def __init__(
+        self,
+        sampler: GridSampler,
+        overlap_mode: str = 'crop',
+        downsampling_factor: int = 1,  # TODO: support one per dimension
+    ):
         subject = sampler.subject
         self.volume_padded = sampler.padding_mode is not None
         self.spatial_shape = subject.spatial_shape
@@ -43,6 +54,9 @@ def __init__(self, sampler: GridSampler, overlap_mode: str = 'crop'):
         self.overlap_mode = overlap_mode
         self._avgmask_tensor: torch.Tensor | None = None
         self._hann_window: torch.Tensor | None = None
+        self._downsampling_factor = downsampling_factor
+        shape_array = np.array(subject.spatial_shape) // self._downsampling_factor
+        self.spatial_shape = tuple(shape_array.tolist())
 
     @staticmethod
     def _parse_overlap_mode(overlap_mode):
@@ -137,7 +151,7 @@ def add_batch(
         batch_tensor: torch.Tensor,
         locations: torch.Tensor,
     ) -> None:
-        """Add batch processed by a CNN to the output prediction volume.
+        """Add batch processed by a network to the output prediction volume.
 
         Args:
             batch_tensor: 5D tensor, typically the output of a convolutional
@@ -147,12 +161,13 @@ def add_batch(
                 extracted using ``batch[torchio.LOCATION]``.
         """
         batch = batch_tensor.cpu()
-        locations_array = locations.cpu().numpy()
-        patch_sizes = locations_array[:, 3:] - locations_array[:, :3]
+        locations_array = locations.cpu().numpy() // self._downsampling_factor
+        target_shapes = locations_array[:, 3:] - locations_array[:, :3]
         # There should be only one patch size
-        assert len(np.unique(patch_sizes, axis=0)) == 1
+        assert len(np.unique(target_shapes, axis=0)) == 1
         input_spatial_shape = tuple(batch.shape[-3:])
-        target_spatial_shape = tuple(patch_sizes[0])
+        target_spatial_shape_array = target_shapes[0]
+        target_spatial_shape = tuple(target_spatial_shape_array.tolist())
         if input_spatial_shape != target_spatial_shape:
             message = (
                 f'The shape of the input batch, {input_spatial_shape},'
```
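
For context, a minimal sketch of how the new argument slots into the usual patch-based inference loop. The stride-2 `Conv3d` stand-in model and all sizes are illustrative assumptions; the TorchIO calls mirror the test added at the bottom of this commit.

```python
import torch
import torchio as tio

# Stand-in for a network that halves each spatial dimension (an assumption
# for illustration; any model with a fixed integer stride works the same way).
model = torch.nn.Conv3d(in_channels=1, out_channels=8, kernel_size=2, stride=2)

subject = tio.Subject(img=tio.ScalarImage(tensor=torch.rand(1, 64, 64, 64)))
sampler = tio.data.GridSampler(subject, patch_size=32)
# Declare that the model output is half the size of its input
aggregator = tio.data.GridAggregator(sampler, downsampling_factor=2)
loader = tio.SubjectsLoader(sampler, batch_size=4)

with torch.no_grad():
    for batch in loader:
        outputs = model(batch['img'][tio.DATA])  # (B, 8, 16, 16, 16)
        aggregator.add_batch(outputs, batch[tio.LOCATION])

output = aggregator.get_output_tensor()  # torch.Size([8, 32, 32, 32])
```

Because the aggregator divides each patch location by the factor, patches that tile the input also tile the smaller output volume, so the loop itself needs no extra bookkeeping.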

src/torchio/datasets/ct_rate.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -319,14 +319,14 @@ def _instantiate_image(self, image_row: pd.Series) -> ScalarImage:
             image_row: A pandas Series representing a row from the metadata DataFrame,
                 containing information about a single image.
         """
-        image_dict = image_row.to_dict()
-        filename = image_dict[self._FILENAME_KEY]
+        image_dict: dict[str, str | dict[str, str]] = image_row.to_dict()  # type: ignore[assignment]
+        filename: str = image_dict[self._FILENAME_KEY]  # type: ignore[assignment]
         relative_image_path = self._get_image_path(
             filename,
             load_fixed=self._load_fixed,
         )
         image_path = self._root_dir / relative_image_path
-        report_dict = self._extract_report_dict(image_dict)
+        report_dict = self._extract_report_dict(image_dict)  # type: ignore[arg-type]
         image_dict[self._report_key] = report_dict
         image = ScalarImage(image_path, verify_path=self._verify_paths, **image_dict)
         return image
```
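
The changes in this file only affect static type checking: `Series.to_dict()` is presumably typed more loosely by the pandas stubs than the narrowed annotation used here, so the assignments and the `_extract_report_dict` call need explicit `# type: ignore` comments. Runtime behavior is unchanged.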

src/torchio/datasets/ixi.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -107,7 +107,7 @@ def _check_exists(root, modalities):
         return exists
 
     @staticmethod
-    def _get_subjects_list(root, modalities):
+    def _get_subjects_list(root: Path, modalities: Sequence[str]) -> list[Subject]:
         # The number of files for each modality is not the same
         # E.g. 581 for T1, 578 for T2
         # Let's just use the first modality as reference for now
@@ -134,7 +134,7 @@ def _get_subjects_list(root, modalities):
             skip_subject = False
             if skip_subject:
                 continue
-            subjects.append(Subject(**images_dict))
+            subjects.append(Subject(**images_dict))  # type: ignore[arg-type]
         return subjects
 
     def _download(self, root, modalities):
```

tests/data/inference/test_aggregator.py

Lines changed: 37 additions & 0 deletions
```diff
@@ -142,3 +142,40 @@ def test_bad_aggregator_shape(self):
         inference_batch = torch.stack(patches)
         with pytest.raises(RuntimeError):
             aggregator.add_batch(inference_batch, batch[tio.LOCATION])
+
+    def test_downsampling_model(self):
+        # This might be useful to compute image embeddings using a sliding window
+        downsampling_factor = 4  # e.g. patch size in a ViT
+        embedding_dim = 5
+        net_input_size = 20
+        image_size = 40
+
+        def network(x):
+            down = x[
+                ...,
+                ::downsampling_factor,
+                ::downsampling_factor,
+                ::downsampling_factor,
+            ]
+            embeddings = torch.cat(embedding_dim * [down], dim=1)
+            return embeddings
+
+        tensor = torch.ones(1, image_size, image_size, image_size)
+        image_name = 'img'
+        subject = tio.Subject({image_name: tio.ScalarImage(tensor=tensor)})
+        sampler = tio.data.GridSampler(
+            subject,
+            patch_size=net_input_size,
+        )
+        aggregator = tio.data.GridAggregator(
+            sampler,
+            downsampling_factor=downsampling_factor,
+        )
+        loader = tio.SubjectsLoader(sampler, batch_size=3)
+        for batch in loader:
+            input_batch = batch[image_name][tio.DATA]
+            embeddings = network(input_batch)
+            aggregator.add_batch(embeddings, batch[tio.LOCATION])
+        output = aggregator.get_output_tensor()
+        expected_shape = (embedding_dim,) + (image_size // downsampling_factor,) * 3
+        self.assertEqual(output.shape, expected_shape)
```
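
As a sanity check on `expected_shape`: each 20³ input patch is strided down to a 5³ grid, concatenating the single-channel result with itself five times yields `embedding_dim = 5` channels, and dividing the patch locations by the factor makes the eight patches tile a (40 // 4)³ = 10³ volume, so the aggregated output has shape (5, 10, 10, 10).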
