
Commit 6eedfe1

Add support for downsampled outputs in aggregator

1 parent 2785cd3 commit 6eedfe1

File tree: 2 files changed, +58 −6 lines changed

  src/torchio/data/inference/aggregator.py
  tests/data/inference/test_aggregator.py

src/torchio/data/inference/aggregator.py

Lines changed: 21 additions & 6 deletions
@@ -24,6 +24,12 @@ class GridAggregator:
             in the overlapping areas will be weighted with a Hann window
             function. See the `grid aggregator tests`_ for a raw visualization
             of the three modes.
+        downsampling_factor: Factor by which the output volume is expected to
+            be smaller than the input volume in each spatial dimension. This is
+            useful when the model downsamples the input (e.g., with strided
+            convolutions or pooling layers). Currently, only a single integer
+            is supported, which applies the same downsampling factor to all
+            spatial dimensions.
 
     .. _grid aggregator tests: https://github.com/TorchIO-project/torchio/blob/main/tests/data/inference/test_aggregator.py
 
@@ -32,7 +38,12 @@ class GridAggregator:
     information about patch-based sampling.
     """
 
-    def __init__(self, sampler: GridSampler, overlap_mode: str = 'crop'):
+    def __init__(
+        self,
+        sampler: GridSampler,
+        overlap_mode: str = 'crop',
+        downsampling_factor: int = 1,  # TODO: support one per dimension
+    ):
         subject = sampler.subject
         self.volume_padded = sampler.padding_mode is not None
         self.spatial_shape = subject.spatial_shape
@@ -43,6 +54,9 @@ def __init__(self, sampler: GridSampler, overlap_mode: str = 'crop'):
         self.overlap_mode = overlap_mode
         self._avgmask_tensor: torch.Tensor | None = None
         self._hann_window: torch.Tensor | None = None
+        self._downsampling_factor = downsampling_factor
+        shape_array = np.array(subject.spatial_shape) // self._downsampling_factor
+        self.spatial_shape = tuple(shape_array.tolist())
 
     @staticmethod
     def _parse_overlap_mode(overlap_mode):
@@ -137,7 +151,7 @@ def add_batch(
         batch_tensor: torch.Tensor,
         locations: torch.Tensor,
     ) -> None:
-        """Add batch processed by a CNN to the output prediction volume.
+        """Add batch processed by a network to the output prediction volume.
 
         Args:
             batch_tensor: 5D tensor, typically the output of a convolutional
@@ -147,12 +161,13 @@ def add_batch(
                 extracted using ``batch[torchio.LOCATION]``.
         """
         batch = batch_tensor.cpu()
-        locations_array = locations.cpu().numpy()
-        patch_sizes = locations_array[:, 3:] - locations_array[:, :3]
+        locations_array = locations.cpu().numpy() // self._downsampling_factor
+        target_shapes = locations_array[:, 3:] - locations_array[:, :3]
         # There should be only one patch size
-        assert len(np.unique(patch_sizes, axis=0)) == 1
+        assert len(np.unique(target_shapes, axis=0)) == 1
         input_spatial_shape = tuple(batch.shape[-3:])
-        target_spatial_shape = tuple(patch_sizes[0])
+        target_spatial_shape_array = target_shapes[0]
+        target_spatial_shape = tuple(target_spatial_shape_array.tolist())
         if input_spatial_shape != target_spatial_shape:
             message = (
                 f'The shape of the input batch, {input_spatial_shape},'
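
The core of the change is the location arithmetic in add_batch: patch locations, stored as rows of (i_ini, j_ini, k_ini, i_fin, j_fin, k_fin) indices, are integer-divided by the downsampling factor so that each patch is written into a proportionally smaller output volume. A minimal sketch of that arithmetic (the locations and factor below are illustrative values, not taken from the commit):

    import numpy as np

    # Illustrative values, not from the commit: two patch locations stored as
    # (i_ini, j_ini, k_ini, i_fin, j_fin, k_fin) rows, and a factor of 4.
    downsampling_factor = 4
    locations_array = np.array([
        [0, 0, 0, 20, 20, 20],
        [20, 0, 0, 40, 20, 20],
    ])

    # As in the patched add_batch: dividing the locations by the factor maps
    # each patch into an output grid that is 4x smaller per spatial axis.
    locations_small = locations_array // downsampling_factor
    target_shapes = locations_small[:, 3:] - locations_small[:, :3]
    assert len(np.unique(target_shapes, axis=0)) == 1
    print(locations_small[0])                # [0 0 0 5 5 5]
    print(tuple(target_shapes[0].tolist()))  # (5, 5, 5)

Because the division is integer, the volume shape and the patch locations should be exact multiples of the factor for the scaled patches to tile the output cleanly; the test below picks 40, 20, and 4 accordingly.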

tests/data/inference/test_aggregator.py

Lines changed: 37 additions & 0 deletions
@@ -142,3 +142,40 @@ def test_bad_aggregator_shape(self):
         inference_batch = torch.stack(patches)
         with pytest.raises(RuntimeError):
             aggregator.add_batch(inference_batch, batch[tio.LOCATION])
+
+    def test_downsampling_model(self):
+        # This might be useful to compute image embeddings using a sliding window
+        downsampling_factor = 4  # e.g. patch size in a ViT
+        embedding_dim = 5
+        net_input_size = 20
+        image_size = 40
+
+        def network(x):
+            down = x[
+                ...,
+                ::downsampling_factor,
+                ::downsampling_factor,
+                ::downsampling_factor,
+            ]
+            embeddings = torch.cat(embedding_dim * [down], dim=1)
+            return embeddings
+
+        tensor = torch.ones(1, image_size, image_size, image_size)
+        image_name = 'img'
+        subject = tio.Subject({image_name: tio.ScalarImage(tensor=tensor)})
+        sampler = tio.data.GridSampler(
+            subject,
+            patch_size=net_input_size,
+        )
+        aggregator = tio.data.GridAggregator(
+            sampler,
+            downsampling_factor=downsampling_factor,
+        )
+        loader = tio.SubjectsLoader(sampler, batch_size=3)
+        for batch in loader:
+            input_batch = batch[image_name][tio.DATA]
+            embeddings = network(input_batch)
+            aggregator.add_batch(embeddings, batch[tio.LOCATION])
+        output = aggregator.get_output_tensor()
+        expected_shape = (embedding_dim,) + (image_size // downsampling_factor,) * 3
+        self.assertEqual(output.shape, expected_shape)
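
For context, here is a hypothetical end-to-end usage sketch (not part of the commit) that replaces the test's slicing stand-in with an actual strided convolution; the Conv3d layer and all sizes are assumptions:

    import torch
    import torchio as tio

    # A network that downsamples each spatial dimension by 4 (20^3 patch -> 5^3).
    net = torch.nn.Conv3d(1, 8, kernel_size=4, stride=4)

    subject = tio.Subject(img=tio.ScalarImage(tensor=torch.rand(1, 40, 40, 40)))
    sampler = tio.data.GridSampler(subject, patch_size=20)
    aggregator = tio.data.GridAggregator(sampler, downsampling_factor=4)
    loader = tio.SubjectsLoader(sampler, batch_size=2)

    with torch.no_grad():
        for batch in loader:
            patches = batch['img'][tio.DATA]  # shape: (B, 1, 20, 20, 20)
            aggregator.add_batch(net(patches), batch[tio.LOCATION])

    embeddings = aggregator.get_output_tensor()  # shape: (8, 10, 10, 10)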
