Skip to content

Commit c5b1864

Browse files
Optimization: Efficient Sequential Batching (Solves #47, Supersedes #186) (#187)
* feat: Add efficient batching to reduce memory usage * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat: Add efficient batching to reduce memory usage * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Dependencies resolved, unnecessary docs removed, moved to pytest, benchmark is under scripts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat: Add efficient batching to reduce memory usage * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert to nnja_ai import * fix: Revert nnja_ai import in data module * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 8ae257c commit c5b1864

File tree

6 files changed

+485
-67
lines changed

6 files changed

+485
-67
lines changed

graph_weather/models/layers/assimilator_decoder.py

Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def __init__(
4141
hidden_dim_decoder: int = 128,
4242
hidden_layers_decoder: int = 2,
4343
use_checkpointing: bool = False,
44+
efficient_batching: bool = False,
4445
):
4546
"""
4647
Decoder from latent graph to lat/lon graph for assimilation of observation
@@ -63,6 +64,7 @@ def __init__(
6364
"""
6465
super().__init__()
6566
self.use_checkpointing = use_checkpointing
67+
self.efficient_batching = efficient_batching
6668
self.num_latlons = len(lat_lons)
6769
self.base_h3_grid = sorted(list(h3.uncompact_cells(h3.get_res0_cells(), resolution)))
6870
self.num_h3 = len(self.base_h3_grid)
@@ -137,28 +139,61 @@ def forward(self, processor_features: torch.Tensor, batch_size: int) -> torch.Te
137139
Updated features for model
138140
"""
139141
self.graph = self.graph.to(processor_features.device)
140-
edge_attr = self.edge_encoder(self.graph.edge_attr) # Update attributes based on distance
141-
edge_attr = einops.repeat(edge_attr, "e f -> (repeat e) f", repeat=batch_size)
142-
143-
edge_index = torch.cat(
144-
[
145-
self.graph.edge_index + i * torch.max(self.graph.edge_index) + i
146-
for i in range(batch_size)
147-
],
148-
dim=1,
149-
)
150-
151-
# Readd nodes to match graph node number
152142
self.latlon_nodes = self.latlon_nodes.to(processor_features.device)
153-
features = einops.rearrange(processor_features, "(b n) f -> b n f", b=batch_size)
154-
features = torch.cat(
155-
[features, einops.repeat(self.latlon_nodes, "n f -> b n f", b=batch_size)], dim=1
156-
)
157-
features = einops.rearrange(features, "b n f -> (b n) f")
158-
159-
out, _ = self.graph_processor(features, edge_index, edge_attr) # Message Passing
160-
# Remove the h3 nodes now, only want the latlon ones
161-
out = self.node_decoder(out) # Decode to 78 from 256
162-
out = einops.rearrange(out, "(b n) f -> b n f", b=batch_size)
163-
test, out = torch.split(out, [self.num_h3, self.num_latlons], dim=1)
164-
return out
143+
144+
if self.efficient_batching:
145+
# Efficient batching: process batches separately to avoid graph replication
146+
edge_attr = self.edge_encoder(self.graph.edge_attr) # Encode once
147+
148+
# Split processor features by batch
149+
proc_features_batched = einops.rearrange(
150+
processor_features, "(b n) f -> b n f", b=batch_size
151+
)
152+
153+
batch_outputs = []
154+
for i in range(batch_size):
155+
# Get features for this batch
156+
feat_i = torch.cat(
157+
[proc_features_batched[i], self.latlon_nodes], dim=0
158+
) # [num_h3 + num_latlon, F]
159+
160+
# Message passing with single graph (no replication)
161+
out_i, _ = self.graph_processor(feat_i, self.graph.edge_index, edge_attr)
162+
163+
# Decode and extract latlon nodes
164+
out_i = self.node_decoder(out_i)
165+
out_i = out_i[self.num_h3 :] # Keep only latlon nodes
166+
167+
batch_outputs.append(out_i)
168+
169+
# Stack outputs
170+
out = torch.stack(batch_outputs, dim=0) # [B, num_latlon, F]
171+
return out
172+
else:
173+
# Original batching implementation
174+
edge_attr = self.edge_encoder(
175+
self.graph.edge_attr
176+
) # Update attributes based on distance
177+
edge_attr = einops.repeat(edge_attr, "e f -> (repeat e) f", repeat=batch_size)
178+
179+
edge_index = torch.cat(
180+
[
181+
self.graph.edge_index + i * torch.max(self.graph.edge_index) + i
182+
for i in range(batch_size)
183+
],
184+
dim=1,
185+
)
186+
187+
# Readd nodes to match graph node number
188+
features = einops.rearrange(processor_features, "(b n) f -> b n f", b=batch_size)
189+
features = torch.cat(
190+
[features, einops.repeat(self.latlon_nodes, "n f -> b n f", b=batch_size)], dim=1
191+
)
192+
features = einops.rearrange(features, "b n f -> (b n) f")
193+
194+
out, _ = self.graph_processor(features, edge_index, edge_attr) # Message Passing
195+
# Remove the h3 nodes now, only want the latlon ones
196+
out = self.node_decoder(out) # Decode to 78 from 256
197+
out = einops.rearrange(out, "(b n) f -> b n f", b=batch_size)
198+
test, out = torch.split(out, [self.num_h3, self.num_latlons], dim=1)
199+
return out

graph_weather/models/layers/decoder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(
3737
hidden_dim_decoder: int = 128,
3838
hidden_layers_decoder: int = 2,
3939
use_checkpointing: bool = False,
40+
efficient_batching: bool = False,
4041
):
4142
"""
4243
Decoder from latent graph to lat/lon graph
@@ -56,6 +57,7 @@ def __init__(
5657
mlp_norm_type: Type of norm for the MLPs
5758
one of 'LayerNorm', 'GraphNorm', 'InstanceNorm', 'BatchNorm', 'MessageNorm', or None
5859
use_checkpointing: Whether to use gradient checkpointing or not
60+
efficient_batching: Whether to use efficient batching (avoids graph replication)
5961
"""
6062
super().__init__(
6163
lat_lons,
@@ -71,6 +73,7 @@ def __init__(
7173
hidden_dim_decoder,
7274
hidden_layers_decoder,
7375
use_checkpointing,
76+
efficient_batching,
7477
)
7578

7679
def forward(

graph_weather/models/layers/encoder.py

Lines changed: 74 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(
4949
hidden_layers_processor_edge=2,
5050
mlp_norm_type="LayerNorm",
5151
use_checkpointing: bool = False,
52+
efficient_batching: bool = False,
5253
):
5354
"""
5455
Encode the lat/lon data inot the isohedron graph
@@ -69,6 +70,7 @@ def __init__(
6970
"""
7071
super().__init__()
7172
self.use_checkpointing = use_checkpointing
73+
self.efficient_batching = efficient_batching
7274
self.output_dim = output_dim
7375
self.num_latlons = len(lat_lons)
7476
self.base_h3_grid = sorted(list(h3.uncompact_cells(h3.get_res0_cells(), resolution)))
@@ -161,46 +163,82 @@ def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, t
161163
self.h3_nodes = self.h3_nodes.to(features.device)
162164
self.graph = self.graph.to(features.device)
163165
self.latent_graph = self.latent_graph.to(features.device)
164-
features = torch.cat(
165-
[features, einops.repeat(self.h3_nodes, "n f -> b n f", b=batch_size)],
166-
dim=1,
167-
)
168-
# Cat with the h3 nodes to have correct amount of nodes, and in right order
169-
features = einops.rearrange(features, "b n f -> (b n) f")
170-
out = self.node_encoder(features) # Encode to 256 from 78
171-
edge_attr = self.edge_encoder(self.graph.edge_attr) # Update attributes based on distance
172-
# Copy attributes batch times
173-
edge_attr = einops.repeat(edge_attr, "e f -> (repeat e) f", repeat=batch_size)
174-
# Expand edge index correct number of times while adding the proper number to the edge index
175-
edge_index = torch.cat(
176-
[
177-
self.graph.edge_index + i * torch.max(self.graph.edge_index) + i
178-
for i in range(batch_size)
179-
],
180-
dim=1,
181-
)
182-
out, _ = self.graph_processor(out, edge_index, edge_attr) # Message Passing
183-
# Remove the extra nodes (lat/lon) from the output
184-
out = einops.rearrange(out, "(b n) f -> b n f", b=batch_size)
185-
_, out = torch.split(out, [self.num_latlons, self.h3_nodes.shape[0]], dim=1)
186-
out = einops.rearrange(out, "b n f -> (b n) f")
187-
return (
188-
out,
189-
torch.cat(
166+
167+
if self.efficient_batching:
168+
# Efficient batching: process batches separately to avoid graph replication
169+
batch_outputs = []
170+
for i in range(batch_size):
171+
# Process single batch item
172+
feat_i = torch.cat([features[i : i + 1], self.h3_nodes.unsqueeze(0)], dim=1)
173+
feat_i = feat_i.squeeze(0) # [N, F]
174+
175+
# Encode nodes
176+
out_i = self.node_encoder(feat_i)
177+
178+
# Encode edges (no replication needed)
179+
edge_attr_i = self.edge_encoder(self.graph.edge_attr)
180+
181+
# Message passing with single graph
182+
out_i, _ = self.graph_processor(out_i, self.graph.edge_index, edge_attr_i)
183+
184+
# Extract H3 nodes only
185+
out_i = out_i[self.num_latlons :] # Keep only H3 nodes
186+
batch_outputs.append(out_i)
187+
188+
# Stack outputs
189+
out = torch.cat(batch_outputs, dim=0) # [B*num_h3, F]
190+
191+
# Return with SHARED latent graph (NO replication at all!)
192+
latent_edge_attr = self.latent_edge_encoder(self.latent_graph.edge_attr)
193+
194+
# Return the single shared graph - no batching overhead
195+
return (out, self.latent_graph.edge_index, latent_edge_attr)
196+
else:
197+
# Original batching implementation
198+
features = torch.cat(
199+
[features, einops.repeat(self.h3_nodes, "n f -> b n f", b=batch_size)],
200+
dim=1,
201+
)
202+
# Cat with the h3 nodes to have correct amount of nodes, and in right order
203+
features = einops.rearrange(features, "b n f -> (b n) f")
204+
out = self.node_encoder(features) # Encode to 256 from 78
205+
edge_attr = self.edge_encoder(
206+
self.graph.edge_attr
207+
) # Update attributes based on distance
208+
# Copy attributes batch times
209+
edge_attr = einops.repeat(edge_attr, "e f -> (repeat e) f", repeat=batch_size)
210+
# Expand edge index correct number of times while adding the proper number to the edge index
211+
edge_index = torch.cat(
190212
[
191-
self.latent_graph.edge_index + i * torch.max(self.latent_graph.edge_index) + i
213+
self.graph.edge_index + i * torch.max(self.graph.edge_index) + i
192214
for i in range(batch_size)
193215
],
194216
dim=1,
195-
),
196-
self.latent_edge_encoder(
197-
einops.repeat(
198-
self.latent_graph.edge_attr,
199-
"e f -> (repeat e) f",
200-
repeat=batch_size,
201-
)
202-
),
203-
) # New graph
217+
)
218+
out, _ = self.graph_processor(out, edge_index, edge_attr) # Message Passing
219+
# Remove the extra nodes (lat/lon) from the output
220+
out = einops.rearrange(out, "(b n) f -> b n f", b=batch_size)
221+
_, out = torch.split(out, [self.num_latlons, self.h3_nodes.shape[0]], dim=1)
222+
out = einops.rearrange(out, "b n f -> (b n) f")
223+
return (
224+
out,
225+
torch.cat(
226+
[
227+
self.latent_graph.edge_index
228+
+ i * torch.max(self.latent_graph.edge_index)
229+
+ i
230+
for i in range(batch_size)
231+
],
232+
dim=1,
233+
),
234+
self.latent_edge_encoder(
235+
einops.repeat(
236+
self.latent_graph.edge_attr,
237+
"e f -> (repeat e) f",
238+
repeat=batch_size,
239+
)
240+
),
241+
) # New graph
204242

205243
def create_latent_graph(self) -> Data:
206244
"""

graph_weather/models/layers/processor.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,20 +63,49 @@ def __init__(
6363
if self.use_thermalizer:
6464
self.thermalizer = ThermalizerLayer(input_dim)
6565

66-
def forward(self, x: torch.Tensor, edge_index, edge_attr, t: int = 0) -> torch.Tensor:
66+
def forward(
67+
self,
68+
x: torch.Tensor,
69+
edge_index,
70+
edge_attr,
71+
t: int = 0,
72+
batch_size: int = None,
73+
efficient_batching: bool = False,
74+
) -> torch.Tensor:
6775
"""
6876
Adds features to the encoding graph
6977
7078
Args:
71-
x: Torch tensor containing node features
79+
x: Torch tensor containing node features [B*N, F] or [N, F]
7280
edge_index: Connectivity of graph, of shape [2, Num edges] in COO format
73-
edge_attr: Edge attribues in [Num edges, Features] shape
81+
edge_attr: Edge attributes in [Num edges, Features] shape
7482
t: Timestep for the thermalizer
83+
batch_size: Batch size (required when efficient_batching=True)
84+
efficient_batching: If True, process batches separately with shared graph
7585
7686
Returns:
7787
torch Tensor containing the values of the nodes of the graph
7888
"""
79-
out, _ = self.graph_processor(x, edge_index, edge_attr)
80-
if self.use_thermalizer:
81-
out = self.thermalizer(out, t)
82-
return out
89+
if efficient_batching and batch_size is not None and batch_size > 1:
90+
# Efficient batching: process each batch separately with shared graph
91+
# x is [B*N, F], split into B batches of [N, F]
92+
num_nodes_per_batch = x.shape[0] // batch_size
93+
x_batched = x.view(batch_size, num_nodes_per_batch, -1)
94+
95+
batch_outputs = []
96+
for i in range(batch_size):
97+
# Process single batch with shared graph
98+
out_i, _ = self.graph_processor(x_batched[i], edge_index, edge_attr)
99+
if self.use_thermalizer:
100+
out_i = self.thermalizer(out_i, t)
101+
batch_outputs.append(out_i)
102+
103+
# Concatenate outputs back to [B*N, F] format
104+
out = torch.cat(batch_outputs, dim=0)
105+
return out
106+
else:
107+
# Original batching: process all at once with batched graph
108+
out, _ = self.graph_processor(x, edge_index, edge_attr)
109+
if self.use_thermalizer:
110+
out = self.thermalizer(out, t)
111+
return out

0 commit comments

Comments
 (0)