move scatter op out of contracter module

cw-tan · cw-tan · commit 211bc489bb61 · 2026-02-24T17:07:22.000-05:00
diff --git a/allegro/nn/_allegro.py b/allegro/nn/_allegro.py
@@ -13,6 +13,7 @@
     ScalarMLPFunction,
     tp_path_exists,
     AvgNumNeighborsNorm,
+    scatter,
 )
 
 from ._strided import Contracter, MakeWeightedChannels
@@ -258,18 +259,36 @@ def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type:
             projection, -1, self.num_scalar_features, self._env_weighter.weight_numel
         )
 
-        # Get normalization tensor
-        scatter_norm = self.avg_num_neighbors_norm(data)[:num_atoms].unsqueeze(-1)
-
         layer_index: int = 0
         for latent, tp in zip(self.latents, self.tps):
-            # === Env Weight & TP ===
+            # === construct env weighted tensor ===
             env_w_edges = self._env_weighter(tensor_basis, env_w)
-            # scatter env_w_edges and TP with tensor_features
-            # second input irreps is the one that is scattered
+
+            # scatter env_w_edges to nodes and normalize
+            env_w_scatter = scatter(
+                env_w_edges,
+                edge_center,
+                dim=0,
+                dim_size=num_atoms,
+            )
+            env_w_scatter_size0 = env_w_scatter.size(0)
+            env_w_scatter_size1 = env_w_scatter.size(1)
+            env_w_scatter_size2 = env_w_scatter.size(2)
+            data[AtomicDataDict.NODE_FEATURES_KEY] = env_w_scatter.view(
+                env_w_scatter_size0,
+                env_w_scatter_size1 * env_w_scatter_size2,
+            )
+            data = self.avg_num_neighbors_norm(data)
+
+            # === TP ===
+            # second input irreps is node-scattered env features
             irin1 = tensor_features
-            irin2 = env_w_edges
-            tensor_features = tp(irin1, irin2, edge_center, num_atoms, scatter_norm)
+            irin2 = data[AtomicDataDict.NODE_FEATURES_KEY].view(
+                env_w_scatter_size0,
+                env_w_scatter_size1,
+                env_w_scatter_size2,
+            )
+            tensor_features = tp(irin1, irin2, edge_center)
 
             # Extract invariants from tensor track
             # features has shape [z][mul][k], where scalars are first
diff --git a/allegro/nn/_strided/_contract.py b/allegro/nn/_strided/_contract.py
@@ -3,7 +3,7 @@
 import torch
 from e3nn.o3._irreps import Irreps
 from e3nn.o3._wigner import wigner_3j
-from nequip.nn import scatter, replace_submodules, model_modifier
+from nequip.nn import replace_submodules, model_modifier
 from nequip.utils.dtype import torch_default_dtype
 from typing import List, Tuple, Optional
 
@@ -187,24 +187,12 @@ def forward(
         x1: torch.Tensor,
         x2: torch.Tensor,
         idxs: torch.Tensor,
-        scatter_dim_size: int,
-        scatter_norm: torch.Tensor,
     ) -> torch.Tensor:
-        # scatter and index select
-        x2_scatter = scatter(
-            x2,
-            idxs,
-            dim=0,
-            dim_size=scatter_dim_size,
-        )
-        # normalization
-        x2_scatter = x2_scatter * scatter_norm
-
         # === perform TP ===
         # convert to strided shape
         x1 = x1.reshape(-1, self.mul, self.base_dim1)
-        x2_scatter = x2_scatter.reshape(-1, self.mul, self.base_dim2)
-        return self._contract_conv(x1, x2_scatter, idxs)
+        x2 = x2.reshape(-1, self.mul, self.base_dim2)
+        return self._contract_conv(x1, x2, idxs)
 
     def _contract_conv(
         self, x1: torch.Tensor, x2: torch.Tensor, idxs: torch.Tensor
diff --git a/allegro/nn/_strided/_cueq_contracter.py b/allegro/nn/_strided/_cueq_contracter.py
@@ -1,7 +1,6 @@
 # This file is a part of the `allegro` package. Please see LICENSE and README at the root for information on using it.
 import torch
 
-from nequip.nn import scatter
 from ._contract import Contracter
 
 import itertools
@@ -87,20 +86,12 @@ def forward(
         x1: torch.Tensor,
         x2: torch.Tensor,
         idxs: torch.Tensor,
-        scatter_dim_size: int,
-        scatter_norm: torch.Tensor,
     ) -> torch.Tensor:
         # NOTE: the reason for some duplicated code is because TorchScript doesn't support super() calls
         # see https://github.com/pytorch/pytorch/issues/42885
 
-        x2_scatter = scatter(
-            x2,
-            idxs,
-            dim=0,
-            dim_size=scatter_dim_size,
-        )
-
-        x2_scatter = x2_scatter * scatter_norm
+        x1 = x1.reshape(-1, self.mul, self.base_dim1)
+        x2 = x2.reshape(-1, self.mul, self.base_dim2)
 
         if x1.is_cuda and self.num_paths >= 1:
             empty_dict: Dict[int, torch.Tensor] = {}  # for torchscript
@@ -120,10 +111,10 @@ def forward(
                     .view(
                         x1.size(0), self.base_dim1 * self.mul
                     ),  # (edges, irreps * mul)
-                    x2_scatter.transpose(1, 2)
+                    x2.transpose(1, 2)
                     .contiguous()
                     .view(
-                        scatter_dim_size, self.base_dim2 * self.mul
+                        x2.size(0), self.base_dim2 * self.mul
                     ),  # (atoms, irreps * mul)
                 ],
                 {2: idxs},  # input indices
@@ -138,4 +129,4 @@ def forward(
                 .contiguous()
             )
         else:
-            return self._contract_conv(x1, x2_scatter, idxs)
+            return self._contract_conv(x1, x2, idxs)
diff --git a/allegro/nn/edgewise.py b/allegro/nn/edgewise.py
@@ -53,8 +53,9 @@ def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type:
             reduce=self.reduce,
         )
         # === scale ===
-        factor = self.norm_module(data)[: AtomicDataDict.num_nodes(data)]
-        out = out * (factor / sqrt(2))
+        data[AtomicDataDict.NODE_FEATURES_KEY] = out
+        data = self.norm_module(data)
+        out = data[AtomicDataDict.NODE_FEATURES_KEY] / sqrt(2)
         # ^ factor of 2 to normalize dE/dr_i which includes both contributions from dE/dr_ij
         # and every other derivative against r_ji.
 
diff --git a/allegro/utils/autotune_triton.py b/allegro/utils/autotune_triton.py
@@ -86,13 +86,11 @@ def create_nacl_supercell(supercell_size=10):
 ]
 
 
-def benchmark_forward(
-    contracter, input1, input2, scatter_idxs, num_atoms, warmup=3, n_iter=10
-):
+def benchmark_forward(contracter, input1, input2, scatter_idxs, warmup=3, n_iter=10):
     """Benchmark forward pass."""
     # warmup
     for _ in range(warmup):
-        _ = contracter(input1, input2, scatter_idxs, num_atoms)
+        _ = contracter(input1, input2, scatter_idxs)
         torch.cuda.synchronize()
 
     # benchmark
@@ -101,7 +99,7 @@ def benchmark_forward(
 
     start_event.record()
     for _ in range(n_iter):
-        _ = contracter(input1, input2, scatter_idxs, num_atoms)
+        _ = contracter(input1, input2, scatter_idxs)
     end_event.record()
 
     torch.cuda.synchronize()
@@ -110,9 +108,7 @@ def benchmark_forward(
     return total_time_ms / n_iter
 
 
-def benchmark_backward(
-    contracter, input1, input2, scatter_idxs, num_atoms, warmup=3, n_iter=10
-):
+def benchmark_backward(contracter, input1, input2, scatter_idxs, warmup=3, n_iter=10):
     """Benchmark full forward+backward pass.
 
     Returns:
@@ -122,7 +118,7 @@ def benchmark_backward(
     for _ in range(warmup):
         input1.grad = None
         input2.grad = None
-        out = contracter(input1, input2, scatter_idxs, num_atoms)
+        out = contracter(input1, input2, scatter_idxs)
         grad_out = torch.randn_like(out)
         out.backward(grad_out)
         torch.cuda.synchronize()
@@ -135,7 +131,7 @@ def benchmark_backward(
     for _ in range(n_iter):
         input1.grad = None
         input2.grad = None
-        out = contracter(input1, input2, scatter_idxs, num_atoms)
+        out = contracter(input1, input2, scatter_idxs)
         grad_out = torch.randn_like(out)
         out.backward(grad_out)
     end_event.record()
@@ -188,7 +184,6 @@ def autotune(
     num_nodes = AtomicDataDict.num_nodes(data)
     num_edges = AtomicDataDict.num_edges(data)
     scatter_idxs = data[AtomicDataDict.EDGE_INDEX_KEY][1]
-    num_atoms_tensor = torch.tensor([num_nodes], dtype=torch.int64, device=device)
 
     print(f"  num_nodes: {num_nodes}")
     print(f"  num_edges: {num_edges}")
@@ -206,9 +201,9 @@ def autotune(
         irreps_in2 = model_config["irreps_in2"]
         mul = model_config["mul"]
 
-        # both inputs are edge-indexed, enable grad for backward
+        # input1 is edge-indexed, input2 is node-indexed
         input1 = irreps_in1.randn(num_edges, mul, -1, dtype=dtype, device=device)
-        input2 = irreps_in2.randn(num_edges, mul, -1, dtype=dtype, device=device)
+        input2 = irreps_in2.randn(num_nodes, mul, -1, dtype=dtype, device=device)
         input1.requires_grad_(True)
         input2.requires_grad_(True)
 
@@ -237,7 +232,6 @@ def autotune(
                 input1,
                 input2,
                 scatter_idxs,
-                num_atoms_tensor,
                 warmup=5,
                 n_iter=20,
             )
@@ -247,7 +241,6 @@ def autotune(
                 input1,
                 input2,
                 scatter_idxs,
-                num_atoms_tensor,
                 warmup=5,
                 n_iter=20,
             )
@@ -285,7 +278,6 @@ def autotune(
                     input1,
                     input2,
                     scatter_idxs,
-                    num_atoms_tensor,
                     warmup=5,
                     n_iter=20,
                 )
@@ -295,7 +287,6 @@ def autotune(
                     input1,
                     input2,
                     scatter_idxs,
-                    num_atoms_tensor,
                     warmup=5,
                     n_iter=20,
                 )