Skip to content

Commit 0d02d18

Browse files
committed
update Ln-norm logic for upcoming PyTorch update (#206)
1 parent 7ccd58e commit 0d02d18

File tree

2 files changed

+12
-6
lines changed

2 files changed

+12
-6
lines changed

modelopt/torch/nas/plugins/megatron.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -622,11 +622,15 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
622622
def _estimate_query_group_importance(self) -> TracedHp.Importance:
623623
"""Return the importance of the ``num_query_groups`` hparam."""
624624
assert self._activations is not None, "No activations collected for importance estimation."
625-
group_importance = self._activations.view(
626-
self.get_hparam("num_heads_per_group").max,
627-
self.get_hparam("num_query_groups").max,
628-
self.config.kv_channels,
629-
).norm(p=2, dim=(0, 2))
625+
group_importance = torch.linalg.norm(
626+
self._activations.view(
627+
self.get_hparam("num_heads_per_group").max,
628+
self.get_hparam("num_query_groups").max,
629+
self.config.kv_channels,
630+
),
631+
ord=2,
632+
dim=(0, 2),
633+
)
630634
return group_importance
631635

632636
def export(self) -> torch.nn.Module:

modelopt/torch/nas/plugins/transformers.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,7 +122,9 @@ def configure_qkv_out(self, q_name: str, k_name: str, v_name: str, out_name: str
122122
out.in_features = hp_hidden_dim
123123

124124
assert isinstance(out, nn.Linear)
125-
hp_hidden_dim.register_importance(lambda: out._parameters["weight"].detach().norm(dim=0))
125+
hp_hidden_dim.register_importance(
126+
lambda: torch.linalg.norm(out._parameters["weight"].detach(), dim=0)
127+
)
126128

127129
def modify(
128130
self, *, n_heads_ratio: tuple[float, ...] | None = None, n_heads_divisor: int = 1

0 commit comments

Comments (0)