From 6badd1e31d6b2fc4bf4dc9b468f609b5ff817390 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Fri, 6 Jun 2025 16:31:23 +0900
Subject: [PATCH 1/5] Replace torch.norm with torch.linalg.vector_norm in L1-norm

torch.norm is deprecated and may be removed in future PyTorch releases
---
 modelopt/torch/nas/modules/conv.py   | 8 ++++++--
 modelopt/torch/nas/modules/linear.py | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/modelopt/torch/nas/modules/conv.py b/modelopt/torch/nas/modules/conv.py
index 4769a46c9..935bbfc96 100644
--- a/modelopt/torch/nas/modules/conv.py
+++ b/modelopt/torch/nas/modules/conv.py
@@ -139,7 +139,9 @@ def _estimate_importance(self) -> TracedHp.Importance:
             return None
         weight = self._parameters["weight"]  # retrieve full weight tensor
         c_in = weight.shape[1]
-        return torch.norm(torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1)
+        return torch.linalg.vector_norm(
+            torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
+        )
 
     def _setup(self):
         # only support ungrouped conv or grouped conv with in_channels == out_channels
@@ -249,4 +251,6 @@ def _estimate_importance(self) -> TracedHp.Importance:
             return None
         weight = self._parameters["weight"]  # retrieve full weight tensor
         c_in = weight.shape[0]
-        return torch.norm(torch.reshape(weight.detach(), (c_in, -1)), dim=1)
+        return torch.linalg.vector_norm(
+            torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
+        )
diff --git a/modelopt/torch/nas/modules/linear.py b/modelopt/torch/nas/modules/linear.py
index b82bed68e..b8c171a63 100644
--- a/modelopt/torch/nas/modules/linear.py
+++ b/modelopt/torch/nas/modules/linear.py
@@ -41,7 +41,7 @@ def _get_bias(mod: "_DynamicLinear", bias: torch.Tensor | None) -> torch.Tensor
         return get_sliced_tensor(mod, bias, "out_features")
 
     def _estimate_importance(self) -> TracedHp.Importance:
-        return self._parameters["weight"].detach().norm(dim=0)
+        return torch.linalg.vector_norm(self._parameters["weight"].detach(), dim=0)
 
     def _setup(self):
         # register hyperparameters

From a619fa7404f918fbfaf572825fb8a83837607753 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Fri, 6 Jun 2025 16:42:39 +0900
Subject: [PATCH 2/5] Replace torch.norm with torch.linalg.vector_norm in L1-norm

torch.norm is deprecated and may be removed in future PyTorch releases
---
 modelopt/torch/nas/modules/conv.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modelopt/torch/nas/modules/conv.py b/modelopt/torch/nas/modules/conv.py
index 935bbfc96..e4b1eebe7 100644
--- a/modelopt/torch/nas/modules/conv.py
+++ b/modelopt/torch/nas/modules/conv.py
@@ -251,6 +251,4 @@ def _estimate_importance(self) -> TracedHp.Importance:
             return None
         weight = self._parameters["weight"]  # retrieve full weight tensor
         c_in = weight.shape[0]
-        return torch.linalg.vector_norm(
-            torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
-        )
+        return torch.linalg.vector_norm(torch.reshape(weight.detach(), (c_in, -1)), dim=1)
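
For reference, `torch.norm(x, dim=1)` and `torch.linalg.vector_norm(x, dim=1)` both reduce to the same per-row L2 norm, so the replacement in the two patches above does not change the computed importance values. A minimal sketch, assuming an arbitrary example weight shape (not taken from modelopt):

    import torch

    # Hypothetical Conv2d-style weight (c_out, c_in, kH, kW); the shape is illustrative only.
    weight = torch.randn(16, 8, 3, 3)
    c_in = weight.shape[1]

    # Deprecated call removed by the patch vs. its torch.linalg replacement.
    old = torch.norm(torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1)
    new = torch.linalg.vector_norm(
        torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
    )
    assert torch.allclose(old, new)  # both are per-input-channel L2 norms

Note that PATCH 2/5 also drops the `.transpose(0, 1)` that PATCH 1/5 introduced in the second conv hunk, restoring the original reshape for that branch.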
From 0d02d184801d52ce26044480b6713e3f025c1920 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Wed, 18 Jun 2025 15:29:10 +0900
Subject: [PATCH 3/5] update Ln-norm logic for upcoming PyTorch update (#206)

---
 modelopt/torch/nas/plugins/megatron.py     | 14 +++++++++-----
 modelopt/torch/nas/plugins/transformers.py |  4 +++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index 30eb01507..abe4aa4a2 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -622,11 +622,15 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
     def _estimate_query_group_importance(self) -> TracedHp.Importance:
         """Return the importance of the ``num_query_groups`` hparam."""
         assert self._activations is not None, "No activations collected for importance estimation."
-        group_importance = self._activations.view(
-            self.get_hparam("num_heads_per_group").max,
-            self.get_hparam("num_query_groups").max,
-            self.config.kv_channels,
-        ).norm(p=2, dim=(0, 2))
+        group_importance = torch.linalg.norm(
+            self._activations.view(
+                self.get_hparam("num_heads_per_group").max,
+                self.get_hparam("num_query_groups").max,
+                self.config.kv_channels,
+            ),
+            ord=2,
+            dim=(0, 2),
+        )
         return group_importance
 
     def export(self) -> torch.nn.Module:
diff --git a/modelopt/torch/nas/plugins/transformers.py b/modelopt/torch/nas/plugins/transformers.py
index ad8dcebec..a7c8d95b7 100644
--- a/modelopt/torch/nas/plugins/transformers.py
+++ b/modelopt/torch/nas/plugins/transformers.py
@@ -122,7 +122,9 @@ def configure_qkv_out(self, q_name: str, k_name: str, v_name: str, out_name: str
         out.in_features = hp_hidden_dim
 
         assert isinstance(out, nn.Linear)
-        hp_hidden_dim.register_importance(lambda: out._parameters["weight"].detach().norm(dim=0))
+        hp_hidden_dim.register_importance(
+            lambda: torch.linalg.norm(out._parameters["weight"].detach(), dim=0)
+        )
 
     def modify(
         self, *, n_heads_ratio: tuple[float, ...] | None = None, n_heads_divisor: int = 1

From bb630dbc29197e99bf3e719f7101c2676aa520c8 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Thu, 19 Jun 2025 13:35:08 +0900
Subject: [PATCH 4/5] Replaces `torch.linalg.norm` with `torch.linalg.vector_norm` for vector-norm

`torch.linalg.norm` supports various calculations based on the `dim` parameter:

- If dim is an int, the vector norm will be computed.
- If dim is a 2-tuple, the matrix norm will be computed.
- If dim=None and ord=None, A will be flattened to 1D and the 2-norm of the resulting vector will be computed.
- If dim=None and ord!=None, A must be 1D or 2D.

Therefore, the vector norm is not computed when `dim` is a tuple.
(Nit: `torch.linalg.vector_norm` is more explicit for vector norms.)
---
 modelopt/torch/nas/plugins/megatron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index abe4aa4a2..57e783d4f 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -622,7 +622,7 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
     def _estimate_query_group_importance(self) -> TracedHp.Importance:
         """Return the importance of the ``num_query_groups`` hparam."""
         assert self._activations is not None, "No activations collected for importance estimation."
-        group_importance = torch.linalg.norm(
+        group_importance = torch.linalg.vector_norm(
             self._activations.view(
                 self.get_hparam("num_heads_per_group").max,
                 self.get_hparam("num_query_groups").max,
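
The distinction described in the PATCH 4/5 message is easy to see in isolation: with a 2-tuple `dim`, `torch.linalg.norm` computes a matrix norm over those dimensions (the largest singular value for `ord=2`), whereas `torch.linalg.vector_norm` flattens them and takes an elementwise 2-norm. A minimal sketch, with shapes chosen purely for illustration (not the modelopt hparam values):

    import torch

    x = torch.randn(4, 8, 16)

    # 2-tuple dim + ord=2 -> matrix (spectral) norm of each (4, 16) slice.
    mat = torch.linalg.norm(x, ord=2, dim=(0, 2))

    # vector_norm treats the same dims as one flattened vector -> sqrt of the sum of squares.
    vec = torch.linalg.vector_norm(x, ord=2, dim=(0, 2))

    print(mat.shape, vec.shape)      # both torch.Size([8])
    print(torch.allclose(mat, vec))  # False in general: they are different norms

This is why the hunk above switches to `torch.linalg.vector_norm`: the intended quantity is an L2 norm over the flattened head and channel dimensions, not a spectral norm.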

From 4e98a250aa6934d0ad01bd3b0b7d2215342de6c1 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Wed, 25 Jun 2025 11:44:01 +0900
Subject: [PATCH 5/5] update Ln-norm logic for upcoming PyTorch release

- `torch.norm` is deprecated in favor of `torch.linalg.norm` and `torch.linalg.vector_norm`.
---
 modelopt/torch/nas/plugins/megatron.py     | 2 +-
 modelopt/torch/nas/plugins/transformers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index 57e783d4f..e65cdcb9b 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -616,7 +616,7 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
         attn_head_importance = self._activations.view(
             self.get_hparam("num_heads_per_group").max * self.get_hparam("num_query_groups").max,
             self.config.kv_channels,
-        ).norm(p=2, dim=1)
+        ).vector_norm(ord=2, dim=1)
         return attn_head_importance
 
     def _estimate_query_group_importance(self) -> TracedHp.Importance:
diff --git a/modelopt/torch/nas/plugins/transformers.py b/modelopt/torch/nas/plugins/transformers.py
index a7c8d95b7..61d5cd5d6 100644
--- a/modelopt/torch/nas/plugins/transformers.py
+++ b/modelopt/torch/nas/plugins/transformers.py
@@ -123,7 +123,7 @@ def configure_qkv_out(self, q_name: str, k_name: str, v_name: str, out_name: str
 
         assert isinstance(out, nn.Linear)
         hp_hidden_dim.register_importance(
-            lambda: torch.linalg.norm(out._parameters["weight"].detach(), dim=0)
+            lambda: torch.linalg.vector_norm(out._parameters["weight"].detach(), dim=0)
         )
 
     def modify(
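
For context on the megatron hunk in the last patch: the collected activations are viewed as one row per attention head, and each head is scored by the L2 norm of its row. A minimal sketch of that pattern using the `torch.linalg.vector_norm` function form, with made-up placeholder sizes rather than modelopt's real configuration:

    import torch

    num_heads_per_group, num_query_groups, kv_channels = 4, 2, 64  # placeholders
    activations = torch.randn(num_heads_per_group * num_query_groups * kv_channels)

    # One L2 score per attention head; shape (num_heads_per_group * num_query_groups,).
    attn_head_importance = torch.linalg.vector_norm(
        activations.view(num_heads_per_group * num_query_groups, kv_channels),
        ord=2,
        dim=1,
    )
    print(attn_head_importance.shape)  # torch.Size([8])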