From 6badd1e31d6b2fc4bf4dc9b468f609b5ff817390 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Fri, 6 Jun 2025 16:31:23 +0900
Subject: [PATCH 1/5] Replace torch.norm with torch.linalg.vector_norm in L1-norm

torch.norm is deprecated and may be removed in future PyTorch releases
---
 modelopt/torch/nas/modules/conv.py   | 8 ++++++--
 modelopt/torch/nas/modules/linear.py | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/modelopt/torch/nas/modules/conv.py b/modelopt/torch/nas/modules/conv.py
index 4769a46c9..935bbfc96 100644
--- a/modelopt/torch/nas/modules/conv.py
+++ b/modelopt/torch/nas/modules/conv.py
@@ -139,7 +139,9 @@ def _estimate_importance(self) -> TracedHp.Importance:
             return None
         weight = self._parameters["weight"]  # retrieve full weight tensor
         c_in = weight.shape[1]
-        return torch.norm(torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1)
+        return torch.linalg.vector_norm(
+            torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
+        )
 
     def _setup(self):
         # only support ungrouped conv or grouped conv with in_channels == out_channels
@@ -249,4 +251,6 @@ def _estimate_importance(self) -> TracedHp.Importance:
             return None
         weight = self._parameters["weight"]  # retrieve full weight tensor
         c_in = weight.shape[0]
-        return torch.norm(torch.reshape(weight.detach(), (c_in, -1)), dim=1)
+        return torch.linalg.vector_norm(
+            torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
+        )
diff --git a/modelopt/torch/nas/modules/linear.py b/modelopt/torch/nas/modules/linear.py
index b82bed68e..b8c171a63 100644
--- a/modelopt/torch/nas/modules/linear.py
+++ b/modelopt/torch/nas/modules/linear.py
@@ -41,7 +41,7 @@ def _get_bias(mod: "_DynamicLinear", bias: torch.Tensor | None) -> torch.Tensor
         return get_sliced_tensor(mod, bias, "out_features")
 
     def _estimate_importance(self) -> TracedHp.Importance:
-        return self._parameters["weight"].detach().norm(dim=0)
+        return torch.linalg.vector_norm(self._parameters["weight"].detach(), dim=0)
 
     def _setup(self):
         # register hyperparameters

From a619fa7404f918fbfaf572825fb8a83837607753 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Fri, 6 Jun 2025 16:42:39 +0900
Subject: [PATCH 2/5] Replace torch.norm with torch.linalg.vector_norm in L1-norm

torch.norm is deprecated and may be removed in future PyTorch releases
---
 modelopt/torch/nas/modules/conv.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modelopt/torch/nas/modules/conv.py b/modelopt/torch/nas/modules/conv.py
index 935bbfc96..e4b1eebe7 100644
--- a/modelopt/torch/nas/modules/conv.py
+++ b/modelopt/torch/nas/modules/conv.py
@@ -251,6 +251,4 @@ def _estimate_importance(self) -> TracedHp.Importance:
             return None
         weight = self._parameters["weight"]  # retrieve full weight tensor
         c_in = weight.shape[0]
-        return torch.linalg.vector_norm(
-            torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
-        )
+        return torch.linalg.vector_norm(torch.reshape(weight.detach(), (c_in, -1)), dim=1)
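
For reference, `torch.norm(x, dim=1)` and `torch.linalg.vector_norm(x, dim=1)` both reduce to the same per-row L2 norm, so the replacement in the two patches above does not change the computed importance values. A minimal sketch, assuming an arbitrary example weight shape (not taken from modelopt):

    import torch

    # Hypothetical Conv2d-style weight (c_out, c_in, kH, kW); the shape is illustrative only.
    weight = torch.randn(16, 8, 3, 3)
    c_in = weight.shape[1]

    # Deprecated call removed by the patch vs. its torch.linalg replacement.
    old = torch.norm(torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1)
    new = torch.linalg.vector_norm(
        torch.reshape(weight.detach().transpose(0, 1), (c_in, -1)), dim=1
    )
    assert torch.allclose(old, new)  # both are per-input-channel L2 norms

Note that PATCH 2/5 also drops the `.transpose(0, 1)` that PATCH 1/5 introduced in the second conv hunk, restoring the original reshape for that branch.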
From 0d02d184801d52ce26044480b6713e3f025c1920 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Wed, 18 Jun 2025 15:29:10 +0900
Subject: [PATCH 3/5] update Ln-norm logic for upcoming PyTorch update (#206)

---
 modelopt/torch/nas/plugins/megatron.py     | 14 +++++++++-----
 modelopt/torch/nas/plugins/transformers.py |  4 +++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index 30eb01507..abe4aa4a2 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -622,11 +622,15 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
     def _estimate_query_group_importance(self) -> TracedHp.Importance:
         """Return the importance of the ``num_query_groups`` hparam."""
         assert self._activations is not None, "No activations collected for importance estimation."
-        group_importance = self._activations.view(
-            self.get_hparam("num_heads_per_group").max,
-            self.get_hparam("num_query_groups").max,
-            self.config.kv_channels,
-        ).norm(p=2, dim=(0, 2))
+        group_importance = torch.linalg.norm(
+            self._activations.view(
+                self.get_hparam("num_heads_per_group").max,
+                self.get_hparam("num_query_groups").max,
+                self.config.kv_channels,
+            ),
+            ord=2,
+            dim=(0, 2),
+        )
         return group_importance
 
     def export(self) -> torch.nn.Module:
diff --git a/modelopt/torch/nas/plugins/transformers.py b/modelopt/torch/nas/plugins/transformers.py
index ad8dcebec..a7c8d95b7 100644
--- a/modelopt/torch/nas/plugins/transformers.py
+++ b/modelopt/torch/nas/plugins/transformers.py
@@ -122,7 +122,9 @@ def configure_qkv_out(self, q_name: str, k_name: str, v_name: str, out_name: str
         out.in_features = hp_hidden_dim
 
         assert isinstance(out, nn.Linear)
-        hp_hidden_dim.register_importance(lambda: out._parameters["weight"].detach().norm(dim=0))
+        hp_hidden_dim.register_importance(
+            lambda: torch.linalg.norm(out._parameters["weight"].detach(), dim=0)
+        )
 
     def modify(
         self, *, n_heads_ratio: tuple[float, ...] | None = None, n_heads_divisor: int = 1

From bb630dbc29197e99bf3e719f7101c2676aa520c8 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Thu, 19 Jun 2025 13:35:08 +0900
Subject: [PATCH 4/5] Replaces `torch.linalg.norm` with `torch.linalg.vector_norm` for vector-norm

`torch.linalg.norm` supports various calculations based on the `dim` parameter:

- If dim is an int, the vector norm will be computed.
- If dim is a 2-tuple, the matrix norm will be computed.
- If dim=None and ord=None, A will be flattened to 1D and the 2-norm of the resulting vector will be computed.
- If dim=None and ord!=None, A must be 1D or 2D.

Therefore, the vector norm is not computed when `dim` is a tuple.
(Nit: `torch.linalg.vector_norm` is more explicit for vector norms.)
---
 modelopt/torch/nas/plugins/megatron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index abe4aa4a2..57e783d4f 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -622,7 +622,7 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
     def _estimate_query_group_importance(self) -> TracedHp.Importance:
         """Return the importance of the ``num_query_groups`` hparam."""
         assert self._activations is not None, "No activations collected for importance estimation."
-        group_importance = torch.linalg.norm(
+        group_importance = torch.linalg.vector_norm(
             self._activations.view(
                 self.get_hparam("num_heads_per_group").max,
                 self.get_hparam("num_query_groups").max,
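
The distinction described in the PATCH 4/5 message is easy to see in isolation: with a 2-tuple `dim`, `torch.linalg.norm` computes a matrix norm over those dimensions (the largest singular value for `ord=2`), whereas `torch.linalg.vector_norm` flattens them and takes an elementwise 2-norm. A minimal sketch, with shapes chosen purely for illustration (not the modelopt hparam values):

    import torch

    x = torch.randn(4, 8, 16)

    # 2-tuple dim + ord=2 -> matrix (spectral) norm of each (4, 16) slice.
    mat = torch.linalg.norm(x, ord=2, dim=(0, 2))

    # vector_norm treats the same dims as one flattened vector -> sqrt of the sum of squares.
    vec = torch.linalg.vector_norm(x, ord=2, dim=(0, 2))

    print(mat.shape, vec.shape)      # both torch.Size([8])
    print(torch.allclose(mat, vec))  # False in general: they are different norms

This is why the hunk above switches to `torch.linalg.vector_norm`: the intended quantity is an L2 norm over the flattened head and channel dimensions, not a spectral norm.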

From 4e98a250aa6934d0ad01bd3b0b7d2215342de6c1 Mon Sep 17 00:00:00 2001
From: namgyu-youn
Date: Wed, 25 Jun 2025 11:44:01 +0900
Subject: [PATCH 5/5] update Ln-norm logic for upcoming PyTorch release

- `torch.norm` is deprecated in favor of `torch.linalg.norm` and `torch.linalg.vector_norm`.
---
 modelopt/torch/nas/plugins/megatron.py     | 2 +-
 modelopt/torch/nas/plugins/transformers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index 57e783d4f..e65cdcb9b 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -616,7 +616,7 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:
         attn_head_importance = self._activations.view(
             self.get_hparam("num_heads_per_group").max * self.get_hparam("num_query_groups").max,
             self.config.kv_channels,
-        ).norm(p=2, dim=1)
+        ).vector_norm(ord=2, dim=1)
         return attn_head_importance
 
     def _estimate_query_group_importance(self) -> TracedHp.Importance:
diff --git a/modelopt/torch/nas/plugins/transformers.py b/modelopt/torch/nas/plugins/transformers.py
index a7c8d95b7..61d5cd5d6 100644
--- a/modelopt/torch/nas/plugins/transformers.py
+++ b/modelopt/torch/nas/plugins/transformers.py
@@ -123,7 +123,7 @@ def configure_qkv_out(self, q_name: str, k_name: str, v_name: str, out_name: str
 
         assert isinstance(out, nn.Linear)
         hp_hidden_dim.register_importance(
-            lambda: torch.linalg.norm(out._parameters["weight"].detach(), dim=0)
+            lambda: torch.linalg.vector_norm(out._parameters["weight"].detach(), dim=0)
         )
 
     def modify(
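
For context on the megatron hunk in the last patch: the collected activations are viewed as one row per attention head, and each head is scored by the L2 norm of its row. A minimal sketch of that pattern using the `torch.linalg.vector_norm` function form, with made-up placeholder sizes rather than modelopt's real configuration:

    import torch

    num_heads_per_group, num_query_groups, kv_channels = 4, 2, 64  # placeholders
    activations = torch.randn(num_heads_per_group * num_query_groups * kv_channels)

    # One L2 score per attention head; shape (num_heads_per_group * num_query_groups,).
    attn_head_importance = torch.linalg.vector_norm(
        activations.view(num_heads_per_group * num_query_groups, kv_channels),
        ord=2,
        dim=1,
    )
    print(attn_head_importance.shape)  # torch.Size([8])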