From 3122842958e964d0b273570857801051d0359cb7 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 3 Dec 2025 13:40:56 +0400 Subject: [PATCH 01/30] enable awq --- .../algorithms/weight_compression/awq.py | 71 +++++++++++-------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 508ad57060d..5c8f3000a81 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -183,8 +183,8 @@ def apply( prev_statistics = statistics[merge_node.node_name] scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics) - w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0]) - a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0]) + w_scale = fns.unsqueeze(scale, -1 - wp.reduction_axes[0]) + a_scale = fns.unsqueeze(1.0 / scale, -wp.reduction_axes[0]) scaled_weight = (weight * w_scale).astype(weight_dtype) self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight) @@ -194,9 +194,9 @@ def apply( merge_weight = self._backend_entity.get_weight(merge_node, port_id, model, graph) merge_weight = (merge_weight * a_scale).astype(weight_dtype) self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight) - a_scale = fns.transpose(a_scale) + a_scale = fns.moveaxis(a_scale, -1, -2) else: # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node - a_scale = fns.transpose(a_scale).astype(weight_dtype) + a_scale = fns.moveaxis(a_scale, -1, -2).astype(weight_dtype) next_nodes = graph.get_next_nodes(merge_node) source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id scale_insertion_command = self._backend_entity.scale_insertion_command( @@ -217,6 +217,8 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis s = s.astype(TensorDataType.float32) X = X.astype(TensorDataType.float32) + is_2d_weight = weight.ndim == 2 + assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1 reduction_axis = wp.reduction_axes[0] @@ -224,42 +226,54 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis if prev_statistics is not None and prev_weight is not None: prev_s, _ = process_stats(prev_statistics, self._subset_size) prev_s = prev_s.astype(TensorDataType.float32).max().item() - prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis) + prev_weight = fns.unsqueeze(prev_weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] + prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) + + if is_2d_weight: + s = fns.unsqueeze(s, 0) # [hidden_dim] -> [1, hidden_dim] + X = fns.unsqueeze(X, 0) # [hidden_dim, samples] -> [1, hidden_dim, samples] + weight = fns.unsqueeze(weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] + reduction_axis += 1 - top_k = max(int(s.shape[0] * self._percent_to_apply), 1) - topk_idxs = fns.argsort(-s)[:top_k] + top_k = max(int(s.shape[-1] * self._percent_to_apply), 1) + topk_idxs = fns.argsort(-s)[:, :top_k] group_size = config.group_size if group_size == -1: - group_size = s.shape[0] + group_size = s.shape[-1] groups_to_correct = set() - for idx in topk_idxs: - groups_to_correct.add(idx.data // group_size) + for expert_idx in range(topk_idxs.shape[0]): + for k_idx in range(topk_idxs.shape[1]): + idx = topk_idxs[expert_idx, k_idx].item() + 
group_idx = idx // group_size + groups_to_correct.add((expert_idx, group_idx)) groups_to_correct = list(groups_to_correct) - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 + if reduction_axis == 1: + # Weights + # 3D: [num_experts, hidden_dimension, out_features] -> [num_experts, out_features, hidden_dimension] + # 2D: [1, hidden_dimension, out_features] -> [1, out_features, hidden_dimension] + weight = fns.moveaxis(weight, -1, -2) + reduction_axis = weight.ndim - 1 - shape_vector = fns.mean(X, axis=1) + shape_vector = fns.mean(X, axis=-1) scale = fns.ones_like(shape_vector) awq_config = deepcopy(config) awq_config.group_size = -1 - for gi in groups_to_correct: + for expert_idx, gi in groups_to_correct: offset = gi * group_size - gscale = s[offset : offset + group_size] + gscale = s[expert_idx, offset : offset + group_size] + gweight = weight[expert_idx, :, offset : offset + group_size] + gacts = X[expert_idx, offset : offset + group_size, :] a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) a_max = 1e2 gscale = fns.clip(gscale, a_min=a_min, a_max=a_max) - gweight = weight[:, offset : offset + group_size] - gacts = X[offset : offset + group_size, :] - fp32_out = fns.matmul(gweight, gacts) min_diff = fns.max(fns.abs(fp32_out)) best_scale = None @@ -275,14 +289,16 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis # per channel magnitudes for the previous MatMul # mean(abs(prev_weight)) * max(abs((prev_activation))) * prev_weight.shape[reduction_axis] magnitudes = ( - (prev_w[offset : offset + group_size] / cur_scale) * prev_s * prev_weight.shape[reduction_axis] + (prev_w[expert_idx, offset : offset + group_size] / cur_scale) + * prev_s + * prev_weight.shape[reduction_axis] ) if magnitudes.max() >= threshold: cur_scale = AWQ._clamp_scale( magnitudes, threshold, cur_scale, - prev_w[offset : offset + group_size] + prev_w[expert_idx, offset : offset + group_size] * prev_s * prev_weight.shape[reduction_axis] / threshold, @@ -290,13 +306,9 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis weights_to_fake_quantize = gweight * cur_scale if not config.is_integer: - g_decompressed_weighs = float_quantize_dequantize_weight( - weights_to_fake_quantize, awq_config, reduction_axis - ) + g_decompressed_weighs = float_quantize_dequantize_weight(weights_to_fake_quantize, awq_config, -1) else: - g_decompressed_weighs = integer_quantize_dequantize_weight( - weights_to_fake_quantize, awq_config, reduction_axis - ) + g_decompressed_weighs = integer_quantize_dequantize_weight(weights_to_fake_quantize, awq_config, -1) sacts = gacts / fns.unsqueeze(cur_scale, 1) cur_out = fns.matmul(g_decompressed_weighs, sacts) @@ -307,7 +319,10 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis alpha += alpha_step if best_scale is not None: - scale.data[offset : offset + group_size] = best_scale.data + scale.data[expert_idx, offset : offset + group_size] = best_scale.data + + if is_2d_weight: + scale = fns.squeeze(scale, 0) # [1, hidden_dim] -> [hidden_dim] return scale From 59033c1ba1c3f4b42c4ad8c019d91bd181e4a6c3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 16:29:22 +0400 Subject: [PATCH 02/30] update scale unsqueeze logic --- .../algorithms/weight_compression/awq.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py 
b/src/nncf/quantization/algorithms/weight_compression/awq.py index 5c8f3000a81..a54289cbe70 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -183,8 +183,17 @@ def apply( prev_statistics = statistics[merge_node.node_name] scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics) - w_scale = fns.unsqueeze(scale, -1 - wp.reduction_axes[0]) - a_scale = fns.unsqueeze(1.0 / scale, -wp.reduction_axes[0]) + # For 3D weights, len(scale.shape)%2 == 0 whereas for 2D it is 1. This allows us to index + # from the last dim and not consider the batch dim in 3D case. + # Example: + # 3D weights: W shape = [B, M, N]; reduction_axes = 2 + # scale_shape = [M, N] -> len(scale.shape) = 2 -> 2 % 2 = 0 + # unsqueeze scale at -(0 + 2) = -2. + # 2D weights: W shape = [M, N]; reduction_axes = 1 + # scale_shape = [M] -> len(scale.shape) = 1 -> 1 % 2 = 1 + # unsqueeze scale at -(1 + 1) = -2. + w_scale = fns.unsqueeze(scale, -(len(scale.shape) % 2 + wp.reduction_axes[0])) + a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0]) scaled_weight = (weight * w_scale).astype(weight_dtype) self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight) @@ -243,11 +252,11 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis group_size = s.shape[-1] groups_to_correct = set() - for expert_idx in range(topk_idxs.shape[0]): + for batch_idx in range(topk_idxs.shape[0]): for k_idx in range(topk_idxs.shape[1]): - idx = topk_idxs[expert_idx, k_idx].item() + idx = topk_idxs[batch_idx, k_idx].item() group_idx = idx // group_size - groups_to_correct.add((expert_idx, group_idx)) + groups_to_correct.add((batch_idx, group_idx)) groups_to_correct = list(groups_to_correct) @@ -264,11 +273,11 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis awq_config = deepcopy(config) awq_config.group_size = -1 - for expert_idx, gi in groups_to_correct: + for batch_idx, gi in groups_to_correct: offset = gi * group_size - gscale = s[expert_idx, offset : offset + group_size] - gweight = weight[expert_idx, :, offset : offset + group_size] - gacts = X[expert_idx, offset : offset + group_size, :] + gscale = s[batch_idx, offset : offset + group_size] + gweight = weight[batch_idx, :, offset : offset + group_size] + gacts = X[batch_idx, offset : offset + group_size, :] a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) a_max = 1e2 @@ -289,7 +298,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis # per channel magnitudes for the previous MatMul # mean(abs(prev_weight)) * max(abs((prev_activation))) * prev_weight.shape[reduction_axis] magnitudes = ( - (prev_w[expert_idx, offset : offset + group_size] / cur_scale) + (prev_w[batch_idx, offset : offset + group_size] / cur_scale) * prev_s * prev_weight.shape[reduction_axis] ) @@ -298,7 +307,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis magnitudes, threshold, cur_scale, - prev_w[expert_idx, offset : offset + group_size] + prev_w[batch_idx, offset : offset + group_size] * prev_s * prev_weight.shape[reduction_axis] / threshold, @@ -319,7 +328,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis alpha += alpha_step if best_scale is not None: - scale.data[expert_idx, offset : offset + group_size] = best_scale.data + scale.data[batch_idx, offset : offset + group_size] = 
best_scale.data if is_2d_weight: scale = fns.squeeze(scale, 0) # [1, hidden_dim] -> [hidden_dim] From 608b2e2e62c167385e9061d3c78c22068d109a35 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:14:14 +0400 Subject: [PATCH 03/30] mergeable fix --- .../algorithms/weight_compression/awq.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index a54289cbe70..ba5cfb43200 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -231,19 +231,21 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1 reduction_axis = wp.reduction_axes[0] - prev_s, prev_w = None, None - if prev_statistics is not None and prev_weight is not None: - prev_s, _ = process_stats(prev_statistics, self._subset_size) - prev_s = prev_s.astype(TensorDataType.float32).max().item() - prev_weight = fns.unsqueeze(prev_weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] - prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) - if is_2d_weight: s = fns.unsqueeze(s, 0) # [hidden_dim] -> [1, hidden_dim] X = fns.unsqueeze(X, 0) # [hidden_dim, samples] -> [1, hidden_dim, samples] weight = fns.unsqueeze(weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] + prev_weight = ( + fns.unsqueeze(prev_weight, 0) if prev_weight else None + ) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] reduction_axis += 1 + prev_s, prev_w = None, None + if prev_statistics is not None and prev_weight is not None: + prev_s, _ = process_stats(prev_statistics, self._subset_size) + prev_s = prev_s.astype(TensorDataType.float32).max().item() + prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) + top_k = max(int(s.shape[-1] * self._percent_to_apply), 1) topk_idxs = fns.argsort(-s)[:, :top_k] From 1948c959d7dca28ff9c5773fec83711d5e5a2299 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:24:10 +0400 Subject: [PATCH 04/30] fix the fix --- src/nncf/quantization/algorithms/weight_compression/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index ba5cfb43200..e36915eb3c4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -244,7 +244,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis if prev_statistics is not None and prev_weight is not None: prev_s, _ = process_stats(prev_statistics, self._subset_size) prev_s = prev_s.astype(TensorDataType.float32).max().item() - prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) + prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis) top_k = max(int(s.shape[-1] * self._percent_to_apply), 1) topk_idxs = fns.argsort(-s)[:, :top_k] From b5bbe159f993d38f65dc56d704510b7ddf1913fb Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:42:19 +0400 Subject: [PATCH 05/30] add ignored nodes awq test for torch and torch fx --- .../template_test_weights_compression.py | 22 ++++++---- .../quantization/test_weights_compression.py | 40 +++++++++++++++++-- tests/torch2/fx/test_compress_weights.py | 16 +++++--- 3 files 
changed, 61 insertions(+), 17 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index ec5c1a4e710..d63415f6cd0 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -382,8 +382,11 @@ def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int @staticmethod @abstractmethod - def get_awq_model() -> TModel: - "Returns a backend model for test_awq_with_ignored_scope." + def get_awq_model(is_3d_weights) -> TModel: + """ + Returns a backend model for test_awq_with_ignored_scope. + :param is_3d_weights: Whether the model has 3D weights. + """ @staticmethod @abstractmethod @@ -402,16 +405,19 @@ def get_num_int4_group_sizes(model: TModel) -> dict[int, int]: @staticmethod @abstractmethod - def get_ignored_scope_name() -> str: + def get_ignored_scope_name(is_3d_weights) -> str: "Returns ignored scope name for test_awq_with_ignored_scope." - def test_awq_with_ignored_scope(self, mocker): - model = self.get_awq_model() + @pytest.mark.parametrize("is_3d_weights", [True, False]) + def test_awq_with_ignored_scope(self, mocker, is_3d_weights): + model = self.get_awq_model(is_3d_weights) sz = 8 n_samples = 10 + input_shape = [2, 8, sz] + dataset = Dataset( - [self.to_tensor(np.ones([1, 8, sz], dtype=np.float32)) for i in range(n_samples)], + [self.to_tensor(np.ones(input_shape, dtype=np.float32)) for i in range(n_samples)], self.get_transform_func(), ) @@ -423,12 +429,12 @@ def test_awq_with_ignored_scope(self, mocker): group_size=-1, dataset=dataset, awq=True, - ignored_scope=IgnoredScope(names=[self.get_ignored_scope_name()]), + ignored_scope=IgnoredScope(names=[self.get_ignored_scope_name(is_3d_weights)]), ) int4_ref_num_compressed = 4 # last MatMul is always int8; one - is ignored; total 6 matmuls int4_num_nodes = self.get_num_int4_nodes(compressed_model) - assert int4_num_nodes == int4_ref_num_compressed + assert int4_num_nodes == int4_ref_num_compressed, int4_num_nodes def test_rope_weight_compression(self): model = self.get_RoPE_model() diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index d7db382df83..12fa717d441 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -213,6 +213,34 @@ def forward(self, x): return node6 +class AWQLinearModel3D(nn.Module): + def __init__(self, is_int8=False): + super().__init__() + self.is_int8 = is_int8 + + weight_data = 0.01 * torch.arange(0, 2 * 8 * 8).reshape(2, 8, 8) + 0.05 + + self.w1 = nn.Parameter(weight_data) + self.w2 = nn.Parameter(weight_data) + self.w3 = nn.Parameter(weight_data) + self.w4 = nn.Parameter(weight_data) + self.w5 = nn.Parameter(weight_data) + self.w6 = nn.Parameter(weight_data) + + def forward(self, x): + node1 = torch.bmm(x, self.w1) + node2 = torch.bmm(x, self.w2) + node_multiply = node1 * node2 + + node3 = torch.bmm(node_multiply, self.w3) + node4 = torch.bmm(node3, self.w4) + node5 = torch.bmm(node3, self.w5) + node_multiply_2 = node4 * node5 + + node6 = torch.bmm(node_multiply_2, self.w6) + return node6 + + class FunctionalModel(torch.nn.Module): def __init__(self): super().__init__() @@ -516,8 +544,10 @@ def get_moe_model_for_test_scale_estimation(): return model @staticmethod - def get_awq_model() -> 
torch.nn.Module: - return AWQLinearModel() + def get_awq_model(is_3d_weights) -> torch.nn.Module: + if not is_3d_weights: + return AWQLinearModel() + return AWQLinearModel3D() @staticmethod def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Module: @@ -715,8 +745,10 @@ def get_decompressed_weight(compressed_model: torch.nn.Module, input: torch.Tens return Tensor(unpacked_w) @staticmethod - def get_ignored_scope_name() -> str: - return "linear5/linear/0" + def get_ignored_scope_name(is_3d_weights) -> str: + if not is_3d_weights: + return "linear5/linear/0" + return "/bmm/4" @staticmethod def get_num_int4_nodes(model: torch.nn.Module) -> int: diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index 082edaea787..d839242d9b8 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -40,6 +40,7 @@ from tests.torch2.function_hook.quantization.test_weights_compression import UNSUPPORTED_MODES from tests.torch2.function_hook.quantization.test_weights_compression import AWQActLinearModel from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel +from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel3D from tests.torch2.function_hook.quantization.test_weights_compression import ConvolutionModel from tests.torch2.function_hook.quantization.test_weights_compression import DifferentChannelSizeMatmulModel from tests.torch2.function_hook.quantization.test_weights_compression import DTypeModel @@ -364,10 +365,13 @@ def get_moe_model_for_test_scale_estimation(): return exported_model @staticmethod - def get_awq_model() -> torch.fx.GraphModule: - model = AWQLinearModel() + def get_awq_model(is_3d_weights) -> torch.fx.GraphModule: + if not is_3d_weights: + model = AWQLinearModel() + else: + model = AWQLinearModel3D() dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]] - ex_input = torch.ones([1, 4, 8], dtype=torch.float32) + ex_input = torch.ones([2, 4, 8], dtype=torch.float32) exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes) return exported_model @@ -574,8 +578,10 @@ def get_decompressed_weight(compressed_model: torch.fx.GraphModule, input: torch return Tensor(unpacked_w) @staticmethod - def get_ignored_scope_name() -> str: - return "linear_4" + def get_ignored_scope_name(is_3d_weights) -> str: + if not is_3d_weights: + return "linear_4" + return "bmm_4" @staticmethod def get_num_int4_nodes(model: torch.fx.GraphModule) -> int: From 10bdd6fe1f5994396948332e452dfefba640ea87 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:52:52 +0400 Subject: [PATCH 06/30] add remaining awq tests --- .../template_test_weights_compression.py | 7 +++-- .../quantization/test_weights_compression.py | 30 +++++++++++++++++-- tests/torch2/fx/test_compress_weights.py | 30 +++++++++++++++++-- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index d63415f6cd0..b2bcee4da2e 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -482,11 +482,12 @@ def test_sam_pe_weight_compression(self): def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: "Returns reference for test_awq_scale_reference." 
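Note on the reference scales exercised by `test_awq_scale_reference` below: they rest on the core AWQ identity that folding a per-channel scale s into the weight's hidden dimension and 1/s into the matching activation channels leaves the MatMul output numerically unchanged, for 2D weights and batched 3D weights alike. A minimal numpy sketch of that invariant (illustrative shapes only, not the NNCF `fns` API):

```python
import numpy as np

rng = np.random.default_rng(0)
w = rng.normal(size=(2, 8, 8)).astype(np.float32)  # [batch, out_features, hidden_dim]
x = rng.normal(size=(2, 8, 4)).astype(np.float32)  # [batch, hidden_dim, samples]
s = rng.uniform(0.5, 2.0, size=(2, 8)).astype(np.float32)  # per-channel AWQ scale

w_scaled = w * s[:, None, :]  # fold s into the weight's hidden_dim channels
x_scaled = x / s[:, :, None]  # fold the inverse scale into the activation side
assert np.allclose(w @ x, w_scaled @ x_scaled, atol=1e-4)  # output is preserved
```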
- def test_awq_scale_reference(self, monkeypatch, mocker): + @pytest.mark.parametrize("is_3d_weights", [True, False]) + def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights): monkeypatch.setattr("nncf.quantization.algorithms.weight_compression.algorithm.AWQ", SpyAWQ) - model = self.get_awq_model() + model = self.get_awq_model(is_3d_weights) - input = 0.01 * np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) + 0.02 + input = 0.01 * np.arange(0, 2 * 4 * 8, dtype=np.float32).reshape(2, 4, 8) + 0.02 input = self.to_tensor(input) dataset = Dataset([input], self.get_transform_func()) diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 12fa717d441..d73c2f41a9b 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -781,8 +781,34 @@ def get_num_multiply_from_awq(model): def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: return { "linear3/linear/0": Tensor( - torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]) - ) + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + "/bmm/2": Tensor( + torch.tensor( + [ + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), } diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index d839242d9b8..70bf272752d 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -620,6 +620,32 @@ def get_num_multiply_from_awq(model): def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: return { "linear_2": Tensor( - torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]) - ) + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + "bmm_2": Tensor( + torch.tensor( + [ + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), } From 9f511a4748b8b93e589a09813cb3f5cc0211880d Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Sun, 7 Dec 2025 13:50:54 +0400 Subject: [PATCH 07/30] Update awq.py --- src/nncf/quantization/algorithms/weight_compression/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index e36915eb3c4..0c7645841a1 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -236,7 +236,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis X = fns.unsqueeze(X, 0) # [hidden_dim, samples] -> [1, hidden_dim, samples] weight = fns.unsqueeze(weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] prev_weight = ( - fns.unsqueeze(prev_weight, 0) if prev_weight else None + fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None ) # [out_features, hidden_dim] -> [1, out_features, 
hidden_dim] reduction_axis += 1 From b62074f0bc043af382514c6cf1248b556f042f8c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 07:37:14 +0400 Subject: [PATCH 08/30] add 3d matmul model to onnx --- tests/onnx/quantization/test_weights_compression.py | 9 +++++++-- .../native/quantization/test_weights_compression.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index ddf441272b5..2121f6a0e54 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math from collections import defaultdict from dataclasses import dataclass from typing import Any, Callable, Optional @@ -696,7 +697,7 @@ def get_num_multiply_from_awq(model: onnx.ModelProto) -> int: return awq_num @staticmethod - def get_awq_model() -> onnx.ModelProto: + def get_awq_model(is_3d_weights) -> onnx.ModelProto: """ Builds a model to be used in the following tests: - TemplateWeightCompression.test_awq_with_ignored_scope() @@ -709,7 +710,11 @@ def get_awq_model() -> onnx.ModelProto: x = mb.add_input("input", (1, None, 8)) output = mb.add_output("output", (1, None, 8)) - w_data = 0.01 * np.arange(0, 64, dtype=np.float32).reshape(8, 8) + 0.05 + weight_shape = (8, 8) + if is_3d_weights: + weight_shape = (2, 8, 8) + + w_data = 0.01 * np.arange(0, math.prod(weight_shape), dtype=np.float32).reshape(weight_shape) + 0.05 w_data = w_data.T num_blocks = 2 diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index c21e5b7d46f..9bf7ab9b3b6 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2083,7 +2083,7 @@ def get_moe_model_for_test_scale_estimation(): return SimpleMoEModel().ov_model @staticmethod - def get_awq_model() -> ov.Model: + def get_awq_model(is_3d_weights) -> ov.Model: return AWQMatmulModel().ov_model @staticmethod From b364b4def3ff39e89407e903883ff89254105d9f Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 08:10:45 +0400 Subject: [PATCH 09/30] fix onnx model test --- tests/onnx/quantization/test_weights_compression.py | 11 ++++++----- tests/torch2/fx/test_compress_weights.py | 5 ++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index 2121f6a0e54..e572dec13eb 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -707,12 +707,13 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto: """ mb = ModelBuilder() - x = mb.add_input("input", (1, None, 8)) - output = mb.add_output("output", (1, None, 8)) - weight_shape = (8, 8) if is_3d_weights: - weight_shape = (2, 8, 8) + # The first and last dimension are later transposed + weight_shape = (8, 8, 2) + + x = mb.add_input("input", (2, None, 8)) + output = mb.add_output("output", (2, None, 8)) w_data = 0.01 * np.arange(0, math.prod(weight_shape), dtype=np.float32).reshape(weight_shape) + 0.05 w_data = w_data.T @@ -765,7 +766,7 @@ def get_num_int4_group_sizes(model: onnx.ModelProto) -> dict[int, int]: return num @staticmethod - def get_ignored_scope_name() -> str: + def 
get_ignored_scope_name(is_3d_weights) -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) @staticmethod diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index 70bf272752d..ab2870f9754 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -366,9 +366,8 @@ def get_moe_model_for_test_scale_estimation(): @staticmethod def get_awq_model(is_3d_weights) -> torch.fx.GraphModule: - if not is_3d_weights: - model = AWQLinearModel() - else: + model = AWQLinearModel() + if is_3d_weights: model = AWQLinearModel3D() dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]] ex_input = torch.ones([2, 4, 8], dtype=torch.float32) From 42496265a41a690fc17e545884f53d9d436e6569 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 08:30:48 +0400 Subject: [PATCH 10/30] fix some tests --- .../template_test_weights_compression.py | 4 +- .../quantization/test_weights_compression.py | 49 +++++++++++--- .../quantization/test_weights_compression.py | 66 ++++++++++--------- tests/torch2/fx/test_compress_weights.py | 66 ++++++++++--------- 4 files changed, 112 insertions(+), 73 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index b2bcee4da2e..b75cc11ba3e 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -479,7 +479,7 @@ def test_sam_pe_weight_compression(self): @staticmethod @abstractmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: "Returns reference for test_awq_scale_reference." @pytest.mark.parametrize("is_3d_weights", [True, False]) @@ -502,7 +502,7 @@ def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights): ) assert spy_instance is not None for node_name, scales in spy_instance._scale_per_target_node.items(): - assert fns.allclose(scales, self.get_reference_for_test_awq_scale_reference()[node_name]) + assert fns.allclose(scales, self.get_reference_for_test_awq_scale_reference(is_3d_weights)[node_name]) @pytest.mark.parametrize( ["group_size", "fallback_mode", "min_adjusted_group_size", "expected_outcome"], diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index e572dec13eb..bf3e4c3b539 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -770,15 +770,46 @@ def get_ignored_scope_name(is_3d_weights) -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) 
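A note on the `(8, 8, 2)` weight shape used in the ONNX builders above: numpy's `.T` reverses all axes of an N-D array, so the transposed constant comes out in the `[2, 8, 8]` batch-first layout the 3D MatMuls expect. A quick illustration:

```python
import numpy as np

w = 0.01 * np.arange(2 * 8 * 8, dtype=np.float32).reshape(8, 8, 2) + 0.05
assert w.T.shape == (2, 8, 8)  # .T reverses every axis, not just the last two
```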
@staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "MatMul_3": Tensor( - np.array( - [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], - dtype=np.float32, - ).T - ) - } + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "MatMul_3": Tensor( + np.array( + [[1.4228648, 1.3474456, 1.1335096, 1.001522, 0.90938693, 0.84022623, 0.78575736, 0.7413683]], + dtype=np.float32, + ).T + ) + }, + { + "MatMul_3": Tensor( + np.array( + [ + [ + [1.119726], + [1.1012304], + [1.0438583], + [1.006067], + [0.97812414], + [0.95607865], + [0.9379444], + [0.922586], + ], + [ + [0.99698645], + [0.9808075], + [0.9307146], + [0.8974796], + [0.87281394], + [0.8533093], + [0.8372402], + [0.82361573], + ], + ], + dtype=np.float32, + ) + ) + }, + ][is_3d_weights] @staticmethod def get_transform_func() -> Optional[Callable[..., Any]]: diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index d73c2f41a9b..0ede375f739 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -778,38 +778,42 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "linear3/linear/0": Tensor( - torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) - ), - "/bmm/2": Tensor( - torch.tensor( - [ + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "linear3/linear/0": Tensor( + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + }, + { + "/bmm/2": Tensor( + torch.tensor( [ - [1.109999], - [1.108342], - [1.102878], - [1.097587], - [1.092457], - [1.087481], - [1.082649], - [1.077955], - ], - [ - [0.130212], - [0.129630], - [0.127712], - [0.125842], - [0.124017], - [0.122236], - [0.120498], - [0.118800], - ], - ] - ) - ), - } + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), + }, + ][is_3d_weights] @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index ab2870f9754..e7788ec365d 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -616,35 +616,39 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "linear_2": Tensor( - torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) - ), - "bmm_2": Tensor( - torch.tensor( - [ + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "linear_2": Tensor( + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + }, + { + "bmm_2": Tensor( + torch.tensor( [ - [1.109999], - [1.108342], - [1.102878], - [1.097587], - [1.092457], - [1.087481], - [1.082649], - [1.077955], - ], - [ - [0.130212], - [0.129630], - [0.127712], - [0.125842], - [0.124017], 
- [0.122236], - [0.120498], - [0.118800], - ], - ] - ) - ), - } + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), + }, + ][is_3d_weights] From cc441791a1b407508778ff15ce2bc86b7a85933b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 14:23:22 +0400 Subject: [PATCH 11/30] add ov model --- .../quantization/test_weights_compression.py | 43 +++++++++++++++---- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 9bf7ab9b3b6..b4092cd85f4 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -64,6 +64,7 @@ from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel +from tests.openvino.native.models import AWQMatmulModel3D from tests.openvino.native.models import AWQModel_fp16_overlow from tests.openvino.native.models import DifferentChannelSizeMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData @@ -2084,6 +2085,8 @@ def get_moe_model_for_test_scale_estimation(): @staticmethod def get_awq_model(is_3d_weights) -> ov.Model: + if is_3d_weights: + return AWQMatmulModel3D().ov_model return AWQMatmulModel().ov_model @staticmethod @@ -2322,12 +2325,36 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "MatMul_3": Tensor( - np.array( - [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], - dtype=np.float32, + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "MatMul_3": Tensor( + np.array( + [[1.4228648, 1.3474456, 1.1335096, 1.001522, 0.90938693, 0.84022623, 0.78575736, 0.7413683]], + dtype=np.float32, + ) ) - ) - } + }, + { + "MatMul_3": Tensor( + np.array( + [ + [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], + [ + [ + 0.46889508, + 0.4599662, + 0.4321173, + 0.40815368, + 0.387274, + 0.36888793, + 0.35255024, + 0.33791822, + ] + ], + ], + dtype=np.float32, + ) + ) + }, + ][is_3d_weights] From a934313c2c941a747ffbe40f8cffd87cbb42682e Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 15:27:13 +0400 Subject: [PATCH 12/30] add model --- tests/openvino/native/models.py | 43 ++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 0e0ef99b40a..d2651491761 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1004,7 +1004,7 @@ def get_weights(weights_data, is_int8, name): return (qw - zp) * scale def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): - input_node = opset.parameter([1] * n_extra_dims + [-1, 8], name="Input_1") + input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1") weights_data1 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 weights1 = self.get_weights(weights_data1, is_int8, name="weights_1") @@ -1040,6 +1040,47 @@ def _create_ov_model(self, n_extra_dims: int = 1, 
is_int8=False): return model +class AWQMatmulModel3D(OVReferenceModel): + """ + 3D-weights version of AWQMatmulModel. + All weight tensors are [2, 8, 8]; input is [2, L, 8]. + """ + + def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): + input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1") + + def make_weights(name): + w = 0.01 * np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) + 0.05 + return opset.constant(w, dtype=np.float32, name=name) + + weights1 = make_weights("weights_1") + node1 = opset.matmul(input_node, weights1, transpose_a=False, transpose_b=True, name="MatMul_1") + + weights2 = make_weights("weights_2") + node2 = opset.matmul(input_node, weights2, transpose_a=False, transpose_b=True, name="MatMul_2") + + node_multiply = opset.multiply(node1, node2, name="Multiply") + + weights3 = make_weights("weights_3") + node3 = opset.matmul(node_multiply, weights3, transpose_a=False, transpose_b=True, name="MatMul_3") + + weights4 = make_weights("weights_4") + node4 = opset.matmul(node3, weights4, transpose_a=False, transpose_b=True, name="MatMul_4") + + weights5 = make_weights("weights_5") + node5 = opset.matmul(node3, weights5, transpose_a=False, transpose_b=True, name="MatMul_5") + + node_multiply_2 = opset.multiply(node4, node5, name="Multiply_2") + + weights6 = make_weights("weights_6") + node6 = opset.matmul(node_multiply_2, weights6, transpose_a=False, transpose_b=True, name="MatMul_6") + + result = opset.result(node6, name="Result") + result.get_output_tensor(0).set_names(set(["Result"])) + model = ov.Model([result], [input_node]) + return model + + class AWQActMatmulModel(OVReferenceModel): """ Model for testing AWQ algorithm. Contains MatMul->Multiply->MatMul pattern. From 48c499c61c085d261cbf3a97c9dd29101fa99067 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 15:45:04 +0400 Subject: [PATCH 13/30] xfail openvino test --- .../quantization/test_weights_compression.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b4092cd85f4..c8942436276 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2272,7 +2272,19 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): @pytest.mark.parametrize("is_moe", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]) @pytest.mark.parametrize("check_sampling_activation_stats_flow", [False, True]) def test_scale_estimation(self, mocker, is_moe, check_sampling_activation_stats_flow): - super().test_scale_estimation(mocker, is_moe, check_sampling_activation_stats_flow) + return super().test_scale_estimation(mocker, is_moe, check_sampling_activation_stats_flow) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + def test_awq_with_ignored_scope(self, mocker, is_3d_weights): + return super().test_awq_with_ignored_scope(mocker, is_3d_weights) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights): + return super().test_awq_scale_reference(monkeypatch, mocker, is_3d_weights) @staticmethod def get_orig_weight(model: ov.Model) -> Tensor: From ba8b725b5f392b16d76e791a5265c7e0a5f21cad Mon Sep 
17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 16:18:07 +0400 Subject: [PATCH 14/30] fix condition for is_mergeable --- .../algorithms/weight_compression/awq.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 0c7645841a1..8a2def2d8f9 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -159,17 +159,25 @@ def apply( weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm continue - is_mergeable = self._backend_entity.is_node_with_weights(merge_node, graph) - - nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") - _, weight_port_id = weight_data[0] + weight = self._backend_entity.get_weight( wp.node_with_weight, weight_port_id, model, graph ) # get_const_value(wp.weight_node) weight_dtype = weight.dtype weight = weight.astype(TensorDataType.float32) + # returns an empty list if no weights are present + mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph) + merge_node_weight_dims = [ + len(self._backend_entity.get_weight_shape(merge_node, port_id, graph)) + for _, port_id in mergeable_node_weight_data + ] + # if no weights are present, it checks membership with empty list which is False. + is_mergeable = len(weight.shape) in merge_node_weight_dims + + nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") + if is_data_free: scale = self._data_free_step(weight, 1 - wp.reduction_axes[0]) else: From 667716e8c5fc364863593425d4988679dd6022a2 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 16:41:38 +0400 Subject: [PATCH 15/30] fix mergeable issue --- .../algorithms/weight_compression/awq.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 8a2def2d8f9..8a4fcbd64c7 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -167,14 +167,14 @@ def apply( weight_dtype = weight.dtype weight = weight.astype(TensorDataType.float32) - # returns an empty list if no weights are present - mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph) - merge_node_weight_dims = [ - len(self._backend_entity.get_weight_shape(merge_node, port_id, graph)) - for _, port_id in mergeable_node_weight_data - ] - # if no weights are present, it checks membership with empty list which is False. 
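For context on the mergeability condition this hunk and the next one iterate on: the activation scale can only be folded into the preceding node when that node actually owns a weight of the same rank, and testing rank membership against an empty list naturally yields False when there are no weights. A standalone sketch of that predicate (hypothetical `can_merge` helper, not the backend API):

```python
def can_merge(weight_shape: tuple, merge_node_weight_shapes: list) -> bool:
    # Fold the scale into the producer only if it has a weight of the
    # same rank; an empty list (no weights) makes the membership test False.
    return len(weight_shape) in [len(shape) for shape in merge_node_weight_shapes]

assert can_merge((2, 8, 8), [(2, 8, 8)])  # 3D weight, 3D producer weight
assert not can_merge((2, 8, 8), [(8, 8)])  # rank mismatch: insert an extra scale node instead
assert not can_merge((8, 8), [])  # producer has no weights at all
```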
- is_mergeable = len(weight.shape) in merge_node_weight_dims + is_mergeable = False + if self._backend_entity.is_node_with_weights(merge_node, graph): + mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph) + merge_node_weight_dims = [ + len(self._backend_entity.get_weight_shape(merge_node, port_id, graph)) + for _, port_id in mergeable_node_weight_data + ] + is_mergeable = len(weight.shape) in merge_node_weight_dims nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") From 2122b11d186e957f7e6967c540fb5ddd745f272b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 16:45:25 +0400 Subject: [PATCH 16/30] add act model for openvino; include data free test and call max variance test --- .../template_test_weights_compression.py | 20 +++++---- tests/openvino/native/models.py | 45 ++++++++++++++++++- .../quantization/test_weights_compression.py | 5 ++- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index b75cc11ba3e..6866178d160 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -354,7 +354,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self, mocker): # AWQ Tests @staticmethod @abstractmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): "Returns a backend model for test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul." @staticmethod @@ -366,13 +366,16 @@ def get_num_multiply_from_awq(model: TModel) -> int: def int4_mode(self, request): return None + @pytest.mark.parametrize("is_3d_weights", [True, False]) @pytest.mark.parametrize("with_multiply", (True, False)) - def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int4_mode, with_multiply, mocker): + def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul( + self, int4_mode, with_multiply, is_3d_weights, mocker + ): n_layers = 8 n_awq_target = n_layers - 1 # first MatMul is always int8 - model = self.get_awq_act_model(with_multiply, n_layers) + model = self.get_awq_act_model(is_3d_weights, with_multiply, n_layers) - dataset = Dataset([self.to_tensor(np.ones([1, 8, 8], dtype=np.float32))], self.get_transform_func()) + dataset = Dataset([self.to_tensor(np.ones([2, 8, 8], dtype=np.float32))], self.get_transform_func()) with SpyWeightCompressionStatisticsContext(mocker): model = compress_weights(model, mode=int4_mode, ratio=1.0, group_size=2, dataset=dataset, awq=True) @@ -626,14 +629,15 @@ def test_group_size_fallback_modes( f"Expected {ref_num_group_sizes} group size values, but got {num_group_sizes}." 
) - @pytest.mark.parametrize("dataset", [None, np.ones([1, 8, 8], dtype=np.float32)]) + @pytest.mark.parametrize("is_3d_weights", [True, False]) + @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)]) @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False]) - def test_data_free_awq(self, dataset, prefer_data_aware_scaling, mocker): - input_data = np.ones([1, 8, 8], dtype=np.float32) + def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker): + input_data = np.ones([2, 8, 8], dtype=np.float32) n_layers = 8 n_awq_target = n_layers - 1 # first MatMul is always int8 - model = self.get_awq_act_model(True, n_layers) + model = self.get_awq_act_model(is_3d_weights, True, n_layers) model = self.wrap_model(model, input_data) if dataset is not None: diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index d2651491761..28405cb04ae 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1046,7 +1046,7 @@ class AWQMatmulModel3D(OVReferenceModel): All weight tensors are [2, 8, 8]; input is [2, L, 8]. """ - def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): + def _create_ov_model(self, n_extra_dims=1, is_int8=False): input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1") def make_weights(name): @@ -1087,7 +1087,7 @@ class AWQActMatmulModel(OVReferenceModel): """ def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): - input_node = opset.parameter([1, 8, 8], name="Input_1") + input_node = opset.parameter([2, 8, 8], name="Input_1") weights_data = np.arange(0, 64).reshape(8, 8) - 32 weights = AWQMatmulModel.get_weights(weights_data, is_int8, name="weights_emb") out_node = opset.matmul(input_node, weights, transpose_a=False, transpose_b=True, name="MatMul_emb") @@ -1125,6 +1125,47 @@ def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): return model +class AWQActMatmulModel3D(OVReferenceModel): + """ + Model for testing AWQ algorithm with 3D weights. Contains MatMul->Multiply->MatMul pattern. 
+ """ + + def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): + input_node = opset.parameter([2, 8, 8], name="Input_1") + + def make_weights(name: str): + w = np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) - 32 + return opset.constant(w, dtype=np.float32, name=name) + + weights_emb = make_weights("weights_emb") + out_node = opset.matmul(input_node, weights_emb, transpose_a=False, transpose_b=True, name="MatMul_emb") + + for i in range(n_layers): + weights1 = make_weights(f"weights_1_{i}") + mm1 = opset.matmul(out_node, weights1, transpose_a=False, transpose_b=True, name=f"MatMul_1_{i}") + node1 = opset.relu(mm1, name=f"ReLU_{i}") + + if with_multiply: + weights2 = make_weights(f"weights_2_{i}") + mm2 = opset.matmul(out_node, weights2, transpose_a=False, transpose_b=True, name=f"MatMul_2_{i}") + alpha = opset.constant(np.array([1.5], dtype=np.float32), dtype=np.float32) + lambda_value = opset.constant(np.array([1.5], dtype=np.float32), dtype=np.float32) + node2 = opset.selu(mm2, alpha, lambda_value, name=f"SeLU_{i}") + node_multiply = opset.multiply(node1, node2, name=f"Multiply_{i}") + else: + node_multiply = node1 + + out_node = node_multiply + + weights_lm_head = make_weights("weights_lm_head") + out_node = opset.matmul(out_node, weights_lm_head, transpose_a=False, transpose_b=True, name="MatMul_lm_head") + + result = opset.result(out_node, name="Result") + result.get_output_tensor(0).set_names(set(["Result"])) + model = ov.Model([result], [input_node]) + return model + + class AWQModel_fp16_overlow(OVReferenceModel): """ Model for testing AWQ algorithm with fp16 overflow fix. diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index c8942436276..dd9f90d4bc7 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -63,6 +63,7 @@ from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel +from tests.openvino.native.models import AWQActMatmulModel3D from tests.openvino.native.models import AWQMatmulModel from tests.openvino.native.models import AWQMatmulModel3D from tests.openvino.native.models import AWQModel_fp16_overlow @@ -2094,7 +2095,9 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> ov.Model: return DifferentChannelSizeMatmulModel(channel_sizes=channel_sizes).ov_model @staticmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): + if is_3d_weights: + return AWQActMatmulModel3D(with_multiply=with_multiply, n_layers=n_layers).ov_model return AWQActMatmulModel(with_multiply=with_multiply, n_layers=n_layers).ov_model @staticmethod From 3e92f47bc93b68e32fa9b9e5dbab3b8bb6ff8f74 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 18:42:35 +0400 Subject: [PATCH 17/30] add torch and torch fx act linear model tests --- .../quantization/test_weights_compression.py | 31 ++++++++++++++++++- tests/torch2/fx/test_compress_weights.py | 7 +++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 0ede375f739..e735f24c1cb 100644 --- 
a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -173,6 +173,33 @@ def forward(self, x): return out +class AWQActLinearModel3D(nn.Module): + def __init__(self, with_multiply=False, n_layers=8): + super().__init__() + self.with_multiply = with_multiply + self.n_layers = n_layers + + base_w = torch.arange(0, 2 * 8 * 8, dtype=torch.float32).reshape(2, 8, 8) - 32.0 + self.emb_weight = nn.Parameter(base_w.clone()) + self.lm_head_weight = nn.Parameter(base_w.clone()) + n_params = 2 * n_layers if with_multiply else n_layers + self.layer_weights = nn.ParameterList(nn.Parameter(base_w) for _ in range(n_params)) + + def forward(self, x): + out = torch.bmm(x, self.emb_weight) + + for i in range(self.n_layers): + node1 = F.relu(torch.bmm(out, self.layer_weights[i])) + if self.with_multiply: + node2 = torch.selu(torch.bmm(out, self.layer_weights[i])) + out = node1 * node2 + else: + out = node1 + + out = torch.bmm(out, self.lm_head_weight) + return out + + class AWQLinearModel(nn.Module): def __init__(self, is_int8=False): super().__init__() @@ -554,7 +581,9 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Modul return DifferentChannelSizeMatmulModel(channel_sizes=channel_sizes) @staticmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): + if is_3d_weights: + return AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers) return AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers) @staticmethod diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index e7788ec365d..cd1317f0a85 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -39,6 +39,7 @@ from tests.torch2.function_hook.quantization.test_weights_compression import SUPPORTED_MODES from tests.torch2.function_hook.quantization.test_weights_compression import UNSUPPORTED_MODES from tests.torch2.function_hook.quantization.test_weights_compression import AWQActLinearModel +from tests.torch2.function_hook.quantization.test_weights_compression import AWQActLinearModel3D from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel3D from tests.torch2.function_hook.quantization.test_weights_compression import ConvolutionModel @@ -382,9 +383,11 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Modul return exported_model @staticmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): model = AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers) - ex_input = torch.ones([1, 8, 8], dtype=torch.float32) + if is_3d_weights: + model = AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers) + ex_input = torch.ones([2, 8, 8], dtype=torch.float32) exported_model = get_torch_fx_model(model, ex_input) return exported_model From d44f6d5686f446072d4287e8a1da16904b1da5a8 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 19:38:30 +0400 Subject: [PATCH 18/30] fix data shape for OV model --- .../native/quantization/test_weights_compression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py 
b/tests/openvino/native/quantization/test_weights_compression.py index dd9f90d4bc7..17cd67579ba 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -930,7 +930,7 @@ def test_call_max_var_criterion_with_dataset_by_default(mocker, mode): @pytest.mark.parametrize("mode", INT4_MODES) def test_call_max_var_criterion_with_dataset_by_default_awq(mode): model = AWQMatmulModel().ov_model - dataset = Dataset([np.ones([1, 8, 8])]) + dataset = Dataset([np.ones([2, 8, 8])]) compress_weights(model, mode=mode, ratio=1.0, group_size=2, dataset=dataset, awq=True) @@ -938,7 +938,7 @@ def test_call_max_var_criterion_with_dataset_by_default_awq(mode): @pytest.mark.parametrize("mode", INT4_NF4_MODES) def test_call_max_var_criterion_with_dataset_awq_for_compressed_model(mode): model = AWQMatmulModel(is_int8=True).ov_model - dataset = Dataset([np.ones([1, 8, 8])]) + dataset = Dataset([np.ones([2, 8, 8])]) compress_weights(model, mode=mode, ratio=1.0, group_size=2, dataset=dataset, awq=True) @@ -946,7 +946,7 @@ def test_call_max_var_criterion_with_dataset_awq_for_compressed_model(mode): @pytest.mark.parametrize("mode", INT4_NF4_MODES) def test_call_max_var_criterion_with_dataset_awq_neg_group_size(mode): model = AWQMatmulModel().ov_model - dataset = Dataset([np.ones([1, 8, 8])]) + dataset = Dataset([np.ones([2, 8, 8])]) compress_weights(model, mode=mode, ratio=1.0, group_size=-1, dataset=dataset, awq=True) From 090f0f509fdc405e6e57e254cee5685c57a342e3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 19:49:27 +0400 Subject: [PATCH 19/30] fix awq data free --- src/nncf/quantization/algorithms/weight_compression/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 8a4fcbd64c7..54dcdccde1c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -179,7 +179,7 @@ def apply( nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") if is_data_free: - scale = self._data_free_step(weight, 1 - wp.reduction_axes[0]) + scale = self._data_free_step(weight, -(len(weight.shape) - wp.reduction_axes[0])) else: prev_weight, prev_statistics = None, None if is_mergeable: From 7222304f756a172f3577746c78eca0d6ee796ac8 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 20:09:41 +0400 Subject: [PATCH 20/30] add check for opset version when weights are 3D --- .../weight_compression/onnx_backend.py | 4 ++++ .../quantization/test_weights_compression.py | 22 +++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 735ba9a2a3e..c99a259f275 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -259,6 +259,10 @@ def transform_model( # For opsets earlier than 21, we use the `MatMulNBits` operation from ONNX Runtime contrib operators. # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md if opset_version < 21 and dequantize_block_size > 0: + if len(weight.shape) == 3: + msg = """ONNX does not support 3D weights for opset version < 21. 
From 7222304f756a172f3577746c78eca0d6ee796ac8 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 20:09:41 +0400
Subject: [PATCH 20/30] add check for opset version when weights are 3D

---
 .../weight_compression/onnx_backend.py | 4 ++++
 .../quantization/test_weights_compression.py | 22 +++++++++++++------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
index 735ba9a2a3e..c99a259f275 100644
--- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
+++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
@@ -259,6 +259,10 @@ def transform_model(
         # For opsets earlier than 21, we use the `MatMulNBits` operation from ONNX Runtime contrib operators.
         # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md
         if opset_version < 21 and dequantize_block_size > 0:
+            if len(weight.shape) == 3:
+                msg = """ONNX does not support 3D weights for opset version < 21.
+                Please use a higher opset version or per-channel quantization"""
+                raise nncf.ParameterNotSupportedError(msg)
             compressed_weight, scale, zero_point = self._preprocess_compressed_weight(
                 compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True
             )
diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py
index bf3e4c3b539..34c7faf39e9 100644
--- a/tests/onnx/quantization/test_weights_compression.py
+++ b/tests/onnx/quantization/test_weights_compression.py
@@ -659,7 +659,7 @@ def get_decompressed_weight(compressed_model: onnx.ModelProto, input: np.ndarray
         return Tensor(outputs["W_0_dequantized"])
 
     @staticmethod
-    def get_awq_act_model(with_multiply: bool, n_layers: int) -> onnx.ModelProto:
+    def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -> onnx.ModelProto:
         """
         Builds a model to be used in the following tests:
         - TemplateWeightCompression.test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul()
@@ -668,7 +668,12 @@ def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -
         """
         mb = ModelBuilder()
 
-        data = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05
+        weight_shape = (8, 8)
+        if is_3d_weights:
+            # The first and last dimension are later transposed
+            weight_shape = (8, 8, 2)
+
+        data = 0.01 * np.arange(0, math.prod(weight_shape)).reshape(weight_shape) + 0.05
         data = data.astype(np.float32).T
 
         x = mb.add_input("input", (1, 8, 8))
         output = mb.add_output("output", (1, 8, 8))
 
-        x = mb.add_matmul(x, shape=(8, 8), data=data)
+        x = mb.add_matmul(x, shape=data.shape, data=data)
         for _ in range(n_layers):
-            a = mb.add_matmul(x, shape=(8, 8), data=data)
+            a = mb.add_matmul(x, shape=data.shape, data=data)
             a = mb.add_relu(a)
             if with_multiply:
-                b = mb.add_matmul(x, shape=(8, 8), data=data)
+                b = mb.add_matmul(x, shape=data.shape, data=data)
                 b = mb.add_selu(b)
                 x = mb.add_mul(a, b)
             else:
                 x = a
-        mb.add_matmul(x, shape=(8, 8), output=output, data=data)
+        mb.add_matmul(x, shape=data.shape, output=output, data=data)
 
-        return mb.build()
+        return mb.build(opset_version=21)
 
     @staticmethod
     def get_num_multiply_from_awq(model: onnx.ModelProto) -> int:
@@ -708,9 +713,12 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto:
         mb = ModelBuilder()
 
         weight_shape = (8, 8)
+        opset_version = 13
         if is_3d_weights:
             # The first and last dimension are later transposed
             weight_shape = (8, 8, 2)
+            # 3D weights do not work due to missing support in MatMulNBits, which is used for opset_version < 21
+            opset_version = 21
 
         x = mb.add_input("input", (2, None, 8))
         output = mb.add_output("output", (2, None, 8))
@@ -725,7 +733,7 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto:
             x = mb.add_mul(a, b)
             x = mb.add_matmul(x, shape=w_data.shape, output=output if i == num_blocks - 1 else None, data=w_data)
 
-        return mb.build()
+        return mb.build(opset_version=opset_version)
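The guard in [PATCH 20/30] hinges on the model's declared opset. A minimal sketch of how such a version can be read from an ONNX model (illustrative only; the backend's actual lookup may differ):

    import onnx

    def get_default_opset(model: onnx.ModelProto) -> int:
        # The default ("" or "ai.onnx") domain entry carries the opset compared against 21.
        return next(e.version for e in model.opset_import if e.domain in ("", "ai.onnx"))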
From 211c806e15413bb01c8ef1fbb6b0c7f28675ffc1 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 20:56:24 +0400
Subject: [PATCH 21/30] xfail openvino case

---
 .../native/quantization/test_weights_compression.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 17cd67579ba..1327c5619d9 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -2289,6 +2289,14 @@ def test_awq_with_ignored_scope(self, mocker, is_3d_weights):
     def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights):
         return super().test_awq_scale_reference(monkeypatch, mocker, is_3d_weights)
 
+    @pytest.mark.parametrize(
+        "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]
+    )
+    @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)])
+    @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False])
+    def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
+        return super().test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
+
     @staticmethod
     def get_orig_weight(model: ov.Model) -> Tensor:
         for op in model.get_ordered_ops():

From 0d9651617bf3ce06d98ec15377f6d76185020e13 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:01:05 +0400
Subject: [PATCH 22/30] fix test

---
 tests/onnx/quantization/test_weights_compression.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py
index 34c7faf39e9..be361d30397 100644
--- a/tests/onnx/quantization/test_weights_compression.py
+++ b/tests/onnx/quantization/test_weights_compression.py
@@ -676,10 +676,10 @@ def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -
         data = 0.01 * np.arange(0, math.prod(weight_shape)).reshape(weight_shape) + 0.05
         data = data.astype(np.float32).T
 
-        x = mb.add_input("input", (1, 8, 8))
-        output = mb.add_output("output", (1, 8, 8))
+        x = mb.add_input("input", (2, 8, 8))
+        output = mb.add_output("output", (2, 8, 8))
 
-        x = mb.add_matmul(x, shape=(8, 8), data=data)
+        x = mb.add_matmul(x, shape=data.shape, data=data)
         for _ in range(n_layers):
             a = mb.add_matmul(x, shape=data.shape, data=data)
             a = mb.add_relu(a)

From 0b1019cec83dfb2d92ba390c9993a6d9e7a19da7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:14:07 +0400
Subject: [PATCH 23/30] remove extra comments

---
 .../quantization/algorithms/weight_compression/awq.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py
index 54dcdccde1c..47f96226572 100644
--- a/src/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/src/nncf/quantization/algorithms/weight_compression/awq.py
@@ -240,12 +240,10 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis
         reduction_axis = wp.reduction_axes[0]
 
         if is_2d_weight:
-            s = fns.unsqueeze(s, 0)  # [hidden_dim] -> [1, hidden_dim]
-            X = fns.unsqueeze(X, 0)  # [hidden_dim, samples] -> [1, hidden_dim, samples]
-            weight = fns.unsqueeze(weight, 0)  # [out_features, hidden_dim] -> [1, out_features, hidden_dim]
-            prev_weight = (
-                fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None
-            )  # [out_features, hidden_dim] -> [1, out_features, hidden_dim]
+            s = fns.unsqueeze(s, 0)
+            X = fns.unsqueeze(X, 0)
+            weight = fns.unsqueeze(weight, 0)
+            prev_weight = fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None
             reduction_axis += 1
 
         prev_s, prev_w = None, None

From 80c1ec2b54b04d83194c29e00811106b6b593 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:19:15 +0400
Subject: [PATCH 24/30] fix

---
 tests/openvino/native/models.py | 8 ++++----
 .../native/quantization/test_weights_compression.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py
index 28405cb04ae..755d6f6ccdc 100644
--- a/tests/openvino/native/models.py
+++ b/tests/openvino/native/models.py
@@ -1004,7 +1004,7 @@ def get_weights(weights_data, is_int8, name):
         return (qw - zp) * scale
 
     def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False):
-        input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1")
+        input_node = opset.parameter([-1] * n_extra_dims + [-1, 8], name="Input_1")
 
         weights_data1 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05
         weights1 = self.get_weights(weights_data1, is_int8, name="weights_1")
@@ -1047,7 +1047,7 @@ class AWQMatmulModel3D(OVReferenceModel):
     """
 
     def _create_ov_model(self, n_extra_dims=1, is_int8=False):
-        input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1")
+        input_node = opset.parameter([-1] * n_extra_dims + [-1, 8], name="Input_1")
 
         def make_weights(name):
             w = 0.01 * np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) + 0.05
@@ -1087,7 +1087,7 @@ class AWQActMatmulModel(OVReferenceModel):
     """
 
     def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8):
-        input_node = opset.parameter([2, 8, 8], name="Input_1")
+        input_node = opset.parameter([-1, 8, 8], name="Input_1")
         weights_data = np.arange(0, 64).reshape(8, 8) - 32
         weights = AWQMatmulModel.get_weights(weights_data, is_int8, name="weights_emb")
         out_node = opset.matmul(input_node, weights, transpose_a=False, transpose_b=True, name="MatMul_emb")
@@ -1131,7 +1131,7 @@ class AWQActMatmulModel3D(OVReferenceModel):
     """
 
     def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8):
-        input_node = opset.parameter([2, 8, 8], name="Input_1")
+        input_node = opset.parameter([-1, 8, 8], name="Input_1")
 
         def make_weights(name: str):
             w = np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) - 32
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 1327c5619d9..ddb5e0fbfa0 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -2295,7 +2295,7 @@ def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights):
     @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)])
     @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False])
     def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
-        return super().test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
+        return super().test_data_free_awq(dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
 
     @staticmethod
     def get_orig_weight(model: ov.Model) -> Tensor:
@@ -2316,7 +2316,7 @@ def get_decompressed_weight(compressed_model: ov.Model, input: np.ndarray) -> Te
         return Tensor(weight_output)
 
     @staticmethod
-    def get_ignored_scope_name() -> str:
+    def get_ignored_scope_name(is_3d_weights) -> str:
         return "MatMul_5"
 
     @staticmethod

From 1c76e5fdb03971cbf282d550481ddc1ccd9be2d7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:20:48 +0400
Subject: [PATCH 25/30] fix dynamic shapes

---
 tests/torch2/fx/test_compress_weights.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py
index cd1317f0a85..6297a50e036 100644
--- a/tests/torch2/fx/test_compress_weights.py
+++ b/tests/torch2/fx/test_compress_weights.py
@@ -370,7 +370,7 @@ def get_awq_model(is_3d_weights) -> torch.fx.GraphModule:
         model = AWQLinearModel()
         if is_3d_weights:
             model = AWQLinearModel3D()
-        dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]]
+        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 4, 8], dtype=torch.float32)
         exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model
@@ -387,8 +387,9 @@ def get_awq_act_model(is_3d_weights, with_multiply, n_layers):
         model = AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers)
         if is_3d_weights:
             model = AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers)
+        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 8, 8], dtype=torch.float32)
-        exported_model = get_torch_fx_model(model, ex_input)
+        exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model

From b51d878b68375b04d83194c29e00811106b6b593 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:52:06 +0400
Subject: [PATCH 26/30] add xfail for last test

---
 .../native/quantization/test_weights_compression.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index ddb5e0fbfa0..8824b898daf 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -2297,6 +2297,17 @@ def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights):
     def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
         return super().test_data_free_awq(dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
 
+    @pytest.mark.parametrize(
+        "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]
+    )
+    @pytest.mark.parametrize("with_multiply", (True, False))
+    def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(
+        self, int4_mode, with_multiply, is_3d_weights, mocker
+    ):
+        return super().test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(
+            int4_mode, with_multiply, is_3d_weights, mocker
+        )
+
     @staticmethod
     def get_orig_weight(model: ov.Model) -> Tensor:
         for op in model.get_ordered_ops():

From db8417bc0832f835864d581b38ba9e31d430 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 22:06:27 +0400
Subject: [PATCH 27/30] check dynamic dimensions correctly

---
 tests/torch2/fx/test_compress_weights.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py
index 6297a50e036..c228201e92e 100644
--- a/tests/torch2/fx/test_compress_weights.py
+++ b/tests/torch2/fx/test_compress_weights.py
@@ -370,7 +370,7 @@ def get_awq_model(is_3d_weights) -> torch.fx.GraphModule:
         model = AWQLinearModel()
         if is_3d_weights:
             model = AWQLinearModel3D()
-        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
+        dynamic_shapes = [[torch.export.Dim.AUTO, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 4, 8], dtype=torch.float32)
         exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model
@@ -387,7 +387,7 @@ def get_awq_act_model(is_3d_weights, with_multiply, n_layers):
         model = AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers)
         if is_3d_weights:
             model = AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers)
-        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
+        dynamic_shapes = [[torch.export.Dim.AUTO, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 8, 8], dtype=torch.float32)
        exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model
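Review note on [PATCH 25/30] and [PATCH 27/30]: torch.bmm pins the input's batch dimension to the weight's batch size, so the exporter specializes that dimension to a constant. torch.export.Dim.DYNAMIC raises on such specialization, while Dim.AUTO falls back to a static dimension, which is why dim 0 was switched to AUTO. A standalone sketch of the distinction (illustrative only, assuming a recent torch.export API):

    import torch

    class Tiny(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.w = torch.nn.Parameter(torch.ones(2, 8, 8))

        def forward(self, x):
            return torch.bmm(x, self.w)  # forces x.shape[0] == 2

    ex_input = torch.ones([2, 8, 8])
    # Dim.DYNAMIC on dim 0 would raise a constraint violation here; Dim.AUTO does not.
    dynamic_shapes = [[torch.export.Dim.AUTO, torch.export.Dim.DYNAMIC, None]]
    ep = torch.export.export(Tiny(), (ex_input,), dynamic_shapes=dynamic_shapes)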
From 9173dfb32e22943c97131f9c0e6bf8c115138381 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 22:25:45 +0400
Subject: [PATCH 28/30] fix onnx backend formatting of weights

---
 .../algorithms/weight_compression/onnx_backend.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
index c99a259f275..61d70821d27 100644
--- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
+++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
@@ -109,15 +109,16 @@ def _preprocess_compressed_weight(
         scale = compressed_weight.scale
         zero_point = compressed_weight.zero_point
 
-        axis = 1 if dequantize_block_size else None
+        # For 3D weights, we need to squeeze at the next dimension compared to 2D because of the batch dim
+        axis = 1 + len(scale.shape) % 3 if dequantize_block_size else None
         scale = scale.squeeze(axis=axis)
         if zero_point is not None:
             zero_point = zero_point.squeeze(axis=axis)
 
         if apply_transpose:
-            scale = fns.transpose(scale)
+            scale = fns.moveaxis(scale, -1, -2)
             if zero_point is not None:
-                zero_point = fns.transpose(zero_point)
+                zero_point = fns.moveaxis(zero_point, -1, -2)
 
         if zero_point is not None:
             zero_point = zero_point.astype(tensor.dtype)

From 7a9a12825a12ded244f570d42a1d138f536e17a6 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 9 Dec 2025 19:42:46 +0400
Subject: [PATCH 29/30] fix reduction axes

---
 .../algorithms/weight_compression/awq.py | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py
index 47f96226572..ac9637b8a98 100644
--- a/src/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/src/nncf/quantization/algorithms/weight_compression/awq.py
@@ -178,8 +178,16 @@ def apply(
         nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}")
 
+        weight_dim = len(weight.shape)
         if is_data_free:
-            scale = self._data_free_step(weight, -(len(weight.shape) - wp.reduction_axes[0]))
+            # The channel axis is derived as `constant - reduction_axes`, where the constant
+            # for an n-D weight is the (n-1)th odd number: 2(n-1)-1 = 2n-3.
+            # Examples:
+            # 2D -> 1 - reduction_axes (reduction_axes=1 -> 1-1=0; reduction_axes=0 -> 1-0=1)
+            # 3D -> 3 - reduction_axes (reduction_axes=2 -> 3-2=1; reduction_axes=1 -> 3-1=2)
+            # 4D -> 5 - reduction_axes (reduction_axes=3 -> 5-3=2; reduction_axes=2 -> 5-2=3)
+            scale = self._data_free_step(weight, (weight_dim * 2) - 3 - wp.reduction_axes[0])
         else:
             prev_weight, prev_statistics = None, None
             if is_mergeable:
@@ -191,16 +199,7 @@ def apply(
             prev_statistics = statistics[merge_node.node_name]
 
         scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics)
 
-        # For 3D weights, len(scale.shape)%2 == 0 whereas for 2D it is 1. This allows us to index
-        # from the last dim and not consider the batch dim in 3D case.
-        # Example:
-        # 3D weights: W shape = [B, M, N]; reduction_axes = 2
-        # scale_shape = [M, N] -> len(scale.shape) = 2 -> 2 % 2 = 0
-        # unsqueeze scale at -(0 + 2) = -2.
-        # 2D weights: W shape = [M, N]; reduction_axes = 1
-        # scale_shape = [M] -> len(scale.shape) = 1 -> 1 % 2 = 1
-        # unsqueeze scale at -(1 + 1) = -2.
-        w_scale = fns.unsqueeze(scale, -(len(scale.shape) % 2 + wp.reduction_axes[0]))
+        w_scale = fns.unsqueeze(scale, (weight_dim * 2) - 3 - wp.reduction_axes[0])
         a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0])
         scaled_weight = (weight * w_scale).astype(weight_dtype)
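A standalone self-check of the (2n - 3) constant introduced in [PATCH 29/30], assuming the reduction axis is one of the two trailing (non-batch) axes (illustrative only, not part of the patch):

    # (2 * ndim - 3) - reduction_axis yields the remaining trailing (channel) axis.
    for ndim, reduction_axis, expected in [
        (2, 1, 0), (2, 0, 1),  # 2D: constant 1
        (3, 2, 1), (3, 1, 2),  # 3D: constant 3
        (4, 3, 2), (4, 2, 3),  # 4D: constant 5
    ]:
        assert (2 * ndim - 3) - reduction_axis == expected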
From e605827d3f9222511c77959c1ea53dac8eb39485 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 9 Dec 2025 19:44:10 +0400
Subject: [PATCH 30/30] fix onnx test

---
 tests/onnx/quantization/test_weights_compression.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py
index be361d30397..4463abb9bcc 100644
--- a/tests/onnx/quantization/test_weights_compression.py
+++ b/tests/onnx/quantization/test_weights_compression.py
@@ -9,9 +9,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from collections import defaultdict
 from dataclasses import dataclass
+from functools import reduce
+from operator import mul
 from typing import Any, Callable, Optional
 
 import numpy as np
@@ -673,7 +674,7 @@ def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -
             # The first and last dimension are later transposed
             weight_shape = (8, 8, 2)
 
-        data = 0.01 * np.arange(0, math.prod(weight_shape)).reshape(weight_shape) + 0.05
+        data = 0.01 * np.arange(0, reduce(mul, weight_shape, 1)).reshape(weight_shape) + 0.05
         data = data.astype(np.float32).T
 
         x = mb.add_input("input", (2, 8, 8))
@@ -723,7 +724,7 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto:
         x = mb.add_input("input", (2, None, 8))
         output = mb.add_output("output", (2, None, 8))
 
-        w_data = 0.01 * np.arange(0, math.prod(weight_shape), dtype=np.float32).reshape(weight_shape) + 0.05
+        w_data = 0.01 * np.arange(0, reduce(mul, weight_shape, 1), dtype=np.float32).reshape(weight_shape) + 0.05
         w_data = w_data.T
         num_blocks = 2
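The math.prod -> reduce(mul, ..., 1) swap in [PATCH 30/30] is behavior-preserving, presumably to drop the reliance on math.prod (Python 3.8+). A quick equivalence check (illustrative only):

    import math
    from functools import reduce
    from operator import mul

    for shape in [(8, 8), (8, 8, 2), ()]:
        assert reduce(mul, shape, 1) == math.prod(shape)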