Skip to content

Commit 88445b3

Browse files
authored
Support 3D Weights in AWQ Algorithm (openvinotoolkit#3728)
### Changes The core idea of this change is to first unsqueeze the weights so that they become 3D. Even the 2D weights. Then the rest of the algorithm implementation is changed such that it expects the weight shape to be 3D. Earlier we traversed each group in a weight individually. But now, since we want to find the scales per-channel as well as per-expert, we traverse by group index as well as batch/expert index (this is just 1 for 2D weights, so the behavior is the same as before). ### Reason for changes Support AWQ for models with 3D weights such as MoE models. ### Related tickets 175789 & 175212 ### Tests Current AWQ tests were extended to include the AWQ test models with 3D weights. **Accuracy Evaluation Results:** Model: Qwen/Qwen3-30B-A3B NNCF Backend: OpenVINO Higher is better. Task: gsm8k Limit: 100 Max New Tokens: 10000 OpenVINO version: 2026.0.0.dev20260102 n-shots: 5(default) Model | Filter | Score (exact_match) | Stderr -- | -- | -- | -- FP16 | flexible-extract | 0.92 | 0.0273   | strict-match | 0.82 | 0.0386 INT4 SYM Per-Channel (no AWQ) | flexible-extract | 0.83 | 0.0378   | strict-match | 0.27 | 0.0446 INT4 SYM Per-Channel (AWQ data-free) | flexible-extract | 0.83 | 0.0378   | strict-match | 0.22 | 0.0416 INT4 Per-Channel (AWQ data-aware) | flexible-extract | 0.83 | 0.0378   | strict-match | 0.35 | 0.0479 Comparison of accuracy with `meta-llama/Llama-3.2-1B-Instruct` on Develop and this branch Variant | bits_per_byte | byte_perplexity | word_perplexity -- | -- | -- | -- This Branch (Data Aware) | 0.7774 | 1.7141 | 17.8427 This Branch (Data Free) | 0.7774 | 1.7141 | 17.8427 develop (Data Aware) | 0.7774 | 1.7141 | 17.8427 develop (Data Free) | 0.7774 | 1.7141 | 17.8427 WC Conformance test: https://github.com/openvinotoolkit/nncf/actions/runs/20883502496 - Pass WC Example Test: https://github.com/openvinotoolkit/nncf/actions/runs/20883506117 - Pass
1 parent 2c068c5 commit 88445b3

File tree

8 files changed

+732
-245
lines changed

8 files changed

+732
-245
lines changed

src/nncf/quantization/algorithms/weight_compression/awq.py

Lines changed: 71 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,8 @@ def apply(
159159
weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph)
160160
if len(weight_data) != 1: # not supported by the algorithm
161161
continue
162-
is_mergeable = self._backend_entity.is_node_with_weights(merge_node, graph)
163-
164-
nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}")
165-
166162
_, weight_port_id = weight_data[0]
163+
167164
weight = self._backend_entity.get_weight(
168165
wp.node_with_weight, weight_port_id, model, graph
169166
) # get_const_value(wp.weight_node)
@@ -172,8 +169,26 @@ def apply(
172169

173170
act_ch_axis, act_shape = self._get_activation_channel_axis_and_shape(graph, wp)
174171

172+
is_mergeable = False
173+
if self._backend_entity.is_node_with_weights(merge_node, graph):
174+
mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph)
175+
merge_node_weight_ndims = [
176+
len(self._backend_entity.get_weight_shape(merge_node, port_id, graph))
177+
for _, port_id in mergeable_node_weight_data
178+
]
179+
is_mergeable = len(weight.shape) in merge_node_weight_ndims
180+
181+
nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}")
182+
183+
weight_ndim = len(weight.shape)
184+
# Weights scale reduction formula:
185+
# 2(n-1)-1 -> 2n-3
186+
# Example: 2D -> 1 - reduction_axes (reduction_axes=1) = 0
187+
# 3D -> 3 - reduction_axes (reduction_axes=1) = 2
188+
# 4D -> 5 - reduction_axes (reduction_axes=1) = 4
189+
weight_scale_reduction_axes = (weight_ndim * 2) - 3 - wp.reduction_axes[0]
175190
if is_data_free:
176-
scale = self._data_free_step(weight, 1 - wp.reduction_axes[0])
191+
scale = self._data_free_step(weight, axis=weight_scale_reduction_axes)
177192
else:
178193
prev_weight, prev_statistics = None, None
179194
if is_mergeable:
@@ -185,7 +200,7 @@ def apply(
185200
prev_statistics = statistics[merge_node.node_name]
186201
scale = self._data_aware_step(wp, weight, statistics[k], act_ch_axis, prev_weight, prev_statistics)
187202

188-
w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0])
203+
w_scale = fns.unsqueeze(scale, weight_scale_reduction_axes)
189204
a_scale = 1.0 / scale
190205

191206
scaled_weight = (weight * w_scale).astype(weight_dtype)
@@ -198,9 +213,18 @@ def apply(
198213
merge_weight = (merge_weight * a_scale).astype(weight_dtype)
199214
self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight)
200215
else: # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node
201-
# Calculate the activation scale shape
202-
a_scale_shape = [scale.shape[0] if axis == act_ch_axis else 1 for axis in range(len(act_shape))]
203-
a_scale = fns.reshape(a_scale, tuple(a_scale_shape))
216+
act_ndim = len(act_shape)
217+
scale_shape = a_scale.shape
218+
# Only the last dim in the activation scale is for channel. The others are for batch
219+
batch_dims = iter(scale_shape[:-1])
220+
# For the last dim of the scale which is assumed channel, we place it as it is
221+
# For the rest of the elements we iterate the batch dims and place accordingly
222+
# And once we finish, we start placing ones if the current dimension is not
223+
# channel axis And it is not a batch dim, we place 1.
224+
act_scale_shape = tuple(
225+
scale_shape[-1] if dim == act_ch_axis else next(batch_dims, 1) for dim in range(act_ndim)
226+
)
227+
a_scale = fns.reshape(a_scale, act_scale_shape)
204228

205229
next_nodes = graph.get_next_nodes(merge_node)
206230
source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id
@@ -223,49 +247,63 @@ def _data_aware_step(self, wp, weight, statistics, act_ch_axis, prev_weight=None
223247
s = s.astype(TensorDataType.float32)
224248
X = X.astype(TensorDataType.float32)
225249

250+
is_2d_weight = weight.ndim == 2
251+
226252
assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1
227253
reduction_axis = wp.reduction_axes[0]
228254

255+
if is_2d_weight:
256+
s = fns.unsqueeze(s, 0)
257+
X = fns.unsqueeze(X, 0)
258+
weight = fns.unsqueeze(weight, 0)
259+
prev_weight = fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None
260+
reduction_axis += 1
261+
229262
prev_s, prev_w = None, None
230263
if prev_statistics is not None and prev_weight is not None:
231264
prev_s, _ = process_stats(prev_statistics, self._subset_size, act_ch_axis)
232265
prev_s = prev_s.astype(TensorDataType.float32).max().item()
233266
prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis)
234267

235-
top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
236-
topk_idxs = fns.argsort(-s)[:top_k]
268+
top_k = max(int(s.shape[-1] * self._percent_to_apply), 1)
269+
topk_idxs = fns.argsort(-s)[:, :top_k]
237270

238271
group_size = config.group_size
239272
if group_size == -1:
240-
group_size = s.shape[0]
273+
group_size = s.shape[-1]
241274

242275
groups_to_correct = set()
243-
for idx in topk_idxs:
244-
groups_to_correct.add(idx.data // group_size)
276+
for batch_idx in range(topk_idxs.shape[0]):
277+
for k_idx in range(topk_idxs.shape[1]):
278+
idx = topk_idxs[batch_idx, k_idx].item()
279+
group_idx = idx // group_size
280+
groups_to_correct.add((batch_idx, group_idx))
245281

246282
groups_to_correct = list(groups_to_correct)
247283

248-
if reduction_axis == 0:
249-
weight = fns.transpose(weight)
250-
reduction_axis = 1
284+
if reduction_axis == 1:
285+
# Weights
286+
# 3D: [num_experts, hidden_dimension, out_features] -> [num_experts, out_features, hidden_dimension]
287+
# 2D: [1, hidden_dimension, out_features] -> [1, out_features, hidden_dimension]
288+
weight = fns.moveaxis(weight, -1, -2)
289+
reduction_axis = weight.ndim - 1
251290

252-
shape_vector = fns.mean(X, axis=1)
291+
shape_vector = fns.mean(X, axis=-1)
253292
scale = fns.ones_like(shape_vector)
254293

255294
awq_config = deepcopy(config)
256295
awq_config.group_size = -1
257296

258-
for gi in groups_to_correct:
297+
for batch_idx, gi in groups_to_correct:
259298
offset = gi * group_size
260-
gscale = s[offset : offset + group_size]
299+
gscale = s[batch_idx, offset : offset + group_size]
300+
gweight = weight[batch_idx, :, offset : offset + group_size]
301+
gacts = X[batch_idx, offset : offset + group_size, :]
261302

262303
a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32)
263304
a_max = 1e2
264305
gscale = fns.clip(gscale, a_min=a_min, a_max=a_max)
265306

266-
gweight = weight[:, offset : offset + group_size]
267-
gacts = X[offset : offset + group_size, :]
268-
269307
fp32_out = fns.matmul(gweight, gacts)
270308
min_diff = fns.max(fns.abs(fp32_out))
271309
best_scale = None
@@ -281,28 +319,26 @@ def _data_aware_step(self, wp, weight, statistics, act_ch_axis, prev_weight=None
281319
# per channel magnitudes for the previous MatMul
282320
# mean(abs(prev_weight)) * max(abs((prev_activation))) * prev_weight.shape[reduction_axis]
283321
magnitudes = (
284-
(prev_w[offset : offset + group_size] / cur_scale) * prev_s * prev_weight.shape[reduction_axis]
322+
(prev_w[batch_idx, offset : offset + group_size] / cur_scale)
323+
* prev_s
324+
* prev_weight.shape[reduction_axis]
285325
)
286326
if magnitudes.max() >= threshold:
287327
cur_scale = AWQ._clamp_scale(
288328
magnitudes,
289329
threshold,
290330
cur_scale,
291-
prev_w[offset : offset + group_size]
331+
prev_w[batch_idx, offset : offset + group_size]
292332
* prev_s
293333
* prev_weight.shape[reduction_axis]
294334
/ threshold,
295335
)
296336

297337
weights_to_fake_quantize = gweight * cur_scale
298338
if not config.is_integer:
299-
g_decompressed_weighs = float_quantize_dequantize_weight(
300-
weights_to_fake_quantize, awq_config, reduction_axis
301-
)
339+
g_decompressed_weighs = float_quantize_dequantize_weight(weights_to_fake_quantize, awq_config, -1)
302340
else:
303-
g_decompressed_weighs = integer_quantize_dequantize_weight(
304-
weights_to_fake_quantize, awq_config, reduction_axis
305-
)
341+
g_decompressed_weighs = integer_quantize_dequantize_weight(weights_to_fake_quantize, awq_config, -1)
306342
sacts = gacts / fns.unsqueeze(cur_scale, 1)
307343

308344
cur_out = fns.matmul(g_decompressed_weighs, sacts)
@@ -313,7 +349,10 @@ def _data_aware_step(self, wp, weight, statistics, act_ch_axis, prev_weight=None
313349
alpha += alpha_step
314350

315351
if best_scale is not None:
316-
scale.data[offset : offset + group_size] = best_scale.data
352+
scale.data[batch_idx, offset : offset + group_size] = best_scale.data
353+
354+
if is_2d_weight:
355+
scale = fns.squeeze(scale, 0) # [1, hidden_dim] -> [hidden_dim]
317356

318357
return scale
319358

src/nncf/quantization/algorithms/weight_compression/onnx_backend.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,16 @@ def _preprocess_compressed_weight(
110110
scale = compressed_weight.scale
111111
zero_point = compressed_weight.zero_point
112112

113-
axis = 1 if dequantize_block_size else None
113+
# For 3D weights, we need to squeeze at the next dimension compared to 2D because of batch dim
114+
axis = 1 + len(scale.shape) % 3 if dequantize_block_size else None
114115
scale = scale.squeeze(axis=axis)
115116
if zero_point is not None:
116117
zero_point = zero_point.squeeze(axis=axis)
117118

118119
if apply_transpose:
119-
scale = fns.transpose(scale)
120+
scale = fns.moveaxis(scale, -1, -2)
120121
if zero_point is not None:
121-
zero_point = fns.transpose(zero_point)
122+
zero_point = fns.moveaxis(zero_point, -1, -2)
122123

123124
if zero_point is not None:
124125
zero_point = zero_point.astype(tensor.dtype)
@@ -267,6 +268,10 @@ def transform_model(
267268
# For opsets earlier than 21, we use the `MatMulNBits` operation from ONNX Runtime contrib operators.
268269
# See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md
269270
if opset_version < 21 and dequantize_block_size > 0:
271+
if len(weight.shape) == 3:
272+
msg = """ONNX does not support 3D weights for opset version < 21.
273+
Please use a higher opset version or per-channel quantization"""
274+
raise nncf.ParameterNotSupportedError(msg)
270275
compressed_weight, scale, zero_point = self._preprocess_compressed_weight(
271276
compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True
272277
)

tests/cross_fw/test_templates/template_test_weights_compression.py

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self, mocker):
360360
# AWQ Tests
361361
@staticmethod
362362
@abstractmethod
363-
def get_awq_act_model(with_multiply, n_layers):
363+
def get_awq_act_model(is_3d_weights, with_multiply, n_layers):
364364
"Returns a backend model for test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul."
365365

366366
@staticmethod
@@ -372,13 +372,16 @@ def get_num_multiply_from_awq(model: TModel) -> int:
372372
def int4_mode(self, request):
373373
return None
374374

375+
@pytest.mark.parametrize("is_3d_weights", [True, False])
375376
@pytest.mark.parametrize("with_multiply", (True, False))
376-
def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int4_mode, with_multiply, mocker):
377+
def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(
378+
self, int4_mode, with_multiply, is_3d_weights, mocker
379+
):
377380
n_layers = 8
378381
n_awq_target = n_layers - 1 # first MatMul is always int8
379-
model = self.get_awq_act_model(with_multiply, n_layers)
382+
model = self.get_awq_act_model(is_3d_weights, with_multiply, n_layers)
380383

381-
dataset = Dataset([self.to_tensor(np.ones([1, 8, 8], dtype=np.float32))], self.get_transform_func())
384+
dataset = Dataset([self.to_tensor(np.ones([2, 8, 8], dtype=np.float32))], self.get_transform_func())
382385

383386
with SpyWeightCompressionStatisticsContext(mocker):
384387
model = compress_weights(model, mode=int4_mode, ratio=1.0, group_size=2, dataset=dataset, awq=True)
@@ -388,8 +391,11 @@ def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int
388391

389392
@staticmethod
390393
@abstractmethod
391-
def get_awq_model(non_mergable_pattern: bool) -> TModel:
392-
"Returns a backend model for test_awq_with_ignored_scope."
394+
def get_awq_model(non_mergable_pattern: bool, is_3d_weights: bool) -> TModel:
395+
"""
396+
Returns a backend model for test_awq_with_ignored_scope."
397+
:param is_3d_weights: The model has 3d weights
398+
"""
393399

394400
@staticmethod
395401
@abstractmethod
@@ -408,16 +414,19 @@ def get_num_int4_group_sizes(model: TModel) -> dict[int, int]:
408414

409415
@staticmethod
410416
@abstractmethod
411-
def get_ignored_scope_name() -> str:
417+
def get_ignored_scope_name(is_3d_weights) -> str:
412418
"Returns ignored scope name for test_awq_with_ignored_scope."
413419

414-
def test_awq_with_ignored_scope(self, mocker):
415-
model = self.get_awq_model(non_mergable_pattern=False)
420+
@pytest.mark.parametrize("is_3d_weights", [True, False])
421+
def test_awq_with_ignored_scope(self, mocker, is_3d_weights):
422+
model = self.get_awq_model(non_mergable_pattern=False, is_3d_weights=is_3d_weights)
416423
sz = 8
417424
n_samples = 10
418425

426+
input_shape = [2, 8, sz]
427+
419428
dataset = Dataset(
420-
[self.to_tensor(np.ones([1, 8, sz], dtype=np.float32)) for i in range(n_samples)],
429+
[self.to_tensor(np.ones(input_shape, dtype=np.float32)) for i in range(n_samples)],
421430
self.get_transform_func(),
422431
)
423432

@@ -429,12 +438,12 @@ def test_awq_with_ignored_scope(self, mocker):
429438
group_size=-1,
430439
dataset=dataset,
431440
awq=True,
432-
ignored_scope=IgnoredScope(names=[self.get_ignored_scope_name()]),
441+
ignored_scope=IgnoredScope(names=[self.get_ignored_scope_name(is_3d_weights)]),
433442
)
434443

435444
int4_ref_num_compressed = 4 # last MatMul is always int8; one - is ignored; total 6 matmuls
436445
int4_num_nodes = self.get_num_int4_nodes(compressed_model)
437-
assert int4_num_nodes == int4_ref_num_compressed
446+
assert int4_num_nodes == int4_ref_num_compressed, int4_num_nodes
438447

439448
def test_rope_weight_compression(self):
440449
model = self.get_RoPE_model()
@@ -490,12 +499,14 @@ def transpose_a_supported(self) -> bool:
490499

491500
# Transpose inputs does not affect mergable pattern code, skippting (True, False)
492501
@pytest.mark.parametrize("transpose_a,non_mergable_pattern", [(True, True), (False, True), (False, False)])
502+
@pytest.mark.parametrize("is_3d_weights", [True, False])
493503
def test_awq_scale_reference(
494504
self,
495505
non_mergable_pattern,
496506
transpose_a,
497507
test_awq_scale_ref,
498508
transpose_a_supported,
509+
is_3d_weights,
499510
monkeypatch,
500511
mocker,
501512
):
@@ -505,11 +516,14 @@ def test_awq_scale_reference(
505516
msg = "Transpose a is not supported for the current backend"
506517
pytest.skip(msg)
507518

508-
INPUT_SHAPE = (2, 4)
509-
model = self.get_transposable_awq_model(transpose_a=True, transpose_b=True, input_shape=INPUT_SHAPE)
519+
INPUT_SHAPE = (2, 2, 4) if is_3d_weights else (2, 4)
520+
model = self.get_transposable_awq_model(
521+
transpose_a=True, transpose_b=True, input_shape=INPUT_SHAPE, is_3d_weights=is_3d_weights
522+
)
510523
else:
511-
INPUT_SHAPE = (1, 4, 8)
512-
model = self.get_awq_model(non_mergable_pattern)
524+
batch_size = 1 if not is_3d_weights else 2
525+
INPUT_SHAPE = (batch_size, 4, 8)
526+
model = self.get_awq_model(non_mergable_pattern, is_3d_weights)
513527
input = 0.01 * np.arange(0, np.multiply.reduce(INPUT_SHAPE), dtype=np.float32).reshape(INPUT_SHAPE) + 0.02
514528
input = self.to_tensor(input)
515529
dataset = Dataset([input] * 2, self.get_transform_func())
@@ -526,7 +540,7 @@ def test_awq_scale_reference(
526540
)
527541
assert spy_instance is not None
528542
for node_name, scales in spy_instance._scale_per_target_node.items():
529-
ref = test_awq_scale_ref[node_name]
543+
ref = test_awq_scale_ref[is_3d_weights][node_name]
530544
assert fns.allclose(scales, ref)
531545
assert scales.shape == ref.shape
532546

@@ -652,14 +666,15 @@ def test_group_size_fallback_modes(
652666
f"Expected {ref_num_group_sizes} group size values, but got {num_group_sizes}."
653667
)
654668

655-
@pytest.mark.parametrize("dataset", [None, np.ones([1, 8, 8], dtype=np.float32)])
669+
@pytest.mark.parametrize("is_3d_weights", [True, False])
670+
@pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)])
656671
@pytest.mark.parametrize("prefer_data_aware_scaling", [True, False])
657-
def test_data_free_awq(self, dataset, prefer_data_aware_scaling, mocker):
658-
input_data = np.ones([1, 8, 8], dtype=np.float32)
672+
def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
673+
input_data = np.ones([2, 8, 8], dtype=np.float32)
659674

660675
n_layers = 8
661676
n_awq_target = n_layers - 1 # first MatMul is always int8
662-
model = self.get_awq_act_model(True, n_layers)
677+
model = self.get_awq_act_model(is_3d_weights, True, n_layers)
663678
model = self.wrap_model(model, input_data)
664679

665680
if dataset is not None:
@@ -778,7 +793,9 @@ def test_process_stats(self, case: ProcessStatsTestCase):
778793

779794
@staticmethod
780795
@abstractmethod
781-
def get_transposable_awq_model(transpose_a: bool, transpose_b: bool, input_shape=None) -> TModel:
796+
def get_transposable_awq_model(
797+
transpose_a: bool, transpose_b: bool, input_shape=None, is_3d_weights: bool = False
798+
) -> TModel:
782799
"Returns a backend model for test_compression_with_transpose."
783800

784801
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)