
Commit 6dee0b0

[WC] WC/Mixed Precision/AWQ transpose_a support (#3794)

### Changes

* Weight compression / mixed precision `transpose_a` support
* AWQ `transpose_a` support
* `process_statistics` `transpose_a` param support
* [ONNX] AWQ Gemm support

### Reason for changes

* To apply WC / mixed precision / AWQ to the Mamba model family

### Related tickets

173277

### Tests

* tests/cross_fw/test_templates/template_test_weights_compression.py::test_mixed_precision expanded with a `transpose_a` param to check the base WC / mixed precision algo
* tests/cross_fw/test_templates/template_test_weights_compression.py::test_awq_scale_reference expanded with `transpose_a` and `non_mergable_pattern` to check the non-mergeable AWQ branch and activation transpose support
* tests/cross_fw/test_templates/template_test_weights_compression.py::test_process_stats refactored to test `act_ch_axis` support in the `process_statistics` fn
* tests/cross_fw/test_templates/template_test_weights_compression.py::test_compression_skipped_with_transposed_activations moved to common to test that ONNX/OV fail with an appropriate error when an unsupported `transpose_a` model is supplied to an algorithm

---------

Co-authored-by: andreyanufr <andrey.anufriev@intel.com>
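For context on what `transpose_a` means here: a MatMul with `transpose_a=True` computes `Y = transpose(A) @ W`, so the activation's hidden (channel) dimension is the second-to-last axis instead of the last one. A minimal NumPy sketch of the shape consequence (hypothetical shapes, not code from this PR):

```python
import numpy as np

tokens, hidden, out_dim = 128, 768, 3072
weight = np.random.rand(hidden, out_dim)

a_regular = np.random.rand(tokens, hidden)     # channel axis is -1
a_transposed = np.random.rand(hidden, tokens)  # channel axis is -2 (transpose_a layout)

y_regular = a_regular @ weight                 # plain MatMul
y_transposed = a_transposed.T @ weight         # what MatMul(transpose_a=True) computes
assert y_regular.shape == y_transposed.shape == (tokens, out_dim)
```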
1 parent 7438f86 commit 6dee0b0

File tree

20 files changed: +659 −170 lines changed


src/nncf/quantization/algorithms/weight_compression/activation_stats.py

Lines changed: 5 additions & 2 deletions

@@ -17,12 +17,13 @@
 from nncf.tensor import functions as fns


-def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, Tensor]:
+def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]:
     """
     A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms.

     :param stats: An object containing statistics for the layer.
     :param subset_size: The number of samples for AWQ.
+    :param act_ch_axis: The activation channel axis.
     :return: tuple of the following tensors:
         s - maximum channel magnitude across samples [HiddenDim]
         X - average channel magnitude across tokens in the sequence [HiddenDim, min(SampleSize, ~subset_size)]

@@ -41,7 +42,9 @@ def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, T

     # Prevent high memory and time consumption by sampling
     if X_full.shape[sample_axis] > subset_size:
-        lens = [reduce(mul, shape[:-1], 1) for shape in stats.shape_values]
+        lens = [
+            reduce(mul, shape[:act_ch_axis] + shape[act_ch_axis % len(shape) + 1 :], 1) for shape in stats.shape_values
+        ]
         step = X_full.shape[sample_axis] // subset_size
         idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
         X = X_full[..., idxs]
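The new `lens` expression generalizes the old `reduce(mul, shape[:-1], 1)` from "product of all but the last dimension" to "product of all but the channel dimension". A small standalone sketch of the same arithmetic (hypothetical helper name, plain Python):

```python
from functools import reduce
from operator import mul

def non_channel_size(shape: tuple[int, ...], act_ch_axis: int = -1) -> int:
    # Equivalent to reduce(mul, shape[:act_ch_axis] + shape[act_ch_axis % len(shape) + 1:], 1):
    # multiply every dimension except the channel axis.
    ch = act_ch_axis % len(shape)  # normalize a negative axis to its positive index
    return reduce(mul, (d for i, d in enumerate(shape) if i != ch), 1)

# Default act_ch_axis=-1 reproduces the old reduce(mul, shape[:-1], 1):
assert non_channel_size((2, 128, 768)) == 2 * 128
# With transposed activations the channel axis can be -2 instead:
assert non_channel_size((2, 768, 128), act_ch_axis=-2) == 2 * 128
```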

src/nncf/quantization/algorithms/weight_compression/algorithm.py

Lines changed: 31 additions & 17 deletions

@@ -959,9 +959,9 @@ def get_weight_compression_parameters(
             # MoE operations are usually matmuls, so the check for matmul metatype is done
             # This is to avoid raising the error for non-MoE cases with 3D weights.
             parsed_ov_version = f"{ov_version[0]}.{ov_version[1]}.{ov_version[2]}-{ov_version[3]}"
-            msg = f"""NNCF compression algorithms do not support 3D weights with current version of
-            OpenVINO {parsed_ov_version} due to a known issue in statistics collection
-            Ticket - 176465. Please update to the latest OpenVINO nightly version.
+            msg = f"""NNCF compression algorithms do not support 3D weights with current version of
+                OpenVINO {parsed_ov_version} due to a known issue in statistics collection
+                Ticket - 176465. Please update to the latest OpenVINO nightly version.
             Node with weight: {node.node_name}."""
             raise nncf.UnsupportedModelError(msg)

@@ -1087,6 +1087,11 @@ def apply_with_parameters(
         )

         if self._lora_correction:
+            for wc_params in all_weight_params:
+                if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph):
+                    msg = "Transposed activations are not supported yet for the LoRA correction algorithm"
+                    raise nncf.UnsupportedModelError(msg)
+
             lora_correction_params = self._advanced_parameters.lora_correction_params
             lora_correction_algo = LoraCorrectionAlgorithm(statistics, lora_correction_params)
             description += " with correction of low-rank adapters"

@@ -1128,19 +1133,21 @@ def apply_with_parameters(
         )
         return transformed_model

-    def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int]:
+    def _get_activation_node_port_and_channel(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int, int]:
         """
-        This method returns the activation layer and corresponding port id for the node.
+        This method returns the activation layer, corresponding port id and channel axis for the given node.

         :param node: NNCFGraph node for which the activation is sought.
         :param nncf_graph: NNCFGraph instance with the node.
-        :return: Tuple with the activation node and port id.
+        :return: Tuple with the activation node, port id and channel axis.
         """
         activation_port = self._backend_entity.get_activation_port_id(node, nncf_graph)
         activation_edge = nncf_graph.get_input_edge_by_port_id(node, activation_port)
         activation_node = activation_edge.from_node
-        port_id = activation_edge.output_port_id
-        return activation_node, port_id
+        activation_channel_axis = self._backend_entity.get_activation_channel_axis(
+            node, activation_edge.input_port_id, activation_edge.tensor_shape
+        )
+        return activation_node, activation_edge.output_port_id, activation_channel_axis

     def get_matmul_input_to_output_nodes_map(
         self, matmul_nodes: list[NNCFNode], graph: NNCFGraph

@@ -1161,8 +1168,8 @@ def get_matmul_input_to_output_nodes_map(
         """
         matmul_input_to_output_nodes_map = defaultdict(list)
         for node in matmul_nodes:
-            act_node, output_port_id = self._get_activation_node_and_port(node, graph)
-            matmul_input_to_output_nodes_map[(act_node, output_port_id)].append(node)
+            act_node, output_port_id, act_channel_axis = self._get_activation_node_port_and_channel(node, graph)
+            matmul_input_to_output_nodes_map[(act_node, output_port_id, act_channel_axis)].append(node)
         return matmul_input_to_output_nodes_map

     def get_compression_nodes_info(

@@ -1230,7 +1237,11 @@ def get_statistic_points(

         # Statistics for data aware algorithms
         if self._data_aware_compression:
-            for (node, output_port_id), node_with_weights in matmul_input_to_output_nodes_map.items():
+            for (
+                node,
+                output_port_id,
+                input_channel_axis,
+            ), node_with_weights in matmul_input_to_output_nodes_map.items():
                 statistic_point = self._backend_entity.target_point(
                     TargetType.POST_LAYER_OPERATION, node.node_name, port_id=output_port_id
                 )

@@ -1245,13 +1256,16 @@ def get_statistic_points(
                 ]
                 all_weight_dims.extend(weight_dims)

-                # by default, reduce activations across all but the last dimension. The last dimension is
-                # assumed to be the hidden size dimension.
+                # Reduce activations across all but the hidden dimension.
                 n_dims = len(graph.get_output_edges_by_port_id(node, output_port_id)[0].tensor_shape)
-                reduction_axes = tuple(range(n_dims - 1))
+                # negative axis (e.g. -1 for the last axis) is converted into the corresponding positive value
+                input_channel_axis = input_channel_axis % n_dims
+                reduction_axes = tuple(i for i in range(n_dims) if i != input_channel_axis)

-                # For 3D weights, hidden dimension is the second dimension. Reduce by all other dimensions
-                reduction_axes = (1,) if any(weight_dim == 3 for weight_dim in all_weight_dims) else reduction_axes
+                # For 3D weights, keep the batch dimension
+                if any(weight_dim == 3 for weight_dim in all_weight_dims):
+                    assert len(reduction_axes) == 2
+                    reduction_axes = reduction_axes[1:]

                 stat_collector = self._backend_entity.mean_statistic_collector(
                     reduction_axes=reduction_axes, subset_size=self._subset_size

@@ -1291,7 +1305,7 @@ def _get_statistics_for_weights_compression(
         # Where mean_value is a 1D tensor representing an activation reduced over batch and sequence length dimensions,
         # shape is an original shape of an activation before reduction, n is the size of the dataset (or subset_size).
         statistics = {}
-        for (act_node, output_port_id), matmul_nodes in matmul_input_to_output_nodes_map.items():
+        for (act_node, output_port_id, _), matmul_nodes in matmul_input_to_output_nodes_map.items():
             tensor_collectors = list(
                 statistic_points.get_algo_statistics_for_node(
                     act_node.node_name,
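The key change above is how `reduction_axes` is derived: instead of hard-coding "all but the last axis", statistics are now reduced over every axis except the activation channel axis. A standalone sketch of that selection logic (hypothetical function name):

```python
def reduction_axes_for_activation(n_dims: int, input_channel_axis: int) -> tuple[int, ...]:
    # Reduce over every axis except the channel (hidden) axis.
    input_channel_axis = input_channel_axis % n_dims  # normalize a negative axis
    return tuple(i for i in range(n_dims) if i != input_channel_axis)

# Default layout [batch, seq, hidden]: channel axis -1 -> reduce over (0, 1)
assert reduction_axes_for_activation(3, -1) == (0, 1)
# transpose_a layout [batch, hidden, seq]: channel axis -2 -> reduce over (0, 2)
assert reduction_axes_for_activation(3, -2) == (0, 2)
```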

src/nncf/quantization/algorithms/weight_compression/awq.py

Lines changed: 23 additions & 7 deletions

@@ -170,6 +170,8 @@ def apply(
         weight_dtype = weight.dtype
         weight = weight.astype(TensorDataType.float32)

+        act_ch_axis, act_shape = self._get_activation_channel_axis_and_shape(graph, wp)
+
         if is_data_free:
             scale = self._data_free_step(weight, 1 - wp.reduction_axes[0])
         else:

@@ -181,24 +183,28 @@ def apply(
                 prev_weight = self._backend_entity.get_weight(merge_node, prev_weight_port_id, model, graph)

                 prev_statistics = statistics[merge_node.node_name]
-            scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics)
+            scale = self._data_aware_step(wp, weight, statistics[k], act_ch_axis, prev_weight, prev_statistics)

         w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0])
-        a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0])
+        a_scale = 1.0 / scale

         scaled_weight = (weight * w_scale).astype(weight_dtype)
         self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight)

         if is_mergeable:  # for MatMul->Multiply->MatMul pattern the scale is merged to the first MatMul
             for _, port_id in self._backend_entity.get_weight_names_and_port_ids(merge_node, graph):
                 merge_weight = self._backend_entity.get_weight(merge_node, port_id, model, graph)
+                a_scale = fns.unsqueeze(a_scale, wp.reduction_axes[0])
                 merge_weight = (merge_weight * a_scale).astype(weight_dtype)
                 self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight)
-            a_scale = fns.transpose(a_scale)
         else:  # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node
-            a_scale = fns.transpose(a_scale).astype(weight_dtype)
+            # Calculate the activation scale shape
+            a_scale_shape = [scale.shape[0] if axis == act_ch_axis else 1 for axis in range(len(act_shape))]
+            a_scale = fns.reshape(a_scale, tuple(a_scale_shape))
+
             next_nodes = graph.get_next_nodes(merge_node)
             source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id
+
             scale_insertion_command = self._backend_entity.scale_insertion_command(
                 merge_node, next_nodes, source_node_output_port, a_scale.data
             )

@@ -210,10 +216,10 @@ def apply(

         return transformed_model

-    def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statistics=None):
+    def _data_aware_step(self, wp, weight, statistics, act_ch_axis, prev_weight=None, prev_statistics=None):
         alpha_step = (self._alpha_max - self._alpha_min) / self._steps
         config = wp.compression_config
-        s, X = process_stats(statistics, self._subset_size)
+        s, X = process_stats(statistics, self._subset_size, act_ch_axis)
         s = s.astype(TensorDataType.float32)
         X = X.astype(TensorDataType.float32)

@@ -222,7 +228,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis

         prev_s, prev_w = None, None
         if prev_statistics is not None and prev_weight is not None:
-            prev_s, _ = process_stats(prev_statistics, self._subset_size)
+            prev_s, _ = process_stats(prev_statistics, self._subset_size, act_ch_axis)
             prev_s = prev_s.astype(TensorDataType.float32).max().item()
             prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis)

@@ -311,6 +317,16 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis

         return scale

+    def _get_activation_channel_axis_and_shape(
+        self, graph: NNCFGraph, wp: WeightCompressionParameters
+    ) -> tuple[int, tuple[int, ...]]:
+        activation_port_id = self._backend_entity.get_activation_port_id(wp.node_with_weight, graph)
+        act_shape = graph.get_input_edge_by_port_id(wp.node_with_weight, activation_port_id).tensor_shape
+        act_ch_axis = self._backend_entity.get_activation_channel_axis(
+            wp.node_with_weight, activation_port_id, act_shape
+        )
+        return act_ch_axis % len(act_shape), act_shape
+
     @staticmethod
     def _clamp_scale(magnitudes, threshold, scale, clamped_scale):
         return fns.where(magnitudes < threshold, scale, clamped_scale)
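In the non-mergeable branch, the activation scale is no longer transposed; it is reshaped so that its only non-unit dimension lands on the activation channel axis, which lets the inserted Multiply broadcast correctly for both regular and transposed activations. A NumPy sketch of that reshape (hypothetical helper and shapes):

```python
import numpy as np

def activation_scale(scale: np.ndarray, act_shape: tuple[int, ...], act_ch_axis: int) -> np.ndarray:
    # Mirror of the new branch: invert the scale and place it on the channel axis,
    # with size-1 dimensions everywhere else so it broadcasts over the activation.
    a_scale = 1.0 / scale
    a_scale_shape = [scale.shape[0] if axis == act_ch_axis else 1 for axis in range(len(act_shape))]
    return a_scale.reshape(a_scale_shape)

hidden = 768
# Regular [batch, seq, hidden] activation, channel axis 2:
assert activation_scale(np.ones(hidden), (1, 128, hidden), 2).shape == (1, 1, hidden)
# Transposed [batch, hidden, seq] activation, channel axis 1:
assert activation_scale(np.ones(hidden), (1, hidden, 128), 1).shape == (1, hidden, 1)
```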

src/nncf/quantization/algorithms/weight_compression/backend.py

Lines changed: 23 additions & 0 deletions

@@ -110,6 +110,17 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: TMo
         :return: The weight tensor.
         """

+    @abstractmethod
+    def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+        """
+        Checks whether the activation input of a MatMul operation is transposed.
+
+        :param matmul: MatMul NNCFGraph node.
+        :param graph: The model graph associated with the model.
+        :return: True if the node is a MatMul node and its activation input is transposed,
+            False otherwise.
+        """
+
     @abstractmethod
     def get_weight_dtype(
         self, node_with_weight: NNCFNode, weight_port_id: int, model: TModel, graph: NNCFGraph

@@ -273,6 +284,18 @@ def get_ignored_patterns() -> GraphPattern:
         :return: backend-specific ignored patterns.
         """

+    @staticmethod
+    @abstractmethod
+    def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+        """
+        Returns the axis of the activation tensor that corresponds to its channel.
+
+        :param node: NNCFNode instance.
+        :param port_id: Port ID for input.
+        :param input_shape: Shape of the input.
+        :return: Channel axis number.
+        """
+

 class AWQAlgoBackend(WeightCompressionAlgoBackend):
     @staticmethod
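A hypothetical minimal implementation of the two new abstract methods, only to illustrate the contract (the real ONNX and OpenVINO implementations follow below; attribute names here are assumptions, not NNCF API):

```python
class ToyBackend:  # hypothetical, not a real NNCF backend
    def matmul_has_transposed_activations(self, matmul, graph) -> bool:
        # A backend decides from its own graph metadata whether input A is transposed.
        return bool(getattr(matmul.layer_attributes, "transpose_a", False))

    @staticmethod
    def get_activation_channel_axis(node, port_id, input_shape) -> int:
        # Channel axis is the last one by default, second-to-last when transposed.
        return -2 if getattr(node.layer_attributes, "transpose_a", False) else -1
```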

src/nncf/quantization/algorithms/weight_compression/gptq.py

Lines changed: 5 additions & 0 deletions

@@ -124,6 +124,11 @@ def apply(
                 CompressWeightsMode.INT8_SYM,
             ]:
                 continue
+
+            if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph):
+                msg = "Transposed activations are not supported yet for the GPTQ algorithm"
+                raise nncf.UnsupportedModelError(msg)
+
             _, input_tensors = next(iter(inputs.items()))
             hessian = self._calculate_hessian(node, input_tensors)
             scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors)

src/nncf/quantization/algorithms/weight_compression/mixed_precision.py

Lines changed: 1 addition & 1 deletion

@@ -279,7 +279,7 @@ def get_statistic_points(
         self._set_backend_entity(model)

         statistic_container = StatisticPointsContainer()
-        for act_node, output_port_id in nodes_and_port_ids:
+        for act_node, output_port_id, _ in nodes_and_port_ids:
            n_dims = len(graph.get_output_edges_by_port_id(act_node, output_port_id)[0].tensor_shape)
            if n_dims < 2:
                msg = (

src/nncf/quantization/algorithms/weight_compression/onnx_backend.py

Lines changed: 19 additions & 3 deletions

@@ -38,6 +38,7 @@
 from nncf.onnx.graph.model_transformer import remove_initializer
 from nncf.onnx.graph.model_transformer import remove_node
 from nncf.onnx.graph.model_transformer import set_initializer
+from nncf.onnx.graph.node_utils import get_act_quantization_axis
 from nncf.onnx.graph.node_utils import get_weight_quantization_axis
 from nncf.onnx.graph.onnx_helper import ONNX_DTYPE_TO_NNCF_DTYPE
 from nncf.onnx.graph.onnx_helper import get_name_to_node_map

@@ -186,6 +187,13 @@ def get_weight(
         weight_tensor = get_tensor_value(model, weight_name)
         return Tensor(weight_tensor)

+    def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+        if matmul.metatype != metatypes.ONNXGemmMetatype:
+            return False
+        act_port_id = self.get_activation_port_id(matmul, graph)
+        trans_attr = "transB" if act_port_id else "transA"
+        return matmul.layer_attributes.node_attrs[trans_attr]
+
     def get_weight_dtype(
         self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph
     ) -> TensorDataType:

@@ -301,6 +309,10 @@ def filter_func(point: StatisticPoint) -> bool:

         return filter_func

+    @staticmethod
+    def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+        return get_act_quantization_axis(node, port_id)
+
     def insert_adapters(
         self, wc_params: WeightCompressionParameters, lora_A: Tensor, lora_B: Tensor, int8_lora: bool
     ) -> None:

@@ -503,9 +515,13 @@ def get_ignored_patterns() -> GraphPattern:
 class ONNXAWQAlgoAlgoBackend(AWQAlgoBackend, ONNXWeightCompressionAlgoBackend):
     @staticmethod
     def get_awq_patterns() -> dict[str, Callable]:
-        return get_awq_patterns(
-            onnx_metatypes.ONNXMatMulMetatype, onnx_metatypes.ONNXMulLayerMetatype, ATOMIC_ACTIVATIONS_OPERATIONS
-        )
+        patterns = {}
+        for mm_metatype in (onnx_metatypes.ONNXMatMulMetatype, onnx_metatypes.ONNXGemmMetatype):
+            p = get_awq_patterns(mm_metatype, onnx_metatypes.ONNXMulLayerMetatype, ATOMIC_ACTIVATIONS_OPERATIONS)
+            p = {f"{mm_metatype.__name__}_{k}": v for k, v in p.items()}
+            patterns.update(p)
+
+        return patterns

     @staticmethod
     def scale_insertion_command(
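The ONNX implementation relies on Gemm semantics: Gemm computes `Y = alpha * op(A) @ op(B) + beta * C`, where `op()` transposes its input when the `transA`/`transB` attribute is set. Since the activation may arrive on either input port, the attribute to inspect depends on the activation port id; a short sketch of the mapping used above (hypothetical function name):

```python
def gemm_transpose_attr_for_activation(act_port_id: int) -> str:
    # Gemm: Y = alpha * op(A) @ op(B) + beta * C; op() transposes when
    # transA / transB is set. Pick the attribute matching the activation port.
    return "transB" if act_port_id else "transA"

assert gemm_transpose_attr_for_activation(0) == "transA"  # activation is input A
assert gemm_transpose_attr_for_activation(1) == "transB"  # activation is input B
```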

src/nncf/quantization/algorithms/weight_compression/openvino_backend.py

Lines changed: 10 additions & 4 deletions

@@ -13,7 +13,6 @@
 import openvino as ov
 from openvino import opset13 as opset

-import nncf
 from nncf.common.graph import NNCFGraph
 from nncf.common.graph import NNCFNode
 from nncf.common.graph.operator_metatypes import OperatorMetatype

@@ -35,6 +34,7 @@
 from nncf.openvino.graph.node_utils import convert_op
 from nncf.openvino.graph.node_utils import create_ov_codebook_subgraph
 from nncf.openvino.graph.node_utils import create_ov_const_from_tensor
+from nncf.openvino.graph.node_utils import get_activation_channel_axis
 from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor
 from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
 from nncf.openvino.graph.node_utils import get_weight_channel_axes

@@ -119,9 +119,6 @@ def mean_statistic_collector(

     @staticmethod
     def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int:
-        if node.layer_attributes.input_attributes["transpose"]:
-            msg = "Transposed input is not supported"
-            raise nncf.UnsupportedModelError(msg)
         constant_ports = node.layer_attributes.get_const_port_ids()
         activation_ports = [
             e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports

@@ -143,6 +140,11 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.
         weight_tensor = get_const_value_as_numpy_tensor(weight_node)
         return Tensor(weight_tensor)

+    def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+        if matmul.metatype != om.OVMatMulMetatype:
+            return False
+        return matmul.layer_attributes.input_attributes["transpose"]
+
     def get_weight_dtype(
         self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph
     ) -> TensorDataType:

@@ -378,6 +380,10 @@ def get_ignored_patterns() -> GraphPattern:
         pattern.add_pattern_alternative(create_sam_pe())
         return pattern

+    @staticmethod
+    def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+        return get_activation_channel_axis(node, port_id, input_shape)
+

 class OVTensorWeightCompressionAlgoBackend(OVWeightCompressionAlgoBackend):
     """

src/nncf/quantization/algorithms/weight_compression/scale_estimation.py

Lines changed: 4 additions & 0 deletions

@@ -139,6 +139,10 @@ def apply(
                 continue
             _, weight_port_id = weight_data[0]

+            if self._backend_entity.matmul_has_transposed_activations(wp.node_with_weight, graph):
+                msg = "Transposed activations are not supported yet for the Scale Estimation algorithm"
+                raise nncf.UnsupportedModelError(msg)
+
             weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)

             scale, zero_point = self.calculate_quantization_params(
