# Commit 09246b2: Support 3D Weights in SE Algorithm (#3706)
### Changes

Support 3D (e.g. MoE expert) weights in the Scale Estimation (SE) algorithm.

### Reason for changes

Enable data-aware weight compression with Scale Estimation for models whose MatMul weights are 3D, such as MoE expert layers.

### Related tickets

175212

### Tests

- PR Performance Job: post_training_weight_compression_performance - 57
- Develop Branch Performance Job: post_training_weight_compression_performance - 58
- WC Conformance Test: https://github.com/openvinotoolkit/nncf/actions/runs/19161281090: Pass

**Model: Qwen/Qwen3-30B-A3B** (NNCF backend: OpenVINO; higher is better)

Task: gsm8k. Limit: 100. Max new tokens: 10000. n-shots: 5 (default). OpenVINO version: 2026.0.0.dev20251111 (with WA for 176465).

| Precision Type | Filter | Value | Stderr |
| -- | -- | -- | -- |
| INT4 SYM Per-Channel with Scale Estimation (calibrated on wikitext, 128 samples) | flexible-extract | 0.66 | 0.0476 |
| | strict-match | 0.38 | 0.0488 |
| INT4 SYM Per-Channel | flexible-extract | 0.77 | 0.0423 |
| | strict-match | 0.28 | 0.0451 |
| FP16 | flexible-extract | 0.91 | 0.0288 |
| | strict-match | 0.86 | 0.0349 |

WWB results with reasoning disabled:

- INT4 Sym Per-Channel: 0.826173 (vs FP16)
- INT4 Sym Per-Channel with SE: 0.938537 (vs FP16)

**Model: openai/gpt-oss-20b** (NNCF backend: Torch; higher is better)

```
time \
accelerate launch -m lm_eval --model hf \
  --model_args "{\"pretrained\":\"${MODEL_DIR}\",\"enable_thinking\":false}" \
  --tasks gsm8k_cot_llama \
  --fewshot_as_multiturn \
  --apply_chat_template=True \
  --device cuda \
  --limit 100 \
  --output_path $EXP_DIR \
  --gen_kwargs max_new_tokens=1024,temperature=0.6,top_p=0.95,top_k=20
```

| gpt-oss-20b | strict-match | flexible-extract |
| -- | -- | -- |
| bf16 | 0.96 | 0.96 |
| int4_sym_gs32_experts_int8_the_rest | 0.96 | 0.94 |
| int4_sym_gs32_int8_the_rest | 0.64 | 0.96 |
| int4_sym_gs32_int8_the_rest_SE_32tokens_128samples_64se_samples | 0.79 | 0.96 |
| int4_sym_gs32_int8_the_rest_SE_128tokens_128samples_64se_samples | 0.81 | 0.96 |
| int4_sym_gs32_int8_the_rest_SE_half_128tokens_128samples_64se_samples | 0.83 | 0.97 |
| int4_sym_gs32_int8_the_rest_SE_half_256tokens_256samples_64se_samples | 0.79 | 0.95 |
| int4_sym_gs32_int8_the_rest_SE_256tokens_256samples_64se_samples | 0.90 | 0.96 |
| int4_sym_gs32_int8_the_rest_SE_256tokens_256samples_256se_samples | 0.95 | 0.94 |

---------

Co-authored-by: Daniil Lyakhov <[email protected]>
1 parent: 6d28a5d

**12 files changed:** +428 −48 lines


#### src/nncf/onnx/graph/node_utils.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -138,6 +138,8 @@ def get_weight_quantization_axis(node: NNCFNode, port_id: int) -> int:
     transpose = node.layer_attributes.node_attrs[trans_attr]
     # 0 - (M, K), 1 - (K, N)
     weight_channel_axis = -1 - port_id if transpose else -2 + port_id
+    if node.metatype == om.ONNXMatMulMetatype:
+        weight_channel_axis = -1 - port_id if port_id == 0 else -2 + port_id
     return weight_channel_axis
```
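For intuition, here is a standalone sketch of the branch above; the metatype check and layer attributes are NNCF internals, modeled here as plain arguments (illustrative only, not the NNCF API):

```python
# Standalone sketch of the channel-axis selection above; the metatype check
# and layer attributes are modeled as plain arguments (illustrative only).

def weight_channel_axis(port_id: int, transpose: bool, is_plain_matmul: bool) -> int:
    # Gemm-style ops: 0 - (M, K), 1 - (K, N); a transA/transB attribute flips the axis.
    axis = -1 - port_id if transpose else -2 + port_id
    # Plain ONNX MatMul carries no transpose attributes, so it gets a separate rule.
    if is_plain_matmul:
        axis = -1 - port_id if port_id == 0 else -2 + port_id
    return axis

# Negative axes are what let this extend to 3D [num_experts, K, N] MoE weights:
# the axis is counted from the end, so a leading experts axis changes nothing.
print(weight_channel_axis(1, transpose=False, is_plain_matmul=True))  # -1
```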

#### src/nncf/quantization/algorithms/weight_compression/activation_stats.py

Lines changed: 26 additions & 9 deletions

```diff
@@ -27,17 +27,34 @@ def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, T
     s - maximum channel magnitude across samples [HiddenDim]
     X - average channel magnitude across tokens in the sequence [HiddenDim, min(SampleSize, ~subset_size)]
     """
-    X = fns.stack(stats.mean_values)  # [SampleSize, HiddenDim]
-    X_full = fns.transpose(X)  # [HiddenDim, SampleSize]
+    X = fns.stack(
+        stats.mean_values
+    )  # [SampleSize, HiddenDim] for 2-D or [SampleSize, No. of Experts, HiddenDim] for 3-D
 
-    # prevent high memory and time consumption
-    if X_full.shape[1] > subset_size:
-        # activations were reduced across all but the last dimension
+    # Move SampleSize to the last axis: [HiddenDim, SampleSize] or [No. of Experts, HiddenDim, SampleSize]
+    # General approach: move axis 0 to the end
+    axes = list(range(1, len(X.shape))) + [0]
+    X_full = fns.transpose(X, axes=axes)
+
+    # The sample dimension is always the last axis after transpose
+    sample_axis = -1
+
+    # Prevent high memory and time consumption by sampling
+    if X_full.shape[sample_axis] > subset_size:
+        # Activations were reduced across all but the last dimension
         lens = [reduce(mul, shape[:-1], 1) for shape in stats.shape_values]
-        step = X_full.shape[1] // subset_size
-        idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
-        X = X_full[:, idxs]  # [HiddenDim, ~SubsetSize]
+        step = X_full.shape[sample_axis] // subset_size
+        sorted_idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
+        idxs = [idx for idx in sorted_idxs if idx < X_full.shape[sample_axis]][:subset_size]
+
+        # Create index slices for all dimensions except the last one
+        # This works for both 2D and 3D (and theoretically any dimensionality)
+        index_slices = [slice(None)] * (len(X_full.shape) - 1) + [idxs]
+        X = X_full[tuple(index_slices)]
     else:
         X = X_full
-    s = fns.max(fns.abs(X_full), axis=1)  # [HiddenDim]
+
+    # Compute max magnitude along the sample axis (last axis)
+    # Result: [HiddenDim] or [No. of Experts, HiddenDim]
+    s = fns.max(fns.abs(X_full), axis=sample_axis)
     return s, X
```
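The axis bookkeeping can be exercised in isolation. A minimal NumPy sketch, with NumPy standing in for NNCF's `fns` tensor functions, and with illustrative shapes and `lens` values:

```python
import numpy as np

# Stand-in for fns.stack(stats.mean_values), 3D case:
# [SampleSize=5, NumExperts=2, HiddenDim=4].
X = np.random.rand(5, 2, 4)

# Move the sample axis (axis 0) to the end, exactly as in the diff.
axes = list(range(1, X.ndim)) + [0]
X_full = np.transpose(X, axes)  # [2, 4, 5]
sample_axis = -1

subset_size = 3
if X_full.shape[sample_axis] > subset_size:
    # Illustrative stand-in for ranking samples by their reduced lengths.
    lens = [10, 40, 30, 20, 50]
    step = X_full.shape[sample_axis] // subset_size
    sorted_idxs = [i for i, _ in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
    idxs = [i for i in sorted_idxs if i < X_full.shape[sample_axis]][:subset_size]
    # slice(None) on every axis but the last keeps the expert/hidden dims whole.
    index_slices = [slice(None)] * (X_full.ndim - 1) + [idxs]
    X_sub = X_full[tuple(index_slices)]
else:
    X_sub = X_full

# Max magnitude over samples: [NumExperts, HiddenDim].
s = np.abs(X_full).max(axis=sample_axis)
print(X_sub.shape, s.shape)  # (2, 4, 3) (2, 4)
```

The same path handles the 2D `[SampleSize, HiddenDim]` case, where the axis move degenerates to a plain matrix transpose.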

#### src/nncf/quantization/algorithms/weight_compression/algorithm.py

Lines changed: 53 additions & 9 deletions

```diff
@@ -14,7 +14,9 @@
 from collections import OrderedDict
 from collections import defaultdict
 from functools import reduce
-from typing import Any, Iterable, Optional, TypeVar
+from typing import Any, Optional, TypeVar
+
+from packaging import version
 
 import nncf
 from nncf import Dataset
@@ -786,6 +788,14 @@ def is_weight_compression_supported(
 
         return is_supported_dtype and not no_bit_reduction
 
+    def _maybe_get_ov_major_version(self) -> Optional[str]:
+        try:
+            import openvino as ov
+
+            return ov.__version__.split(".")[0]
+        except Exception:
+            return None
+
     def get_weight_compression_parameters(
         self,
         model: TModel,
@@ -851,6 +861,22 @@ def get_weight_compression_parameters(
                     f"node name: {node.node_name}. The node will be in {self._backup_mode} mode."
                 )
 
+            model_backend = get_backend(model)
+            ov_version = self._maybe_get_ov_major_version()
+            if (
+                model_backend == BackendType.OPENVINO
+                and len(weight_shape) == 3
+                and ov_version
+                and version.parse(ov_version) <= version.parse("2026")
+                and node.metatype in self._backend_entity.matmul_metatypes
+            ):
+                # MoE operations are usually matmuls, so the check for matmul metatype is done
+                # This is to avoid raising the error for non-MoE cases with 3D weights.
+                msg = f"""NNCF does not support 3D weights with current version of Openvino {ov_version}
+                due to a known issue in statistics collection Ticket - 176465
+                Node with weight: {node.node_name}"""
+                raise nncf.UnsupportedModelError(msg)
+
             if self._backup_mode != BackupMode.NONE:
                 mode = (
                     CompressWeightsMode.INT8_ASYM
@@ -899,7 +925,7 @@ def get_weight_compression_parameters(
             matmul_nodes_to_compress, graph
         )
         if statistic_points is None:
-            statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
+            statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map)
             statistic_points = self._collect_statistics(dataset, graph, model, statistic_points)
         statistics = self._get_statistics_for_weights_compression(
             matmul_input_to_output_nodes_map, statistic_points
@@ -1089,28 +1115,46 @@ def get_statistic_points(
         self,
         model: TModel,
         graph: NNCFGraph,
-        nodes_and_port_ids: Iterable[tuple[NNCFNode, int]],
+        matmul_input_to_output_nodes_map: dict[tuple[NNCFNode, int], list[NNCFNode]],
     ) -> StatisticPointsContainer:
         """
         Returns statistic points, for which StatisticsCollector should collect statistics.
 
         :param model: Model for statistics collection.
         :param graph: Model graph.
-        :param nodes_and_port_ids: Nodes and port ids for which statistics should be collected.
+        :param matmul_input_to_output_nodes_map: A mapping from activation node and a port id to corresponding matmul
+            nodes which accept this activation as an input.
         :return: Statistic points, for which StatisticsCollector should collect statistics.
         """
         statistic_container = StatisticPointsContainer()
+
        # Statistics for data aware algorithms
         if self._data_aware_compression:
-            for node, output_port_id in nodes_and_port_ids:
+            for (node, output_port_id), node_with_weights in matmul_input_to_output_nodes_map.items():
                 statistic_point = self._backend_entity.target_point(
                     TargetType.POST_LAYER_OPERATION, node.node_name, port_id=output_port_id
                 )
-                # Reduce activations across all but the last dimension. The last dimension is assumed to be the hidden
-                # size dimension.
+                all_weight_dims = []
+                for node_with_weight in node_with_weights:
+                    _, weight_port_ids = zip(
+                        *self._backend_entity.get_weight_names_and_port_ids(node_with_weight, graph)
+                    )
+                    weight_dims = [
+                        len(self._backend_entity.get_weight_shape(node_with_weight, weight_port_id, graph))
+                        for weight_port_id in weight_port_ids
+                    ]
+                    all_weight_dims.extend(weight_dims)
+
+                # by default, reduce activations across all but the last dimension. The last dimension is
+                # assumed to be the hidden size dimension.
                 n_dims = len(graph.get_output_edges_by_port_id(node, output_port_id)[0].tensor_shape)
+                reduction_axes = tuple(range(n_dims - 1))
+
+                # For 3D weights, hidden dimension is the second dimension. Reduce by all other dimensions
+                reduction_axes = (1,) if any(weight_dim == 3 for weight_dim in all_weight_dims) else reduction_axes
+
                 stat_collector = self._backend_entity.mean_statistic_collector(
-                    reduction_axes=tuple(range(n_dims - 1)), subset_size=self._subset_size
+                    reduction_axes=reduction_axes, subset_size=self._subset_size
                 )
                 statistic_container.add_statistic_point(
                     StatisticPoint(
@@ -1120,7 +1164,7 @@ def get_statistic_points(
         # Statistics for mixed precision algorithm
         if self._data_aware_mixed_precision:
             mixed_precision_statistics = self._mixed_precision_algo.get_statistic_points(
-                model, graph, nodes_and_port_ids
+                model, graph, matmul_input_to_output_nodes_map.keys()
             )
             for points in mixed_precision_statistics.values():
                 for point in points:
```
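The reduction-axis decision above boils down to a few lines. A minimal sketch mirroring the diff's logic; `pick_reduction_axes` is a made-up helper name and the shapes are illustrative:

```python
# Mirrors the selection above: by default all activation axes except the last
# (hidden) one are reduced; if any consuming matmul holds a 3D weight, axis 1
# is used instead, per the diff.

def pick_reduction_axes(activation_ndim: int, weight_ndims: list[int]) -> tuple[int, ...]:
    reduction_axes = tuple(range(activation_ndim - 1))
    if any(ndim == 3 for ndim in weight_ndims):
        reduction_axes = (1,)
    return reduction_axes

print(pick_reduction_axes(3, [2, 2]))  # (0, 1) - classic 2D weights
print(pick_reduction_axes(3, [3]))     # (1,)   - MoE-style 3D weights
```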

#### src/nncf/quantization/algorithms/weight_compression/onnx_backend.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -144,6 +144,9 @@ def is_node_with_weights(node: NNCFNode, graph: NNCFGraph) -> bool:
     def get_reduction_axes(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Optional[tuple[int]]:
         channel_axes = (get_weight_quantization_axis(node_with_weight, weight_port_id),)
         const_shape = node_with_weight.layer_attributes.weight_attrs[weight_port_id]["shape"]
+        # Everything remains the same, except when 3D weights, reduce by batch dimension also.
+        if len(const_shape) == 3:
+            channel_axes = (0,) + channel_axes
         return get_reduction_axes(channel_axes, const_shape)
 
     @staticmethod
```
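A toy illustration of the extra channel axis; `complement` below is a stand-in for nncf's `get_reduction_axes` utility, assumed (from its usage here) to return the axes of the shape that are not channel axes:

```python
# Toy version of the change above: for a 3D ONNX weight the leading
# batch/experts axis is also treated as a channel axis, so it is excluded
# from reduction. complement() is an assumed stand-in, not the nncf API.

def complement(channel_axes: tuple[int, ...], shape: tuple[int, ...]) -> tuple[int, ...]:
    ndim = len(shape)
    keep = {ax % ndim for ax in channel_axes}
    return tuple(ax for ax in range(ndim) if ax not in keep)

channel_axes = (-1,)  # e.g. from get_weight_quantization_axis
print(complement(channel_axes, (768, 3072)))  # (0,)

const_shape = (8, 768, 3072)  # [num_experts, K, N]
if len(const_shape) == 3:
    channel_axes = (0,) + channel_axes
print(complement(channel_axes, const_shape))  # (1,)
```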

#### src/nncf/quantization/algorithms/weight_compression/scale_estimation.py

Lines changed: 28 additions & 22 deletions

```diff
@@ -196,11 +196,15 @@ def calculate_quantization_params(
         X = X.astype(TensorDataType.float32)
         weight = weight.astype(TensorDataType.float32)
         eps = fns.finfo(weight).eps
+        is_3d_weight = len(weight.shape) == 3
 
         was_transposed = False
-        if reduction_axis == 0:
-            weight = fns.transpose(weight)
-            reduction_axis = 1
+        if reduction_axis == 0 or (reduction_axis == 1 and is_3d_weight):
+            # Weights
+            # 3D: [num_experts, hidden_dimension, out_features] -> [num_experts, out_features, hidden_dimension]
+            # 2D: [hidden_dimension, out_features] -> [out_features, hidden_dimension]
+            weight = fns.moveaxis(weight, -1, -2)
+            reduction_axis = weight.ndim - 1
             was_transposed = True
 
         group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis]
@@ -220,7 +224,7 @@ def calculate_quantization_params(
         if zp is not None:
             zp = zp.astype(scale.dtype)
 
-        s = fns.unsqueeze(s, 0)
+        s = fns.unsqueeze(s, -2)
         s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size)
 
         original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size)
@@ -233,18 +237,20 @@ def calculate_quantization_params(
         importance = fns.where(zero_mask, 0.0, importance)
 
         # normalize importances for every group of weights to make sum of them equal to 1.0
-        denum = fns.sum(importance, axis=2, keepdims=True)
+        denum = fns.sum(importance, axis=-1, keepdims=True)
         importance = importance / (denum + eps)
 
-        X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size)
+        X, _ = reshape_weight_for_grouped_quantization(X, -2, group_size)
         best_diffs = None
         result_scale = None
-        fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X)
-        q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X)
+        fp_outs = fns.matmul(fns.moveaxis(original_weight, -2, -3), X)
+        q_outs = fns.matmul(fns.moveaxis(q_weights, -2, -3), X)
 
         # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE
+        # For 3D weights, it is [Batch Size, C_OUT, N_GROUPS]
         min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1)
-        min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0))
+        min_max_scale_diffs = fns.moveaxis(min_max_scale_diffs, -1, -2)
+
         if weight_penalty > 0.0:
             min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1)
@@ -272,10 +278,10 @@ def calculate_quantization_params(
             )
 
             q_weights_ = fns.zeros_like(original_weight) + out
-            q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X)
+            q_outs = fns.matmul(fns.moveaxis(q_weights_, -2, -3), X)
 
             ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1)
-            ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0))
+            ideal_scale_diffs = fns.moveaxis(ideal_scale_diffs, -1, -2)
             if weight_penalty > 0.0:
                 ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1)
@@ -286,7 +292,7 @@ def calculate_quantization_params(
 
             best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs
 
-            mask = fns.unsqueeze(mask, axis=2)
+            mask = fns.unsqueeze(mask, axis=-1)
 
             if result_scale is None:
                 near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale
@@ -340,17 +346,17 @@ def calculate_quantization_params(
             )
             q_weights_ = fns.zeros_like(original_weight) + out
 
-            q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X)
+            q_outs = fns.matmul(fns.moveaxis(q_weights_, -2, -3), X)
             ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1)
-            ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0))
+            ideal_scale_diffs = fns.moveaxis(ideal_scale_diffs, -1, -2)
             if weight_penalty > 0.0:
                 ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1)
 
             mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype)
 
             best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs
 
-            mask = fns.unsqueeze(mask, axis=2)
+            mask = fns.unsqueeze(mask, axis=-1)
 
             if result_scale is None:
                 near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale
@@ -359,19 +365,19 @@ def calculate_quantization_params(
             result_scale = near_to_ideal_scale
 
         if config.group_size == -1:
-            result_scale = fns.squeeze(result_scale, axis=1)
+            result_scale = fns.squeeze(result_scale, axis=-2)
         if zp is not None and config.group_size == -1:
-            zp = fns.squeeze(zp, axis=1)
+            zp = fns.squeeze(zp, axis=-2)
 
         if was_transposed:
             if config.group_size == -1:
-                result_scale = fns.transpose(result_scale)
+                result_scale = fns.moveaxis(result_scale, -1, -2)
                 if zp is not None:
-                    zp = fns.transpose(zp)
+                    zp = fns.moveaxis(zp, -1, -2)
             else:
-                result_scale = fns.transpose(result_scale, axes=(1, 2, 0))
+                result_scale = fns.moveaxis(result_scale, (-1, -2, -3), (-2, -3, -1))
                 if zp is not None:
-                    zp = fns.transpose(zp, axes=(1, 2, 0))
+                    zp = fns.moveaxis(zp, (-1, -2, -3), (-2, -3, -1))
 
         return result_scale, zp
@@ -421,5 +427,5 @@ def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importanc
     """
     ideal_scale = fns.abs(weight) / (fns.abs(target) + zero_mask)
     weighted_scale = ideal_scale * importance
-    near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True)
+    near_to_ideal_scale = fns.sum(weighted_scale, axis=-1, keepdims=True)
     return near_to_ideal_scale
```
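The recurring pattern in this file is replacing fixed-rank `transpose` calls with negative-axis `moveaxis`, which behaves identically in the 2D-weight case but transparently carries an extra leading experts axis. A quick NumPy check with illustrative shapes:

```python
import numpy as np

# moveaxis(w, -1, -2) swaps the last two axes for any rank, so one code path
# covers both 2D [K, N] and 3D [num_experts, K, N] weights.
w2 = np.zeros((16, 32))
w3 = np.zeros((4, 16, 32))
assert np.moveaxis(w2, -1, -2).shape == (32, 16)     # same as w2.T
assert np.moveaxis(w3, -1, -2).shape == (4, 32, 16)  # experts axis untouched

# transpose(x, (1, 0, 2)) on the grouped tensors equals moveaxis(x, -2, -3),
# and the moveaxis form keeps working when an experts axis is prepended.
x3 = np.zeros((16, 8, 32))     # [C_OUT, N_GROUPS, GROUP_SIZE]
x4 = np.zeros((4, 16, 8, 32))  # [num_experts, C_OUT, N_GROUPS, GROUP_SIZE]
assert np.moveaxis(x3, -2, -3).shape == x3.transpose(1, 0, 2).shape
assert np.moveaxis(x4, -2, -3).shape == (4, 8, 16, 32)
print("ok")
```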

#### src/nncf/quantization/statistics_caching.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -39,7 +39,7 @@ def register_statistics_for_algorithm(
     :param matmul_input_to_output_nodes_map: A dictionary mapping from a tuple of (activation node, port ID)
         to a list of MatMul nodes that accept the activation as input.
     """
-    statistic_points = compression_algo.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
+    statistic_points = compression_algo.get_statistic_points(model, graph, matmul_input_to_output_nodes_map)
     aggregator.register_statistic_points(statistic_points)
```
4545

#### tests/cross_fw/test_templates/template_test_weights_compression.py

Lines changed: 33 additions & 5 deletions

```diff
@@ -231,20 +231,40 @@ def get_model_for_test_scale_estimation() -> TModel:
         Returns a backend model for test_scale_estimation.
         """
 
+    @staticmethod
+    @abstractmethod
+    def get_moe_model_for_test_scale_estimation() -> TModel:
+        """
+        Returns a backend MoE model for test_scale_estimation with 3D weights.
+        """
+
+    @staticmethod
+    @abstractmethod
+    def get_moe_scale_estimation_ref() -> TTensor:
+        """
+        Returns the reference output of calculate_quantization_params for MoE model.
+        """
+
     @staticmethod
     @abstractmethod
     def get_scale_estimation_ref() -> TTensor:
         """
         Returns the reference output of calculate_quantization_params of ScaleEstimation.
         """
 
-    def test_scale_estimation(self, mocker):
+    @pytest.mark.parametrize("is_moe", [False, True])
+    def test_scale_estimation(self, mocker, is_moe):
         """Checks that scales match the reference."""
         calc_q_params_spy = mocker.spy(ScaleEstimation, "calculate_quantization_params")
-        model = self.get_model_for_test_scale_estimation()
+
+        if is_moe:
+            model = self.get_moe_model_for_test_scale_estimation()
+            input = np.arange(0, 2 * 4 * 8, dtype=np.float32).reshape(2, 4, 8)
+        else:
+            model = self.get_model_for_test_scale_estimation()
+            input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8)
 
         # prepare dataset with one input tensor
-        input = np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8)
         input = self.to_tensor(input)
         dataset = Dataset([input], self.get_transform_func())
@@ -258,8 +278,15 @@ def test_scale_estimation(self, mocker):
             all_layers=True,
             dataset=dataset,
         )
-        reference = self.get_scale_estimation_ref()
-        assert fns.allclose(Tensor(reference), calc_q_params_spy.spy_return[0])
+
+        computed_scale = calc_q_params_spy.spy_return[0]
+
+        if is_moe:
+            reference = self.get_moe_scale_estimation_ref()
+        else:
+            reference = self.get_scale_estimation_ref()
+
+        assert fns.allclose(Tensor(reference), computed_scale)
 
     @staticmethod
     @abstractmethod
@@ -328,6 +355,7 @@ def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int
         model = self.get_awq_act_model(with_multiply, n_layers)
 
         dataset = Dataset([self.to_tensor(np.ones([1, 8, 8], dtype=np.float32))], self.get_transform_func())
+
         with SpyWeightCompressionStatisticsContext(mocker):
             model = compress_weights(model, mode=int4_mode, ratio=1.0, group_size=2, dataset=dataset, awq=True)
```
333361
