diff --git a/src/nncf/quantization/algorithms/weight_compression/activation_stats.py b/src/nncf/quantization/algorithms/weight_compression/activation_stats.py index 514aaec43a7..24778e1e952 100644 --- a/src/nncf/quantization/algorithms/weight_compression/activation_stats.py +++ b/src/nncf/quantization/algorithms/weight_compression/activation_stats.py @@ -17,12 +17,13 @@ from nncf.tensor import functions as fns -def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, Tensor]: +def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]: """ A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms. :param stats: An object containing statistics for the layer. :param subset_size: The number of samples for AWQ. + :param act_ch_axis: The activation channel axis. :return: tuple of the following tensors: s - maximum channel magnitude across samples [HiddenDim] X - average channel magnitude across tokens in the sequence [HiddenDim, min(SampleSize, ~subset_size)] @@ -41,7 +42,9 @@ def process_stats(stats: WCTensorStatistic, subset_size: int) -> tuple[Tensor, T # Prevent high memory and time consumption by sampling if X_full.shape[sample_axis] > subset_size: - lens = [reduce(mul, shape[:-1], 1) for shape in stats.shape_values] + lens = [ + reduce(mul, shape[:act_ch_axis] + shape[act_ch_axis % len(shape) + 1 :], 1) for shape in stats.shape_values + ] step = X_full.shape[sample_axis] // subset_size idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step] X = X_full[..., idxs] diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 7a58055a4d8..aebde028217 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -959,9 +959,9 @@ def get_weight_compression_parameters( # MoE operations are usually matmuls, so the check for matmul metatype is done # This is to avoid raising the error for non-MoE cases with 3D weights. parsed_ov_version = f"{ov_version[0]}.{ov_version[1]}.{ov_version[2]}-{ov_version[3]}" - msg = f"""NNCF compression algorithms do not support 3D weights with current version of - OpenVINO {parsed_ov_version} due to a known issue in statistics collection - Ticket - 176465. Please update to the latest OpenVINO nightly version. + msg = f"""NNCF compression algorithms do not support 3D weights with current version of + OpenVINO {parsed_ov_version} due to a known issue in statistics collection + Ticket - 176465. Please update to the latest OpenVINO nightly version. 
Node with weight: {node.node_name}.""" raise nncf.UnsupportedModelError(msg) @@ -1087,6 +1087,11 @@ def apply_with_parameters( ) if self._lora_correction: + for wc_params in all_weight_params: + if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph): + msg = "Transposed activations are not supported yet for the LoRa correction algorithm" + raise nncf.UnsupportedModelError(msg) + lora_correction_params = self._advanced_parameters.lora_correction_params lora_correction_algo = LoraCorrectionAlgorithm(statistics, lora_correction_params) description += " with correction of low-rank adapters" @@ -1128,19 +1133,21 @@ def apply_with_parameters( ) return transformed_model - def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int]: + def _get_activation_node_port_and_channel(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int, int]: """ - This method returns the activation layer and corresponding port id for the node. + This method returns the activation layer, corresponding port id and channel axis for the given node. :param node: NNCFGraph node for which the activation is sought. :param nncf_graph: NNCFGraph instance with the node. - :return: Tuple with the activation node and port id. + :return: Tuple with the activation node, port id and channel axis. """ activation_port = self._backend_entity.get_activation_port_id(node, nncf_graph) activation_edge = nncf_graph.get_input_edge_by_port_id(node, activation_port) activation_node = activation_edge.from_node - port_id = activation_edge.output_port_id - return activation_node, port_id + activation_channel_axis = self._backend_entity.get_activation_channel_axis( + node, activation_edge.input_port_id, activation_edge.tensor_shape + ) + return activation_node, activation_edge.output_port_id, activation_channel_axis def get_matmul_input_to_output_nodes_map( self, matmul_nodes: list[NNCFNode], graph: NNCFGraph @@ -1161,8 +1168,8 @@ def get_matmul_input_to_output_nodes_map( """ matmul_input_to_output_nodes_map = defaultdict(list) for node in matmul_nodes: - act_node, output_port_id = self._get_activation_node_and_port(node, graph) - matmul_input_to_output_nodes_map[(act_node, output_port_id)].append(node) + act_node, output_port_id, act_channel_axis = self._get_activation_node_port_and_channel(node, graph) + matmul_input_to_output_nodes_map[(act_node, output_port_id, act_channel_axis)].append(node) return matmul_input_to_output_nodes_map def get_compression_nodes_info( @@ -1230,7 +1237,11 @@ def get_statistic_points( # Statistics for data aware algorithms if self._data_aware_compression: - for (node, output_port_id), node_with_weights in matmul_input_to_output_nodes_map.items(): + for ( + node, + output_port_id, + input_channel_axis, + ), node_with_weights in matmul_input_to_output_nodes_map.items(): statistic_point = self._backend_entity.target_point( TargetType.POST_LAYER_OPERATION, node.node_name, port_id=output_port_id ) @@ -1245,13 +1256,16 @@ def get_statistic_points( ] all_weight_dims.extend(weight_dims) - # by default, reduce activations across all but the last dimension. The last dimension is - # assumed to be the hidden size dimension. + # Reduce activations across all but the hidden dimension. n_dims = len(graph.get_output_edges_by_port_id(node, output_port_id)[0].tensor_shape) - reduction_axes = tuple(range(n_dims - 1)) + # negative axis (e.g. 
-1 for the last axis) is converted into the corresponding positive value
+                input_channel_axis = input_channel_axis % n_dims
+                reduction_axes = tuple(i for i in range(n_dims) if i != input_channel_axis)
-                # For 3D weights, hidden dimension is the second dimension. Reduce by all other dimensions
-                reduction_axes = (1,) if any(weight_dim == 3 for weight_dim in all_weight_dims) else reduction_axes
+                # For 3D weights, keep the batch dimension
+                if any(weight_dim == 3 for weight_dim in all_weight_dims):
+                    assert len(reduction_axes) == 2
+                    reduction_axes = reduction_axes[1:]
                 stat_collector = self._backend_entity.mean_statistic_collector(
                     reduction_axes=reduction_axes, subset_size=self._subset_size
@@ -1291,7 +1305,7 @@ def _get_statistics_for_weights_compression(
         # Where mean_value is a 1D tensor representing an activation reduced over batch and sequence length dimensions,
         # shape is an original shape of an activation before reduction, n is the size of the dataset (or subset_size).
         statistics = {}
-        for (act_node, output_port_id), matmul_nodes in matmul_input_to_output_nodes_map.items():
+        for (act_node, output_port_id, _), matmul_nodes in matmul_input_to_output_nodes_map.items():
             tensor_collectors = list(
                 statistic_points.get_algo_statistics_for_node(
                     act_node.node_name,
diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py
index fab970fc0dc..411d79ae795 100644
--- a/src/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/src/nncf/quantization/algorithms/weight_compression/awq.py
@@ -170,6 +170,8 @@ def apply(
             weight_dtype = weight.dtype
             weight = weight.astype(TensorDataType.float32)
+            act_ch_axis, act_shape = self._get_activation_channel_axis_and_shape(graph, wp)
+
             if is_data_free:
                 scale = self._data_free_step(weight, 1 - wp.reduction_axes[0])
             else:
@@ -181,10 +183,10 @@ def apply(
                 prev_weight = self._backend_entity.get_weight(merge_node, prev_weight_port_id, model, graph)
                 prev_statistics = statistics[merge_node.node_name]
-                scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics)
+                scale = self._data_aware_step(wp, weight, statistics[k], act_ch_axis, prev_weight, prev_statistics)
             w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0])
-            a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0])
+            a_scale = 1.0 / scale
             scaled_weight = (weight * w_scale).astype(weight_dtype)
             self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight)
@@ -192,13 +194,17 @@ def apply(
             if is_mergeable:  # for MatMul->Multiply->MatMul pattern the scale is merged to the first MatMul
                 for _, port_id in self._backend_entity.get_weight_names_and_port_ids(merge_node, graph):
                     merge_weight = self._backend_entity.get_weight(merge_node, port_id, model, graph)
+                    a_scale = fns.unsqueeze(a_scale, wp.reduction_axes[0])
                     merge_weight = (merge_weight * a_scale).astype(weight_dtype)
                     self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight)
-                a_scale = fns.transpose(a_scale)
             else:  # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node
-                a_scale = fns.transpose(a_scale).astype(weight_dtype)
+                # Calculate the activation scale shape
+                a_scale_shape = [scale.shape[0] if axis == act_ch_axis else 1 for axis in range(len(act_shape))]
+                a_scale = fns.reshape(a_scale, tuple(a_scale_shape))
+
                 next_nodes = graph.get_next_nodes(merge_node)
                 source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id
+
                scale_insertion_command = self._backend_entity.scale_insertion_command(
                    merge_node, next_nodes, source_node_output_port, a_scale.data
                )
@@ -210,10 +216,10 @@ def apply(
         return transformed_model
-    def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statistics=None):
+    def _data_aware_step(self, wp, weight, statistics, act_ch_axis, prev_weight=None, prev_statistics=None):
         alpha_step = (self._alpha_max - self._alpha_min) / self._steps
         config = wp.compression_config
-        s, X = process_stats(statistics, self._subset_size)
+        s, X = process_stats(statistics, self._subset_size, act_ch_axis)
         s = s.astype(TensorDataType.float32)
         X = X.astype(TensorDataType.float32)
@@ -222,7 +228,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis
         prev_s, prev_w = None, None
         if prev_statistics is not None and prev_weight is not None:
-            prev_s, _ = process_stats(prev_statistics, self._subset_size)
+            prev_s, _ = process_stats(prev_statistics, self._subset_size, act_ch_axis)
             prev_s = prev_s.astype(TensorDataType.float32).max().item()
             prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis)
@@ -311,6 +317,16 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis
         return scale
+    def _get_activation_channel_axis_and_shape(
+        self, graph: NNCFGraph, wp: WeightCompressionParameters
+    ) -> tuple[int, tuple[int, ...]]:
+        activation_port_id = self._backend_entity.get_activation_port_id(wp.node_with_weight, graph)
+        act_shape = graph.get_input_edge_by_port_id(wp.node_with_weight, activation_port_id).tensor_shape
+        act_ch_axis = self._backend_entity.get_activation_channel_axis(
+            wp.node_with_weight, activation_port_id, act_shape
+        )
+        return act_ch_axis % len(act_shape), act_shape
+
     @staticmethod
     def _clamp_scale(magnitudes, threshold, scale, clamped_scale):
         return fns.where(magnitudes < threshold, scale, clamped_scale)
diff --git a/src/nncf/quantization/algorithms/weight_compression/backend.py b/src/nncf/quantization/algorithms/weight_compression/backend.py
index 24baed2ae7d..d1ddf8f99dc 100644
--- a/src/nncf/quantization/algorithms/weight_compression/backend.py
+++ b/src/nncf/quantization/algorithms/weight_compression/backend.py
@@ -110,6 +110,17 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: TMo
         :return: The weight tensor.
         """
+    @abstractmethod
+    def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+        """
+        Checks whether the activation input of a MatMul operation is transposed.
+
+        :param matmul: MatMul NNCFGraph node.
+        :param graph: The model graph associated with the model.
+        :return: True if the node is a MatMul node and its activation input is transposed,
+            False otherwise.
+        """
+
     @abstractmethod
     def get_weight_dtype(
         self, node_with_weight: NNCFNode, weight_port_id: int, model: TModel, graph: NNCFGraph
     ) -> TensorDataType:
@@ -273,6 +284,18 @@ def get_ignored_patterns() -> GraphPattern:
         :return: backend-specific ignored patterns.
         """
+    @staticmethod
+    @abstractmethod
+    def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+        """
+        Returns the axis number of the activation tensor which corresponds to its channel.
+
+        :param node: NNCFNode instance.
+        :param port_id: Port ID for input.
+        :param input_shape: Shape of the input.
+        :return: Channel axis number.
+ """ + class AWQAlgoBackend(WeightCompressionAlgoBackend): @staticmethod diff --git a/src/nncf/quantization/algorithms/weight_compression/gptq.py b/src/nncf/quantization/algorithms/weight_compression/gptq.py index 343716615cd..aeb32adede1 100644 --- a/src/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/src/nncf/quantization/algorithms/weight_compression/gptq.py @@ -124,6 +124,11 @@ def apply( CompressWeightsMode.INT8_SYM, ]: continue + + if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph): + msg = "Transposed activations are not supported yet for the GPTQ algorithm" + raise nncf.UnsupportedModelError(msg) + _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) diff --git a/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py index bc1551e00aa..93439f8669c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -279,7 +279,7 @@ def get_statistic_points( self._set_backend_entity(model) statistic_container = StatisticPointsContainer() - for act_node, output_port_id in nodes_and_port_ids: + for act_node, output_port_id, _ in nodes_and_port_ids: n_dims = len(graph.get_output_edges_by_port_id(act_node, output_port_id)[0].tensor_shape) if n_dims < 2: msg = ( diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index ed483486e5a..00a4394a14a 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -38,6 +38,7 @@ from nncf.onnx.graph.model_transformer import remove_initializer from nncf.onnx.graph.model_transformer import remove_node from nncf.onnx.graph.model_transformer import set_initializer +from nncf.onnx.graph.node_utils import get_act_quantization_axis from nncf.onnx.graph.node_utils import get_weight_quantization_axis from nncf.onnx.graph.onnx_helper import ONNX_DTYPE_TO_NNCF_DTYPE from nncf.onnx.graph.onnx_helper import get_name_to_node_map @@ -186,6 +187,13 @@ def get_weight( weight_tensor = get_tensor_value(model, weight_name) return Tensor(weight_tensor) + def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool: + if matmul.metatype != metatypes.ONNXGemmMetatype: + return False + act_port_id = self.get_activation_port_id(matmul, graph) + trans_attr = "transB" if act_port_id else "transA" + return matmul.layer_attributes.node_attrs[trans_attr] + def get_weight_dtype( self, node_with_weight: NNCFNode, weight_port_id: int, model: onnx.ModelProto, graph: NNCFGraph ) -> TensorDataType: @@ -301,6 +309,10 @@ def filter_func(point: StatisticPoint) -> bool: return filter_func + @staticmethod + def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int: + return get_act_quantization_axis(node, port_id) + def insert_adapters( self, wc_params: WeightCompressionParameters, lora_A: Tensor, lora_B: Tensor, int8_lora: bool ) -> None: @@ -503,9 +515,13 @@ def get_ignored_patterns() -> GraphPattern: class ONNXAWQAlgoAlgoBackend(AWQAlgoBackend, ONNXWeightCompressionAlgoBackend): @staticmethod def get_awq_patterns() -> dict[str, Callable]: - return 
get_awq_patterns( - onnx_metatypes.ONNXMatMulMetatype, onnx_metatypes.ONNXMulLayerMetatype, ATOMIC_ACTIVATIONS_OPERATIONS - ) + patterns = {} + for mm_metatype in (onnx_metatypes.ONNXMatMulMetatype, onnx_metatypes.ONNXGemmMetatype): + p = get_awq_patterns(mm_metatype, onnx_metatypes.ONNXMulLayerMetatype, ATOMIC_ACTIVATIONS_OPERATIONS) + p = {f"{mm_metatype.__name__}_{k}": v for k, v in p.items()} + patterns.update(p) + + return patterns @staticmethod def scale_insertion_command( diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 1b3eb386d36..56c5282d2e0 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -13,7 +13,6 @@ import openvino as ov from openvino import opset13 as opset -import nncf from nncf.common.graph import NNCFGraph from nncf.common.graph import NNCFNode from nncf.common.graph.operator_metatypes import OperatorMetatype @@ -35,6 +34,7 @@ from nncf.openvino.graph.node_utils import convert_op from nncf.openvino.graph.node_utils import create_ov_codebook_subgraph from nncf.openvino.graph.node_utils import create_ov_const_from_tensor +from nncf.openvino.graph.node_utils import get_activation_channel_axis from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor from nncf.openvino.graph.node_utils import get_weight_channel_axes @@ -119,9 +119,6 @@ def mean_statistic_collector( @staticmethod def get_activation_port_id(node: NNCFNode, nncf_graph: NNCFGraph) -> int: - if node.layer_attributes.input_attributes["transpose"]: - msg = "Transposed input is not supported" - raise nncf.UnsupportedModelError(msg) constant_ports = node.layer_attributes.get_const_port_ids() activation_ports = [ e.input_port_id for e in nncf_graph.get_input_edges(node) if e.input_port_id not in constant_ports @@ -143,6 +140,11 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov. 
weight_tensor = get_const_value_as_numpy_tensor(weight_node) return Tensor(weight_tensor) + def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool: + if matmul.metatype != om.OVMatMulMetatype: + return False + return matmul.layer_attributes.input_attributes["transpose"] + def get_weight_dtype( self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph ) -> TensorDataType: @@ -378,6 +380,10 @@ def get_ignored_patterns() -> GraphPattern: pattern.add_pattern_alternative(create_sam_pe()) return pattern + @staticmethod + def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int: + return get_activation_channel_axis(node, port_id, input_shape) + class OVTensorWeightCompressionAlgoBackend(OVWeightCompressionAlgoBackend): """ diff --git a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 92ef97364ef..d953a284c06 100644 --- a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -139,6 +139,10 @@ def apply( continue _, weight_port_id = weight_data[0] + if self._backend_entity.matmul_has_transposed_activations(wp.node_with_weight, graph): + msg = "Transposed activations are not supported yet for the Scale Estimation algorithm" + raise nncf.UnsupportedModelError(msg) + weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) scale, zero_point = self.calculate_quantization_params( diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index 143ec0d7173..eb142c032b4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -60,6 +60,7 @@ from nncf.torch.model_graph_manager import get_module_by_name from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes from nncf.torch.model_graph_manager import split_const_name +from nncf.torch.node_utils import get_activation_channel_axis as get_activation_channel_axis_util from nncf.torch.quantization.ignored_patterns import create_rope from nncf.torch.quantization.ignored_patterns import create_sam_pe from nncf.torch.quantization.layers import QUANTIZATION_MODULES @@ -175,6 +176,9 @@ def get_weight( raise nncf.InternalError(msg) return Tensor(weight) + def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool: + return False + def get_weight_dtype( self, node_with_weight: NNCFNode, @@ -482,6 +486,10 @@ def get_ignored_patterns() -> GraphPattern: pattern.add_pattern_alternative(create_sam_pe()) return pattern + @staticmethod + def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int: + return get_activation_channel_axis_util(node, port_id) + class PTAWQAlgoAlgoBackend(AWQAlgoBackend, PTWeightCompressionAlgoBackend): @staticmethod diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 8b219118bf7..2182c85b6bf 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -56,6 +56,7 @@ from nncf.torch.model_graph_manager import get_const_node 
 from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
 from nncf.torch.model_graph_manager import get_weight_tensor_port_ids
+from nncf.torch.node_utils import get_activation_channel_axis as get_activation_channel_axis_fn
 from nncf.torch.quantization.ignored_patterns import create_rope
 from nncf.torch.quantization.ignored_patterns import create_sam_pe
 from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor
@@ -127,6 +128,9 @@ def get_weight(
         return Tensor(weight)
+    def matmul_has_transposed_activations(self, matmul: NNCFNode, graph: NNCFGraph) -> bool:
+        return False
+
     def get_weight_dtype(
         self, node_with_weight: NNCFNode, weight_port_id: int, model: torch.fx.GraphModule, graph: NNCFGraph
     ) -> TensorDataType:
@@ -262,6 +266,10 @@ def get_ignored_patterns() -> GraphPattern:
         pattern.add_pattern_alternative(create_sam_pe())
         return pattern
+    @staticmethod
+    def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: tuple[int]) -> int:
+        return get_activation_channel_axis_fn(node, port_id)
+
 class FXMixedPrecisionAlgoBackend(MixedPrecisionAlgoBackend, FXWeightCompressionAlgoBackend):
     pass
diff --git a/src/nncf/torch/node_utils.py b/src/nncf/torch/node_utils.py
new file mode 100644
index 00000000000..7f325834557
--- /dev/null
+++ b/src/nncf/torch/node_utils.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2026 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import nncf
+import nncf.torch.graph.operator_metatypes as op
+from nncf.common.graph import NNCFNode
+from nncf.torch.graph.operator_metatypes import PTAddmmMetatype
+from nncf.torch.graph.operator_metatypes import PTMatMulMetatype
+
+
+def get_activation_channel_axis(node: NNCFNode, port_id: int) -> int:
+    """
+    Returns the axis number of the activation tensor which corresponds to its channel.
+
+    :param node: NNCFNode instance.
+    :param port_id: Port ID for input.
+    :return: Channel axis number.
+    """
+    if node.metatype not in op.CONVOLUTION_METATYPES + op.MATMUL_METATYPES:
+        msg = f"Activation channel axis retrieval from node with metatype {node.metatype} is not supported"
+        raise nncf.InternalError(msg)
+
+    if node.metatype not in [PTMatMulMetatype, PTAddmmMetatype]:
+        return node.metatype.output_channel_axis
+
+    if port_id == 0:
+        # X(port:0) * W(port:1): [..., C_IN] * [... , C_IN, C_OUT]
+        return -1
+    if port_id == 1:
+        # W(port:0) * X(port:1): [... , C_OUT, C_IN] * [... , C_IN, ...]
+        return -2
+
+    msg = f"Port id for a {node.metatype} operation is expected to be in [0, 1], {port_id} received"
+    raise nncf.InternalError(msg)
diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py
index 3e6aa4802eb..d5ced55ff8d 100644
--- a/tests/cross_fw/test_templates/template_test_weights_compression.py
+++ b/tests/cross_fw/test_templates/template_test_weights_compression.py
@@ -11,6 +11,7 @@
 import math
 from abc import ABC
 from abc import abstractmethod
+from dataclasses import dataclass
 from functools import reduce
 from operator import mul
 from typing import Any, Callable, Optional, TypeVar
@@ -30,6 +31,7 @@
 from nncf.quantization import compress_weights
 from nncf.quantization.advanced_parameters import AdvancedAWQParameters as AWQParams
 from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams
+from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams
 from nncf.quantization.algorithms.weight_compression.activation_stats import WCTensorStatistic
 from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
 from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
@@ -162,7 +164,7 @@ def test_data_based_criterion(self, mode, ref_score, ref_act_score, mocker):
     @staticmethod
     @abstractmethod
-    def get_sequential_matmul_model() -> TModel:
+    def get_sequential_matmul_model(transpose_a: bool) -> TModel:
         """Returns a backend model for test_mixed_precision."""
     @staticmethod
@@ -172,7 +174,7 @@ def to_tensor(x: TTensor) -> TTensor:
     @staticmethod
     @abstractmethod
-    def check_weights(model: TModel, ref_ids: list[int]) -> None:
+    def check_weights(model: TModel, ref_ids: list[int], transpose_a=False) -> None:
         """Checks that only weights with specified ids are compressed in int4 format."""
     @staticmethod
@@ -210,10 +212,14 @@ def wrap_model(model, data) -> CompressionParams:
             (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]),
         ),
     )
-    def test_mixed_precision(self, mode, all_layers, ratio, ref_ids, mocker):
-        model = self.get_sequential_matmul_model()
-        first = self.to_tensor(np.ones([1, 4, 4], dtype=np.float32))
-        second = self.to_tensor(np.arange(16, dtype=np.float32)).reshape(1, 4, 4)
+    @pytest.mark.parametrize("transpose_a", (False, True))
+    def test_mixed_precision(self, mode, all_layers, ratio, ref_ids, transpose_a, transpose_a_supported, mocker):
+        if transpose_a and not transpose_a_supported:
+            pytest.skip("transpose_a is not supported for the current backend")
+        model = self.get_sequential_matmul_model(transpose_a=transpose_a)
+        input_shape = (4, 4) if transpose_a else (1, 4, 4)
+        first = self.to_tensor(np.ones(input_shape, dtype=np.float32))
+        second = self.to_tensor(np.arange(16, dtype=np.float32)).reshape(input_shape)
         dataset = Dataset([first, second], self.get_transform_func())
         compressed_model = compress_weights(
             model,
@@ -224,7 +230,7 @@ def test_mixed_precision(self, mode, all_layers, ratio, ref_ids, mocker):
             sensitivity_metric=mode,
             dataset=dataset,
         )
-        self.check_weights(compressed_model, ref_ids)
+        self.check_weights(compressed_model, ref_ids, transpose_a)
     # Scale Estimation Tests
@@ -382,7 +388,7 @@ def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int
     @staticmethod
     @abstractmethod
-    def get_awq_model() -> TModel:
+    def get_awq_model(non_mergable_pattern: bool) -> TModel:
        "Returns a backend model for
test_awq_with_ignored_scope."
     @staticmethod
@@ -406,7 +412,7 @@ def get_ignored_scope_name() -> str:
         "Returns ignored scope name for test_awq_with_ignored_scope."
     def test_awq_with_ignored_scope(self, mocker):
-        model = self.get_awq_model()
+        model = self.get_awq_model(non_mergable_pattern=False)
         sz = 8
         n_samples = 10
@@ -473,29 +479,56 @@ def test_sam_pe_weight_compression(self):
     @staticmethod
     @abstractmethod
-    def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]:
+    @pytest.fixture
+    def test_awq_scale_ref() -> dict[str, Tensor]:
         "Returns reference for test_awq_scale_reference."
-    def test_awq_scale_reference(self, monkeypatch, mocker):
+    @abstractmethod
+    @pytest.fixture
+    def transpose_a_supported(self) -> bool:
+        """True if the backend supports transposed MatMul activations, False otherwise"""
+
+    # Transposed inputs do not affect the mergeable pattern code, skipping (True, False)
+    @pytest.mark.parametrize("transpose_a,non_mergable_pattern", [(True, True), (False, True), (False, False)])
+    def test_awq_scale_reference(
+        self,
+        non_mergable_pattern,
+        transpose_a,
+        test_awq_scale_ref,
+        transpose_a_supported,
+        monkeypatch,
+        mocker,
+    ):
         monkeypatch.setattr("nncf.quantization.algorithms.weight_compression.algorithm.AWQ", SpyAWQ)
-        model = self.get_awq_model()
+        if transpose_a:
+            if not transpose_a_supported:
+                msg = "transpose_a is not supported for the current backend"
+                pytest.skip(msg)
-        input = 0.01 * np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) + 0.02
+            INPUT_SHAPE = (2, 4)
+            model = self.get_transposable_awq_model(transpose_a=True, transpose_b=True, input_shape=INPUT_SHAPE)
+        else:
+            INPUT_SHAPE = (1, 4, 8)
+            model = self.get_awq_model(non_mergable_pattern)
+        input = 0.01 * np.arange(0, np.multiply.reduce(INPUT_SHAPE), dtype=np.float32).reshape(INPUT_SHAPE) + 0.02
         input = self.to_tensor(input)
-        dataset = Dataset([input], self.get_transform_func())
+        dataset = Dataset([input] * 2, self.get_transform_func())
         with SpyWeightCompressionStatisticsContext(mocker):
             _ = compress_weights(
                 model,
                 mode=CompressWeightsMode.INT4_SYM,
                 ratio=1.0,
+                all_layers=transpose_a,
                 group_size=-1,
                 dataset=dataset,
                 awq=True,
             )
         assert spy_instance is not None
         for node_name, scales in spy_instance._scale_per_target_node.items():
-            assert fns.allclose(scales, self.get_reference_for_test_awq_scale_reference()[node_name])
+            ref = test_awq_scale_ref[node_name]
+            assert fns.allclose(scales, ref)
+            assert scales.shape == ref.shape
     @pytest.mark.parametrize(
         ["group_size", "fallback_mode", "min_adjusted_group_size", "expected_outcome"],
@@ -662,45 +695,127 @@ def get_transform_func() -> Optional[Callable[..., Any]]:
     def get_reduction_axes() -> int:
         return 1
+    @dataclass
+    class ProcessStatsTestCase:
+        reduced_shape: tuple[int, ...]
+ activation_shapes: list[tuple[int, ...]] + subset_size: int + ref_s: np.ndarray + ref_X: np.ndarray + act_ch_axis: Optional[int] = None + @pytest.mark.parametrize( - "mean_values_shape,num_samples,subset_size,expected_s_shape,expected_X_shape,expected_indices", + "case", [ # 2D Activations - ((8,), 10, 5, (8,), (8, 5), [0, 2, 4, 6, 8]), - ((8,), 5, 10, (8,), (8, 5), [0, 1, 2, 3, 4]), - ((8,), 12, 5, (8,), (8, 6), [0, 2, 4, 6, 8, 10]), + ProcessStatsTestCase( + reduced_shape=(2,), + activation_shapes=[(1, 2), (3, 2), (5, 2), (10, 2)], + subset_size=2, + ref_s=np.array([6, 7]), + ref_X=np.array([6, 2, 7, 3]).reshape(2, 2), + ), + ProcessStatsTestCase( + reduced_shape=(2,), + activation_shapes=[(2, 1), (2, 3), (2, 5), (2, 10)], + subset_size=2, + act_ch_axis=0, + ref_s=np.array([6, 7]), + ref_X=np.array([6, 2, 7, 3]).reshape(2, 2), + ), + ProcessStatsTestCase( + reduced_shape=(2,), + activation_shapes=[(5, 2), (5, 2)], + subset_size=2, + ref_s=np.array([2, 3]), + ref_X=np.array([0, 2, 1, 3]).reshape(2, 2), + ), # 3D Activations - ((4, 8), 10, 5, (4, 8), (4, 8, 5), [0, 2, 4, 6, 8]), - ((4, 8), 5, 10, (4, 8), (4, 8, 5), [0, 1, 2, 3, 4]), - ((4, 8), 25, 8, (4, 8), (4, 8, 9), [0, 3, 6, 9, 12, 15, 18, 21, 24]), + ProcessStatsTestCase( + reduced_shape=(2, 4), + activation_shapes=[(1, 2, 4), (3, 2, 4), (5, 2, 4), (10, 2, 4)], + subset_size=2, + ref_s=np.array(list(range(24, 32))).reshape(2, 4), + ref_X=np.array([24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15]).reshape(2, 4, 2), + ), + ProcessStatsTestCase( + reduced_shape=(2, 4), + activation_shapes=[(1, 100000, 2, 4), (3, 10000, 2, 4), (5, 1000, 2, 4), (10, 5, 2, 4)], + subset_size=2, + act_ch_axis=1, + ref_s=np.array(list(range(24, 32))).reshape(2, 4), + ref_X=np.array([24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15]).reshape(2, 4, 2), + ), + ProcessStatsTestCase( + reduced_shape=(2, 4), + activation_shapes=[(1, 2, 4), (1, 2, 4)], + subset_size=2, + ref_s=np.array(list(range(8, 16))).reshape(2, 4), + ref_X=np.array([0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15]).reshape(2, 4, 2), + ), ], ) - def test_process_stats( - self, mean_values_shape, num_samples, subset_size, expected_s_shape, expected_X_shape, expected_indices - ): - total_elements = reduce(mul, mean_values_shape, 1) + def test_process_stats(self, case: ProcessStatsTestCase): + total_elements = reduce(mul, case.reduced_shape, 1) mean_values = [ - Tensor(np.arange(i * total_elements, (i + 1) * total_elements, dtype=np.float32).reshape(mean_values_shape)) - for i in range(num_samples) + Tensor( + np.arange(i * total_elements, (i + 1) * total_elements, dtype=np.float32).reshape(case.reduced_shape) + ) + for i in range(len(case.activation_shapes)) ] - shape_values = [(1,) + mean_values_shape for _ in range(num_samples)] - stats = WCTensorStatistic(mean_values=mean_values, shape_values=shape_values) + stats = WCTensorStatistic(mean_values=mean_values, shape_values=case.activation_shapes) - s, X = process_stats(stats, subset_size) + if case.act_ch_axis is None: + s, X = process_stats(stats, case.subset_size) + else: + s, X = process_stats(stats, case.subset_size, case.act_ch_axis) - assert s.shape == expected_s_shape, f"Expected s shape {expected_s_shape}, got {s.shape}" - assert X.shape == expected_X_shape, f"Expected X shape {expected_X_shape}, got {X.shape}" + assert s.shape == case.ref_s.shape + assert fns.allclose(s, self.to_tensor(case.ref_s)) + assert X.shape == case.ref_X.shape + assert fns.allclose(X, self.to_tensor(case.ref_X)) - X_full_list = 
[mean_values[i] for i in range(num_samples)] - X_full = fns.stack(X_full_list) - axes = list(range(1, len(X_full.shape))) + [0] - X_full_transposed = fns.transpose(X_full, axes=axes) + @staticmethod + @abstractmethod + def get_transposable_awq_model(transpose_a: bool, transpose_b: bool, input_shape=None) -> TModel: + "Returns a backend model for test_compression_with_transpose." - for idx, sample_idx in enumerate(expected_indices): - expected_sample = X_full_transposed[..., sample_idx] - actual_sample = X[..., idx] - assert fns.all(actual_sample == expected_sample) + @pytest.mark.parametrize( + "kwargs", + [ + dict(scale_estimation=True), + dict(lora_correction=True), + dict( + gptq=True, + advanced_parameters=CompressionParams(gptq_params=GPTQParams(subset_size=2)), + ), + ], + ) + def test_compression_skipped_with_transposed_activations(self, transpose_a_supported, kwargs): + if not transpose_a_supported: + pytest.skip("transpose_a is not supported for the current backend") + if kwargs.get("scale_estimation", False) and "scale_estimation" in self.get_not_supported_algorithms(): + pytest.skip("Scale estimation is not supported") + if kwargs.get("gptq", False) and "gptq" in self.get_not_supported_algorithms(): + pytest.skip("GPTQ is not supported") + if kwargs.get("lora_correction", False) and "lora_correction" in self.get_not_supported_algorithms(): + pytest.skip("lora_correction is not supported") + + INPUT_SHAPE = (2, 4) + model = self.get_transposable_awq_model(transpose_a=True, transpose_b=True, input_shape=INPUT_SHAPE) + input = 0.01 * np.arange(0, np.multiply.reduce(INPUT_SHAPE), dtype=np.float32).reshape(INPUT_SHAPE) + 0.02 + input = self.to_tensor(input) + dataset = Dataset([input] * 2, self.get_transform_func()) - expected_s = fns.max(fns.abs(X_full_transposed), axis=-1) - assert fns.all(s == expected_s) + with pytest.raises(nncf.UnsupportedModelError): + compress_weights( + model, + mode=CompressWeightsMode.INT4_SYM, + ratio=1.0, + group_size=1, + subset_size=2, + dataset=dataset, + all_layers=True, + **kwargs, + ) diff --git a/tests/onnx/common.py b/tests/onnx/common.py index 803e6dafdcb..7584c6afa70 100644 --- a/tests/onnx/common.py +++ b/tests/onnx/common.py @@ -106,6 +106,7 @@ def add_gemm( output: Optional[str] = None, weight_data: Optional[np.ndarray] = None, bias_data: Optional[np.ndarray] = None, + trans_a: int = 0, trans_b: int = 0, ) -> str: i = len(self._nodes) @@ -140,7 +141,7 @@ def add_gemm( inputs=[input, w_name, b_name], outputs=[output], name=f"Gemm_{i}", - transA=0, + transA=trans_a, transB=trans_b, ) ) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index 365e9653284..46931d5afac 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -414,30 +414,65 @@ def get_SAM_PE_model() -> onnx.ModelProto: return mb.build() @staticmethod - def get_sequential_matmul_model() -> onnx.ModelProto: + def get_sequential_matmul_model(transpose_a: bool) -> onnx.ModelProto: """ Builds a model to be used in the TemplateWeightCompression.test_mixed_precision() test. 
""" mb = ModelBuilder() - x = mb.add_input("input", (1, 4, 4)) - output = mb.add_output("output", (1, 4, 4)) + input_shape = (4, 4) if transpose_a else (1, 4, 4) + x = mb.add_input("input", input_shape) main_values = [10000, 1000, 1, 10, 10000] + + if transpose_a: + x = mb.add_transpose(x, [1, 0]) for i, main_value in enumerate(main_values): weights_data = np.arange(0, 16).reshape(4, 4).astype(np.float32) weights_data[-1, -1] = main_value weights_data = weights_data.T - x = mb.add_matmul(x, shape=weights_data.shape, output=output if i == 4 else None, data=weights_data) + if transpose_a: + x = mb.add_gemm(x, shape=weights_data.shape, weight_data=weights_data) + # Without additional output there is no edges between gemms in the graph + # for some odd reason + mb.add_output(x, input_shape) + else: + x = mb.add_matmul(x, shape=weights_data.shape, data=weights_data) + if i == 4: + mb.add_output(x, input_shape) return mb.build(opset_version=21) + @staticmethod + def get_transposable_awq_model(transpose_a: bool, transpose_b: bool, input_shape=None): + mb = ModelBuilder() + + assert len(input_shape) == 2 + input_shape = input_shape or (2, 3) + x = mb.add_input("input", input_shape) + output = mb.add_output("output", input_shape) + + inp_ch_idx = -2 if transpose_a else -1 + w_shape = (input_shape[inp_ch_idx], input_shape[inp_ch_idx]) + w_data = 0.1 * np.arange(0, np.prod(w_shape), dtype=np.float32).reshape(w_shape) + 0.05 + w_data = w_data.T + + relu = mb.add_relu(x) + mb.add_gemm( + relu, w_data.shape, weight_data=w_data, trans_a=int(transpose_a), trans_b=int(transpose_b), output=output + ) + model = mb.build() + return model + @staticmethod def to_tensor(x: np.ndarray) -> np.ndarray: return np.array(x) @staticmethod - def check_weights(model: onnx.ModelProto, ref_ids: list[int]) -> None: + def check_weights(model: onnx.ModelProto, ref_ids: list[int], transpose_a: bool = False) -> None: names = {i.name for i in model.graph.initializer if i.data_type == onnx.TensorProto.INT4} + if transpose_a: + # First transpose node increments weights indexes + ref_ids = [i + 1 for i in ref_ids] low_precision_nodes = {f"W_{i}_quantized" for i in ref_ids} assert low_precision_nodes == names @@ -696,7 +731,7 @@ def get_num_multiply_from_awq(model: onnx.ModelProto) -> int: return awq_num @staticmethod - def get_awq_model() -> onnx.ModelProto: + def get_awq_model(non_mergable_pattern: bool) -> onnx.ModelProto: """ Builds a model to be used in the following tests: - TemplateWeightCompression.test_awq_with_ignored_scope() @@ -713,11 +748,17 @@ def get_awq_model() -> onnx.ModelProto: w_data = w_data.T num_blocks = 2 + for i in range(num_blocks): - a = mb.add_matmul(x, shape=w_data.shape, data=w_data) - b = mb.add_matmul(x, shape=w_data.shape, data=w_data) - x = mb.add_mul(a, b) - x = mb.add_matmul(x, shape=w_data.shape, output=output if i == num_blocks - 1 else None, data=w_data) + if non_mergable_pattern: + a = mb.add_matmul(x, shape=w_data.shape, data=w_data) + b = mb.add_relu(a) + x = mb.add_matmul(b, shape=w_data.shape, output=output if i == num_blocks - 1 else None, data=w_data) + else: + a = mb.add_matmul(x, shape=w_data.shape, data=w_data) + b = mb.add_matmul(x, shape=w_data.shape, data=w_data) + x = mb.add_mul(a, b) + x = mb.add_matmul(x, shape=w_data.shape, output=output if i == num_blocks - 1 else None, data=w_data) return mb.build() @@ -764,14 +805,35 @@ def get_ignored_scope_name() -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) 
@staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: + @pytest.fixture + def test_awq_scale_ref() -> dict[str, Tensor]: return { + "Gemm_1": Tensor(np.array([[14.299703], [8.364688]], dtype=np.float32)), "MatMul_3": Tensor( np.array( [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], dtype=np.float32, - ).T - ) + ) + ), + "MatMul_2": Tensor( + np.array( + [ + [ + [ + 1.9909902, + 1.8632966, + 1.5759803, + 1.3974594, + 1.2722752, + 1.1779976, + 1.1035581, + 1.042768, + ] + ] + ], + dtype=np.float32, + ), + ), } @staticmethod @@ -784,3 +846,7 @@ def transform_func(x): @staticmethod def get_reduction_axes() -> int: return 0 + + @pytest.fixture + def transpose_a_supported(self) -> bool: + return True diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 82c09759db8..db26d61a77e 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -614,7 +614,9 @@ def _create_ov_model(self, input_name) -> ov.Model: @SYNTHETIC_MODELS.register() class IntegerModel(OVReferenceModel): - def _create_ov_model(self, dim1=1, dim2=7, dim3=6, max_input_value=2, add_batch_dimension=False, positive_w=True): + def _create_ov_model( + self, dim1=1, dim2=7, dim3=6, max_input_value=2, add_batch_dimension=False, positive_w=True, transpose_a=False + ): def get_rand_w(shape): value = self._rng.random(shape) return value if positive_w else value * 2 - 1 @@ -643,7 +645,11 @@ def get_rand_w(shape): gather_4.set_friendly_name("Gather_4") matmul_2_data = opset.constant(get_rand_w((dim3, dim2)), dtype=np.float32, name="matmul_2_data") - matmul_2 = opset.matmul(gather_4, matmul_2_data, transpose_a=False, transpose_b=True, name="MatMul_2") + if transpose_a: + transpose = opset.transpose(gather_4, [1, 0]) + else: + transpose = gather_4 + matmul_2 = opset.matmul(transpose, matmul_2_data, transpose_a=transpose_a, transpose_b=True, name="MatMul_2") add_1 = opset.add(matmul_1, matmul_2, name="Add_1") result = opset.result(add_1, name="Result") @@ -812,17 +818,23 @@ class SequentialMatmulModel(OVReferenceModel): rel_error= 0.03 """ - def _create_ov_model(self, mm_hidden_dim=4): - input_node = opset.parameter([1, 4, mm_hidden_dim], name="Input_1") + def _create_ov_model(self, mm_hidden_dim=4, transpose_a: bool = False): + # Make 2d inputs for transposed model + # to allign with onnx ref model + if transpose_a: + input_node = opset.parameter([4, mm_hidden_dim], name="Input_1") + last_node = opset.transpose(input_node, input_order=[1, 0]) + else: + input_node = opset.parameter([1, 4, mm_hidden_dim], name="Input_1") + last_node = input_node main_values = [10000, 1000, 1, 10, 10000] - last_node = input_node for i, main_value in enumerate(main_values): weights_data = np.arange(0, mm_hidden_dim**2).reshape(mm_hidden_dim, mm_hidden_dim) weights_data[-1, -1] = main_value current_weights = opset.constant(weights_data, dtype=np.float32, name=f"weights_{i}") current_node = opset.matmul( - last_node, current_weights, transpose_a=False, transpose_b=True, name=f"MatMul_{i}" + last_node, current_weights, transpose_a=transpose_a, transpose_b=True, name=f"MatMul_{i}" ) last_node = current_node @@ -1003,7 +1015,7 @@ def get_weights(weights_data, is_int8, name): ) return (qw - zp) * scale - def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): + def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False, non_mergable_pattern: bool = False): input_node = opset.parameter([1] * n_extra_dims + [-1, 8], 
name="Input_1") weights_data1 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 @@ -1012,13 +1024,16 @@ def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): weights_data2 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 weights2 = self.get_weights(weights_data2, is_int8, name="weights_2") - node2 = opset.matmul(input_node, weights2, transpose_a=False, transpose_b=True, name="MatMul_2") - - node_multiply = opset.multiply(node1, node2, name="Multiply") + if non_mergable_pattern: + relu = opset.relu(node1) + node3 = opset.matmul(relu, weights2, transpose_a=False, transpose_b=True, name="MatMul_2") + else: + node2 = opset.matmul(input_node, weights2, transpose_a=False, transpose_b=True, name="MatMul_2") + node_multiply = opset.multiply(node1, node2, name="Multiply") - weights_data3 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 - weights3 = self.get_weights(weights_data3, is_int8, name="weights_3") - node3 = opset.matmul(node_multiply, weights3, transpose_a=False, transpose_b=True, name="MatMul_3") + weights_data3 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 + weights3 = self.get_weights(weights_data3, is_int8, name="weights_3") + node3 = opset.matmul(node_multiply, weights3, transpose_a=False, transpose_b=True, name="MatMul_3") weights_data4 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 weights4 = self.get_weights(weights_data4, is_int8, name="weights_4") @@ -1026,13 +1041,18 @@ def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): weights_data5 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 weights5 = self.get_weights(weights_data5, is_int8, name="weights_5") - node5 = opset.matmul(node3, weights5, transpose_a=False, transpose_b=True, name="MatMul_5") - node_multiply_2 = opset.multiply(node4, node5, name="Multiply_2") + if non_mergable_pattern: + relu = opset.relu(node4) + node6 = opset.matmul(relu, weights5, transpose_a=False, transpose_b=True, name="MatMul_6") + else: + node5 = opset.matmul(node3, weights5, transpose_a=False, transpose_b=True, name="MatMul_5") + + node_multiply_2 = opset.multiply(node4, node5, name="Multiply_2") - weights_data6 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 - weights6 = self.get_weights(weights_data6, is_int8, name="weights_6") - node6 = opset.matmul(node_multiply_2, weights6, transpose_a=False, transpose_b=True, name="MatMul_6") + weights_data6 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 + weights6 = self.get_weights(weights_data6, is_int8, name="weights_6") + node6 = opset.matmul(node_multiply_2, weights6, transpose_a=False, transpose_b=True, name="MatMul_6") result = opset.result(node6, name="Result") result.get_output_tensor(0).set_names(set(["Result"])) @@ -1084,6 +1104,43 @@ def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): return model +class AWQModel(OVReferenceModel): + OUTPUT_DIM = 32 + HIDDEN_DIM = 16 + INPUT_SHAPE = [1, 24, HIDDEN_DIM] # [B, SeqLen, HiddenDim] + + def _create_ov_model( + self, + transpose_a: bool = False, + transpose_b: bool = True, + input_shape: Optional[list[int]] = None, + is_int8=False, + ): + self._input_shape = self.INPUT_SHAPE if input_shape is None else input_shape + hdim_axis = -2 if transpose_a else -1 + self._hidden_dim = self._input_shape[hdim_axis] + input_1 = opset.parameter(self._input_shape, name="Input") + weight_shape = self.get_weight_shape(transpose_b) + data = self._rng.random(weight_shape).astype(np.float32) + + weights = AWQMatmulModel.get_weights(data, is_int8=is_int8, name="weights_1") + + relu = opset.relu(input_1) + matmul = opset.matmul(relu, 
weights, transpose_a=transpose_a, transpose_b=transpose_b, name="MatMul") + + result = opset.result(matmul, name="Result") + result.get_output_tensor(0).set_names(set(["Result"])) + model = ov.Model([result], [input_1]) + return model + + @property + def hidden_dim(self): + return self._hidden_dim + + def get_weight_shape(self, transpose_b: bool = True): + return [self.OUTPUT_DIM, self.hidden_dim] if transpose_b else [self.hidden_dim, self.OUTPUT_DIM] + + class AWQModel_fp16_overlow(OVReferenceModel): """ Model for testing AWQ algorithm with fp16 overflow fix. diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 539ced6085a..dd58ff279a3 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -12,7 +12,7 @@ import inspect import os from collections import defaultdict -from typing import Callable +from typing import Callable, Optional from unittest.mock import patch import numpy as np @@ -64,6 +64,7 @@ from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel +from tests.openvino.native.models import AWQModel from tests.openvino.native.models import AWQModel_fp16_overlow from tests.openvino.native.models import DifferentChannelSizeMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData @@ -104,7 +105,9 @@ class LMLinearModel(OVReferenceModel): HIDDEN_DIM = 16 INPUT_SHAPE = [1, 24, HIDDEN_DIM] # [B, SeqLen, HiddenDim] - def _create_ov_model(self, transpose_b: bool = True, transpose_a=False, input_shape=None): + def _create_ov_model( + self, transpose_b: bool = True, transpose_a: bool = False, input_shape: Optional[list[int]] = None + ): self._input_shape = self.INPUT_SHAPE if input_shape is None else input_shape hdim_axis = -2 if transpose_a else -1 self._hidden_dim = self._input_shape[hdim_axis] @@ -1972,38 +1975,6 @@ def test_compression_with_different_algo_combinations(input_shape, kwargs): ) -@pytest.mark.parametrize( - "kwargs", - [ - dict(scale_estimation=True), - dict(lora_correction=True), - dict( - gptq=True, - awq=True, - scale_estimation=True, - advanced_parameters=CompressionParams(gptq_params=GPTQParams(subset_size=2)), - ), - ], -) -def test_compression_with_transposed_activations(kwargs): - dataset_size = 4 - model = LMLinearModel(transpose_a=True, transpose_b=False).ov_model - input_data = [np.ones(inp.shape) for inp in model.inputs] * dataset_size - dataset = Dataset(input_data) - - with pytest.raises(nncf.UnsupportedModelError): - compress_weights( - model, - mode=CompressWeightsMode.INT4_SYM, - ratio=1.0, - group_size=8, - subset_size=2, - dataset=dataset, - all_layers=True, - **kwargs, - ) - - @pytest.mark.parametrize("disabled", [False, True]) def test_disabled_optimized_compression(disabled): hidden_dim = (MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // LMLinearModel.OUTPUT_DIM) + 1 @@ -2190,8 +2161,8 @@ def get_SAM_PE_model() -> ov.Model: return SAMPEModel().ov_model @staticmethod - def get_sequential_matmul_model() -> ov.Model: - return SequentialMatmulModel().ov_model + def get_sequential_matmul_model(transpose_a: bool) -> ov.Model: + return SequentialMatmulModel(transpose_a=transpose_a).ov_model @staticmethod def get_model_for_test_scale_estimation(): @@ -2202,8 +2173,8 @@ def get_moe_model_for_test_scale_estimation(): return 
SimpleMoEModel().ov_model @staticmethod - def get_awq_model() -> ov.Model: - return AWQMatmulModel().ov_model + def get_awq_model(non_mergable_pattern: bool) -> ov.Model: + return AWQMatmulModel(non_mergable_pattern=non_mergable_pattern).ov_model @staticmethod def get_different_channel_size_model(channel_sizes: list[int]) -> ov.Model: @@ -2213,6 +2184,11 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> ov.Model: def get_awq_act_model(with_multiply, n_layers): return AWQActMatmulModel(with_multiply=with_multiply, n_layers=n_layers).ov_model + @staticmethod + def get_transposable_awq_model(transpose_a, transpose_b, input_shape=None): + ov_model = AWQModel(transpose_a=transpose_a, transpose_b=transpose_b, input_shape=input_shape).ov_model + return ov_model + @staticmethod def to_tensor(x) -> np.ndarray: return np.array(x) @@ -2226,7 +2202,7 @@ def cast_to(x: np.ndarray, dtype: TensorDataType) -> np.ndarray: raise NotImplementedError @staticmethod - def check_weights(model: ov.Model, ref_ids: list[int]) -> None: + def check_weights(model: ov.Model, ref_ids: list[int], transpose_a=False) -> None: names = {op.get_friendly_name() for op in model.get_ordered_ops() if op.get_element_type() == ov.Type.i4} low_precision_nodes = {f"weights_{i}" for i in ref_ids} assert low_precision_nodes == names @@ -2441,12 +2417,24 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: + @pytest.fixture + def test_awq_scale_ref() -> dict[str, Tensor]: return { + "MatMul": Tensor(np.array([[10.337929], [6.4558873]], dtype=np.float32)), "MatMul_3": Tensor( np.array( [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], dtype=np.float32, + ).T + ), + "MatMul_2": Tensor( + np.array( + [[[1.9909902, 1.8632966, 1.5759803, 1.3974594, 1.2722752, 1.1779976, 1.1035581, 1.042768]]], + dtype=np.float32, ) - ) + ), } + + @pytest.fixture + def transpose_a_supported(self) -> bool: + return True diff --git a/tests/torch/function_hook/quantization/test_weights_compression.py b/tests/torch/function_hook/quantization/test_weights_compression.py index 7caad2aba73..4f01323b199 100644 --- a/tests/torch/function_hook/quantization/test_weights_compression.py +++ b/tests/torch/function_hook/quantization/test_weights_compression.py @@ -174,16 +174,19 @@ def forward(self, x): class AWQLinearModel(nn.Module): - def __init__(self, is_int8=False): + def __init__(self, non_mergable_pattern: bool = False, is_int8=False): super().__init__() self.is_int8 = is_int8 + self.non_mergable_pattern = non_mergable_pattern self.linear1 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) self.linear2 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) self.linear3 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) self.linear4 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) - self.linear5 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) - self.linear6 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) + + if not non_mergable_pattern: + self.linear5 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) + self.linear6 = self.get_linear_layer(0.01 * torch.arange(0, 64).reshape(8, 8) + 0.05, is_int8) def get_linear_layer(self, weights_data, is_int8): if not is_int8: @@ -200,9 +203,19 @@ def 
get_linear_layer(self, weights_data, is_int8): return linear_layer def forward(self, x): - node1 = self.linear1(x) - node2 = self.linear2(x) - node_multiply = node1 * node2 + if self.non_mergable_pattern: + node1 = self.linear1(x) + y = torch.relu(node1) + node_multiply = self.linear2(y) + else: + node1 = self.linear1(x) + node2 = self.linear2(x) + node_multiply = node1 * node2 + + if self.non_mergable_pattern: + node3 = self.linear3(node_multiply) + y = torch.relu(node3) + return self.linear4(y) node3 = self.linear3(node_multiply) node4 = self.linear4(node3) @@ -500,7 +513,7 @@ def get_SAM_PE_model() -> torch.nn.Module: return SAMPEModel() @staticmethod - def get_sequential_matmul_model() -> torch.nn.Module: + def get_sequential_matmul_model(transpose_a: bool) -> torch.nn.Module: return SequentialMatmulModel() @staticmethod @@ -516,8 +529,8 @@ def get_moe_model_for_test_scale_estimation(): return model @staticmethod - def get_awq_model() -> torch.nn.Module: - return AWQLinearModel() + def get_awq_model(non_mergable_pattern: bool) -> torch.nn.Module: + return AWQLinearModel(non_mergable_pattern=non_mergable_pattern) @staticmethod def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Module: @@ -536,7 +549,7 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: return cast_to(x, dtype) @staticmethod - def check_weights(model: torch.nn.Module, ref_ids: list[int]) -> None: + def check_weights(model: torch.nn.Module, ref_ids: list[int], transpose_a=False) -> None: all_names = model.get_weight_names_in_exec_order() low_precision_nodes = list(map(lambda i: all_names[i], ref_ids)) decompressed_modules = list( @@ -746,13 +759,44 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: + @pytest.fixture + def test_awq_scale_ref() -> dict[str, Tensor]: return { "linear3/linear/0": Tensor( - torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]) - ) + torch.tensor( + [[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]], + dtype=torch.float32, + ).T + ), + "linear2/linear/0": Tensor( + torch.tensor( + [ + [ + [ + 1.9909899235, + 1.8632963896, + 1.5759800673, + 1.3974593878, + 1.2722752094, + 1.1779977083, + 1.1035580635, + 1.0427680016, + ] + ] + ], + dtype=torch.float32, + ) + ), } + @staticmethod + def get_transposable_awq_model(transpose_a: bool, transpose_b: bool): + pass + + @pytest.fixture + def transpose_a_supported(self) -> bool: + return False + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) def test_half_precision_models(dtype): @@ -768,3 +812,7 @@ def test_half_precision_models(dtype): awq=True, dataset=nncf.Dataset([dict(inputs)]), ) + + @pytest.fixture + def tranpose_a_supported() -> bool: + return False diff --git a/tests/torch/fx/test_compress_weights.py b/tests/torch/fx/test_weights_compression.py similarity index 95% rename from tests/torch/fx/test_compress_weights.py rename to tests/torch/fx/test_weights_compression.py index 2d447b94d4c..139492d88f2 100644 --- a/tests/torch/fx/test_compress_weights.py +++ b/tests/torch/fx/test_weights_compression.py @@ -338,7 +338,7 @@ def get_SAM_PE_model() -> torch.fx.GraphModule: return exported_model @staticmethod - def get_sequential_matmul_model() -> torch.fx.GraphModule: + def get_sequential_matmul_model(transpose_a: bool) -> torch.fx.GraphModule: model = SequentialMatmulModel() ex_input = torch.ones([1, 4, 4], 
dtype=torch.float32) exported_model = get_torch_fx_model(model, ex_input) @@ -363,8 +363,8 @@ def get_moe_model_for_test_scale_estimation(): return exported_model @staticmethod - def get_awq_model() -> torch.fx.GraphModule: - model = AWQLinearModel() + def get_awq_model(non_mergable_pattern: bool) -> torch.fx.GraphModule: + model = AWQLinearModel(non_mergable_pattern=non_mergable_pattern) dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]] ex_input = torch.ones([1, 4, 8], dtype=torch.float32) exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes) @@ -393,7 +393,7 @@ def cast_to(x: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: return cast_to(x, dtype) @staticmethod - def check_weights(model: torch.fx.GraphModule, ref_ids: list[int]) -> None: + def check_weights(model: torch.fx.GraphModule, ref_ids: list[int], transpose_a=False) -> None: all_names = list(model.graph.nodes) low_precision_nodes = list(map(lambda i: all_names[i].name, ref_ids)) for node in model.graph.nodes: @@ -610,9 +610,37 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: + @pytest.fixture + def test_awq_scale_ref() -> dict[str, Tensor]: return { "linear_2": Tensor( - torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]) - ) + torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]).T + ), + "linear_1": Tensor( + torch.tensor( + [ + [ + [ + 1.9909899235, + 1.8632963896, + 1.5759800673, + 1.3974593878, + 1.2722752094, + 1.1779977083, + 1.1035580635, + 1.0427680016, + ] + ] + ], + dtype=torch.float32, + ) + ), } + + @staticmethod + def get_transposable_awq_model(transpose_a: bool, transpose_b: bool): + pass + + @pytest.fixture + def transpose_a_supported(self) -> bool: + return False diff --git a/tests/torch/test_node_utils.py b/tests/torch/test_node_utils.py new file mode 100644 index 00000000000..8e52f1c3050 --- /dev/null +++ b/tests/torch/test_node_utils.py @@ -0,0 +1,41 @@ +# Copyright (c) 2026 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import nncf +import nncf.torch.graph.operator_metatypes as op +from nncf.common.graph import NNCFNode +from nncf.torch.node_utils import get_activation_channel_axis + + +@pytest.mark.parametrize( + "metatype,port_id,ref_out", + ( + (op.PTLinearMetatype, 0, -1), + (op.PTConv2dMetatype, 0, 1), + (op.PTDepthwiseConv2dSubtype, 0, 1), + (op.PTConvTranspose2dMetatype, 0, 1), + (op.PTMatMulMetatype, 0, -1), + (op.PTMatMulMetatype, 1, -2), + (op.PTAddmmMetatype, 0, -1), + (op.PTAddmmMetatype, 1, -2), + (op.PTMatMulMetatype, 2, "error"), + (op.PTAddMetatype, 0, "error"), + ), +) +def test_get_activation_channel_axis(metatype, port_id, ref_out): + node = NNCFNode({"metatype": metatype}) + if ref_out == "error": + with pytest.raises(nncf.InternalError): + get_activation_channel_axis(node, port_id) + else: + assert get_activation_channel_axis(node, port_id) == ref_out