From 3122842958e964d0b273570857801051d0359cb7 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 3 Dec 2025 13:40:56 +0400 Subject: [PATCH 01/30] enable awq --- .../algorithms/weight_compression/awq.py | 71 +++++++++++-------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 508ad57060d..5c8f3000a81 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -183,8 +183,8 @@ def apply( prev_statistics = statistics[merge_node.node_name] scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics) - w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0]) - a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0]) + w_scale = fns.unsqueeze(scale, -1 - wp.reduction_axes[0]) + a_scale = fns.unsqueeze(1.0 / scale, -wp.reduction_axes[0]) scaled_weight = (weight * w_scale).astype(weight_dtype) self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight) @@ -194,9 +194,9 @@ def apply( merge_weight = self._backend_entity.get_weight(merge_node, port_id, model, graph) merge_weight = (merge_weight * a_scale).astype(weight_dtype) self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight) - a_scale = fns.transpose(a_scale) + a_scale = fns.moveaxis(a_scale, -1, -2) else: # for Act->Multiply->MatMul and Act->MatMul patterns scale inserted after Act as extra node - a_scale = fns.transpose(a_scale).astype(weight_dtype) + a_scale = fns.moveaxis(a_scale, -1, -2).astype(weight_dtype) next_nodes = graph.get_next_nodes(merge_node) source_node_output_port = graph.get_output_edges(merge_node)[0].output_port_id scale_insertion_command = self._backend_entity.scale_insertion_command( @@ -217,6 +217,8 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis s = s.astype(TensorDataType.float32) X = X.astype(TensorDataType.float32) + is_2d_weight = weight.ndim == 2 + assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1 reduction_axis = wp.reduction_axes[0] @@ -224,42 +226,54 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis if prev_statistics is not None and prev_weight is not None: prev_s, _ = process_stats(prev_statistics, self._subset_size) prev_s = prev_s.astype(TensorDataType.float32).max().item() - prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis) + prev_weight = fns.unsqueeze(prev_weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] + prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) + + if is_2d_weight: + s = fns.unsqueeze(s, 0) # [hidden_dim] -> [1, hidden_dim] + X = fns.unsqueeze(X, 0) # [hidden_dim, samples] -> [1, hidden_dim, samples] + weight = fns.unsqueeze(weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] + reduction_axis += 1 - top_k = max(int(s.shape[0] * self._percent_to_apply), 1) - topk_idxs = fns.argsort(-s)[:top_k] + top_k = max(int(s.shape[-1] * self._percent_to_apply), 1) + topk_idxs = fns.argsort(-s)[:, :top_k] group_size = config.group_size if group_size == -1: - group_size = s.shape[0] + group_size = s.shape[-1] groups_to_correct = set() - for idx in topk_idxs: - groups_to_correct.add(idx.data // group_size) + for expert_idx in range(topk_idxs.shape[0]): + for k_idx in range(topk_idxs.shape[1]): + idx = topk_idxs[expert_idx, k_idx].item() + 
group_idx = idx // group_size + groups_to_correct.add((expert_idx, group_idx)) groups_to_correct = list(groups_to_correct) - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 + if reduction_axis == 1: + # Weights + # 3D: [num_experts, hidden_dimension, out_features] -> [num_experts, out_features, hidden_dimension] + # 2D: [1, hidden_dimension, out_features] -> [1, out_features, hidden_dimension] + weight = fns.moveaxis(weight, -1, -2) + reduction_axis = weight.ndim - 1 - shape_vector = fns.mean(X, axis=1) + shape_vector = fns.mean(X, axis=-1) scale = fns.ones_like(shape_vector) awq_config = deepcopy(config) awq_config.group_size = -1 - for gi in groups_to_correct: + for expert_idx, gi in groups_to_correct: offset = gi * group_size - gscale = s[offset : offset + group_size] + gscale = s[expert_idx, offset : offset + group_size] + gweight = weight[expert_idx, :, offset : offset + group_size] + gacts = X[expert_idx, offset : offset + group_size, :] a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) a_max = 1e2 gscale = fns.clip(gscale, a_min=a_min, a_max=a_max) - gweight = weight[:, offset : offset + group_size] - gacts = X[offset : offset + group_size, :] - fp32_out = fns.matmul(gweight, gacts) min_diff = fns.max(fns.abs(fp32_out)) best_scale = None @@ -275,14 +289,16 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis # per channel magnitudes for the previous MatMul # mean(abs(prev_weight)) * max(abs((prev_activation))) * prev_weight.shape[reduction_axis] magnitudes = ( - (prev_w[offset : offset + group_size] / cur_scale) * prev_s * prev_weight.shape[reduction_axis] + (prev_w[expert_idx, offset : offset + group_size] / cur_scale) + * prev_s + * prev_weight.shape[reduction_axis] ) if magnitudes.max() >= threshold: cur_scale = AWQ._clamp_scale( magnitudes, threshold, cur_scale, - prev_w[offset : offset + group_size] + prev_w[expert_idx, offset : offset + group_size] * prev_s * prev_weight.shape[reduction_axis] / threshold, @@ -290,13 +306,9 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis weights_to_fake_quantize = gweight * cur_scale if not config.is_integer: - g_decompressed_weighs = float_quantize_dequantize_weight( - weights_to_fake_quantize, awq_config, reduction_axis - ) + g_decompressed_weighs = float_quantize_dequantize_weight(weights_to_fake_quantize, awq_config, -1) else: - g_decompressed_weighs = integer_quantize_dequantize_weight( - weights_to_fake_quantize, awq_config, reduction_axis - ) + g_decompressed_weighs = integer_quantize_dequantize_weight(weights_to_fake_quantize, awq_config, -1) sacts = gacts / fns.unsqueeze(cur_scale, 1) cur_out = fns.matmul(g_decompressed_weighs, sacts) @@ -307,7 +319,10 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis alpha += alpha_step if best_scale is not None: - scale.data[offset : offset + group_size] = best_scale.data + scale.data[expert_idx, offset : offset + group_size] = best_scale.data + + if is_2d_weight: + scale = fns.squeeze(scale, 0) # [1, hidden_dim] -> [hidden_dim] return scale From 59033c1ba1c3f4b42c4ad8c019d91bd181e4a6c3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 16:29:22 +0400 Subject: [PATCH 02/30] update scale unsqueeze logic --- .../algorithms/weight_compression/awq.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py 
b/src/nncf/quantization/algorithms/weight_compression/awq.py index 5c8f3000a81..a54289cbe70 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -183,8 +183,17 @@ def apply( prev_statistics = statistics[merge_node.node_name] scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics) - w_scale = fns.unsqueeze(scale, -1 - wp.reduction_axes[0]) - a_scale = fns.unsqueeze(1.0 / scale, -wp.reduction_axes[0]) + # For 3D weights, len(scale.shape)%2 == 0 whereas for 2D it is 1. This allows us to index + # from the last dim and not consider the batch dim in 3D case. + # Example: + # 3D weights: W shape = [B, M, N]; reduction_axes = 2 + # scale_shape = [M, N] -> len(scale.shape) = 2 -> 2 % 2 = 0 + # unsqueeze scale at -(0 + 2) = -2. + # 2D weights: W shape = [M, N]; reduction_axes = 1 + # scale_shape = [M] -> len(scale.shape) = 1 -> 1 % 2 = 1 + # unsqueeze scale at -(1 + 1) = -2. + w_scale = fns.unsqueeze(scale, -(len(scale.shape) % 2 + wp.reduction_axes[0])) + a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0]) scaled_weight = (weight * w_scale).astype(weight_dtype) self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight) @@ -243,11 +252,11 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis group_size = s.shape[-1] groups_to_correct = set() - for expert_idx in range(topk_idxs.shape[0]): + for batch_idx in range(topk_idxs.shape[0]): for k_idx in range(topk_idxs.shape[1]): - idx = topk_idxs[expert_idx, k_idx].item() + idx = topk_idxs[batch_idx, k_idx].item() group_idx = idx // group_size - groups_to_correct.add((expert_idx, group_idx)) + groups_to_correct.add((batch_idx, group_idx)) groups_to_correct = list(groups_to_correct) @@ -264,11 +273,11 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis awq_config = deepcopy(config) awq_config.group_size = -1 - for expert_idx, gi in groups_to_correct: + for batch_idx, gi in groups_to_correct: offset = gi * group_size - gscale = s[expert_idx, offset : offset + group_size] - gweight = weight[expert_idx, :, offset : offset + group_size] - gacts = X[expert_idx, offset : offset + group_size, :] + gscale = s[batch_idx, offset : offset + group_size] + gweight = weight[batch_idx, :, offset : offset + group_size] + gacts = X[batch_idx, offset : offset + group_size, :] a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) a_max = 1e2 @@ -289,7 +298,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis # per channel magnitudes for the previous MatMul # mean(abs(prev_weight)) * max(abs((prev_activation))) * prev_weight.shape[reduction_axis] magnitudes = ( - (prev_w[expert_idx, offset : offset + group_size] / cur_scale) + (prev_w[batch_idx, offset : offset + group_size] / cur_scale) * prev_s * prev_weight.shape[reduction_axis] ) @@ -298,7 +307,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis magnitudes, threshold, cur_scale, - prev_w[expert_idx, offset : offset + group_size] + prev_w[batch_idx, offset : offset + group_size] * prev_s * prev_weight.shape[reduction_axis] / threshold, @@ -319,7 +328,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis alpha += alpha_step if best_scale is not None: - scale.data[expert_idx, offset : offset + group_size] = best_scale.data + scale.data[batch_idx, offset : offset + group_size] = 
best_scale.data if is_2d_weight: scale = fns.squeeze(scale, 0) # [1, hidden_dim] -> [hidden_dim] From 608b2e2e62c167385e9061d3c78c22068d109a35 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:14:14 +0400 Subject: [PATCH 03/30] mergeable fix --- .../algorithms/weight_compression/awq.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index a54289cbe70..ba5cfb43200 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -231,19 +231,21 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1 reduction_axis = wp.reduction_axes[0] - prev_s, prev_w = None, None - if prev_statistics is not None and prev_weight is not None: - prev_s, _ = process_stats(prev_statistics, self._subset_size) - prev_s = prev_s.astype(TensorDataType.float32).max().item() - prev_weight = fns.unsqueeze(prev_weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] - prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) - if is_2d_weight: s = fns.unsqueeze(s, 0) # [hidden_dim] -> [1, hidden_dim] X = fns.unsqueeze(X, 0) # [hidden_dim, samples] -> [1, hidden_dim, samples] weight = fns.unsqueeze(weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] + prev_weight = ( + fns.unsqueeze(prev_weight, 0) if prev_weight else None + ) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] reduction_axis += 1 + prev_s, prev_w = None, None + if prev_statistics is not None and prev_weight is not None: + prev_s, _ = process_stats(prev_statistics, self._subset_size) + prev_s = prev_s.astype(TensorDataType.float32).max().item() + prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) + top_k = max(int(s.shape[-1] * self._percent_to_apply), 1) topk_idxs = fns.argsort(-s)[:, :top_k] From 1948c959d7dca28ff9c5773fec83711d5e5a2299 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:24:10 +0400 Subject: [PATCH 04/30] fix the fix --- src/nncf/quantization/algorithms/weight_compression/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index ba5cfb43200..e36915eb3c4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -244,7 +244,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis if prev_statistics is not None and prev_weight is not None: prev_s, _ = process_stats(prev_statistics, self._subset_size) prev_s = prev_s.astype(TensorDataType.float32).max().item() - prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis + 1) + prev_w = fns.mean(fns.abs(prev_weight), axis=reduction_axis) top_k = max(int(s.shape[-1] * self._percent_to_apply), 1) topk_idxs = fns.argsort(-s)[:, :top_k] From b5bbe159f993d38f65dc56d704510b7ddf1913fb Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:42:19 +0400 Subject: [PATCH 05/30] add ignored nodes awq test for torch and torch fx --- .../template_test_weights_compression.py | 22 ++++++---- .../quantization/test_weights_compression.py | 40 +++++++++++++++++-- tests/torch2/fx/test_compress_weights.py | 16 +++++--- 3 files 
changed, 61 insertions(+), 17 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index ec5c1a4e710..d63415f6cd0 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -382,8 +382,11 @@ def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int @staticmethod @abstractmethod - def get_awq_model() -> TModel: - "Returns a backend model for test_awq_with_ignored_scope." + def get_awq_model(is_3d_weights) -> TModel: + """ + Returns a backend model for test_awq_with_ignored_scope. + :param is_3d_weights: Whether the model has 3D weights. + """ @staticmethod @abstractmethod @@ -402,16 +405,19 @@ def get_num_int4_group_sizes(model: TModel) -> dict[int, int]: @staticmethod @abstractmethod - def get_ignored_scope_name() -> str: + def get_ignored_scope_name(is_3d_weights) -> str: "Returns ignored scope name for test_awq_with_ignored_scope." - def test_awq_with_ignored_scope(self, mocker): - model = self.get_awq_model() + @pytest.mark.parametrize("is_3d_weights", [True, False]) + def test_awq_with_ignored_scope(self, mocker, is_3d_weights): + model = self.get_awq_model(is_3d_weights) sz = 8 n_samples = 10 + input_shape = [2, 8, sz] + dataset = Dataset( - [self.to_tensor(np.ones([1, 8, sz], dtype=np.float32)) for i in range(n_samples)], + [self.to_tensor(np.ones(input_shape, dtype=np.float32)) for i in range(n_samples)], self.get_transform_func(), ) @@ -423,12 +429,12 @@ def test_awq_with_ignored_scope(self, mocker): group_size=-1, dataset=dataset, awq=True, - ignored_scope=IgnoredScope(names=[self.get_ignored_scope_name()]), + ignored_scope=IgnoredScope(names=[self.get_ignored_scope_name(is_3d_weights)]), ) int4_ref_num_compressed = 4 # last MatMul is always int8; one - is ignored; total 6 matmuls int4_num_nodes = self.get_num_int4_nodes(compressed_model) - assert int4_num_nodes == int4_ref_num_compressed + assert int4_num_nodes == int4_ref_num_compressed, int4_num_nodes def test_rope_weight_compression(self): model = self.get_RoPE_model() diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index d7db382df83..12fa717d441 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -213,6 +213,34 @@ def forward(self, x): return node6 +class AWQLinearModel3D(nn.Module): + def __init__(self, is_int8=False): + super().__init__() + self.is_int8 = is_int8 + + weight_data = 0.01 * torch.arange(0, 2 * 8 * 8).reshape(2, 8, 8) + 0.05 + + self.w1 = nn.Parameter(weight_data) + self.w2 = nn.Parameter(weight_data) + self.w3 = nn.Parameter(weight_data) + self.w4 = nn.Parameter(weight_data) + self.w5 = nn.Parameter(weight_data) + self.w6 = nn.Parameter(weight_data) + + def forward(self, x): + node1 = torch.bmm(x, self.w1) + node2 = torch.bmm(x, self.w2) + node_multiply = node1 * node2 + + node3 = torch.bmm(node_multiply, self.w3) + node4 = torch.bmm(node3, self.w4) + node5 = torch.bmm(node3, self.w5) + node_multiply_2 = node4 * node5 + + node6 = torch.bmm(node_multiply_2, self.w6) + return node6 + + class FunctionalModel(torch.nn.Module): def __init__(self): super().__init__() @@ -516,8 +544,10 @@ def get_moe_model_for_test_scale_estimation(): return model @staticmethod - def get_awq_model() -> 
torch.nn.Module: - return AWQLinearModel() + def get_awq_model(is_3d_weights) -> torch.nn.Module: + if not is_3d_weights: + return AWQLinearModel() + return AWQLinearModel3D() @staticmethod def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Module: @@ -715,8 +745,10 @@ def get_decompressed_weight(compressed_model: torch.nn.Module, input: torch.Tens return Tensor(unpacked_w) @staticmethod - def get_ignored_scope_name() -> str: - return "linear5/linear/0" + def get_ignored_scope_name(is_3d_weights) -> str: + if not is_3d_weights: + return "linear5/linear/0" + return "/bmm/4" @staticmethod def get_num_int4_nodes(model: torch.nn.Module) -> int: diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index 082edaea787..d839242d9b8 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -40,6 +40,7 @@ from tests.torch2.function_hook.quantization.test_weights_compression import UNSUPPORTED_MODES from tests.torch2.function_hook.quantization.test_weights_compression import AWQActLinearModel from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel +from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel3D from tests.torch2.function_hook.quantization.test_weights_compression import ConvolutionModel from tests.torch2.function_hook.quantization.test_weights_compression import DifferentChannelSizeMatmulModel from tests.torch2.function_hook.quantization.test_weights_compression import DTypeModel @@ -364,10 +365,13 @@ def get_moe_model_for_test_scale_estimation(): return exported_model @staticmethod - def get_awq_model() -> torch.fx.GraphModule: - model = AWQLinearModel() + def get_awq_model(is_3d_weights) -> torch.fx.GraphModule: + if not is_3d_weights: + model = AWQLinearModel() + else: + model = AWQLinearModel3D() dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]] - ex_input = torch.ones([1, 4, 8], dtype=torch.float32) + ex_input = torch.ones([2, 4, 8], dtype=torch.float32) exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes) return exported_model @@ -574,8 +578,10 @@ def get_decompressed_weight(compressed_model: torch.fx.GraphModule, input: torch return Tensor(unpacked_w) @staticmethod - def get_ignored_scope_name() -> str: - return "linear_4" + def get_ignored_scope_name(is_3d_weights) -> str: + if not is_3d_weights: + return "linear_4" + return "bmm_4" @staticmethod def get_num_int4_nodes(model: torch.fx.GraphModule) -> int: From 10bdd6fe1f5994396948332e452dfefba640ea87 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 5 Dec 2025 20:52:52 +0400 Subject: [PATCH 06/30] add remaining awq tests --- .../template_test_weights_compression.py | 7 +++-- .../quantization/test_weights_compression.py | 30 +++++++++++++++++-- tests/torch2/fx/test_compress_weights.py | 30 +++++++++++++++++-- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index d63415f6cd0..b2bcee4da2e 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -482,11 +482,12 @@ def test_sam_pe_weight_compression(self): def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: "Returns reference for test_awq_scale_reference." 
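Note on the reference scales exercised by `test_awq_scale_reference` below: they rest on the core AWQ identity that folding a per-channel scale s into the weight's hidden dimension and 1/s into the matching activation channels leaves the MatMul output numerically unchanged, for 2D weights and batched 3D weights alike. A minimal numpy sketch of that invariant (illustrative shapes only, not the NNCF `fns` API):

```python
import numpy as np

rng = np.random.default_rng(0)
w = rng.normal(size=(2, 8, 8)).astype(np.float32)  # [batch, out_features, hidden_dim]
x = rng.normal(size=(2, 8, 4)).astype(np.float32)  # [batch, hidden_dim, samples]
s = rng.uniform(0.5, 2.0, size=(2, 8)).astype(np.float32)  # per-channel AWQ scale

w_scaled = w * s[:, None, :]  # fold s into the weight's hidden_dim channels
x_scaled = x / s[:, :, None]  # fold the inverse scale into the activation side
assert np.allclose(w @ x, w_scaled @ x_scaled, atol=1e-4)  # output is preserved
```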
- def test_awq_scale_reference(self, monkeypatch, mocker): + @pytest.mark.parametrize("is_3d_weights", [True, False]) + def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights): monkeypatch.setattr("nncf.quantization.algorithms.weight_compression.algorithm.AWQ", SpyAWQ) - model = self.get_awq_model() + model = self.get_awq_model(is_3d_weights) - input = 0.01 * np.arange(0, 4 * 8, dtype=np.float32).reshape(1, 4, 8) + 0.02 + input = 0.01 * np.arange(0, 2 * 4 * 8, dtype=np.float32).reshape(2, 4, 8) + 0.02 input = self.to_tensor(input) dataset = Dataset([input], self.get_transform_func()) diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 12fa717d441..d73c2f41a9b 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -781,8 +781,34 @@ def get_num_multiply_from_awq(model): def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: return { "linear3/linear/0": Tensor( - torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]) - ) + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + "/bmm/2": Tensor( + torch.tensor( + [ + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), } diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index d839242d9b8..70bf272752d 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -620,6 +620,32 @@ def get_num_multiply_from_awq(model): def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: return { "linear_2": Tensor( - torch.tensor([[1.226455, 1.205499, 1.141340, 1.097436, 1.064355, 1.037971, 1.016118, 0.997526]]) - ) + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + "bmm_2": Tensor( + torch.tensor( + [ + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), } From 9f511a4748b8b93e589a09813cb3f5cc0211880d Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Sun, 7 Dec 2025 13:50:54 +0400 Subject: [PATCH 07/30] Update awq.py --- src/nncf/quantization/algorithms/weight_compression/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index e36915eb3c4..0c7645841a1 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -236,7 +236,7 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis X = fns.unsqueeze(X, 0) # [hidden_dim, samples] -> [1, hidden_dim, samples] weight = fns.unsqueeze(weight, 0) # [out_features, hidden_dim] -> [1, out_features, hidden_dim] prev_weight = ( - fns.unsqueeze(prev_weight, 0) if prev_weight else None + fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None ) # [out_features, hidden_dim] -> [1, out_features, 
hidden_dim] reduction_axis += 1 From b62074f0bc043af382514c6cf1248b556f042f8c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 07:37:14 +0400 Subject: [PATCH 08/30] add 3d matmul model to onnx --- tests/onnx/quantization/test_weights_compression.py | 9 +++++++-- .../native/quantization/test_weights_compression.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index ddf441272b5..2121f6a0e54 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math from collections import defaultdict from dataclasses import dataclass from typing import Any, Callable, Optional @@ -696,7 +697,7 @@ def get_num_multiply_from_awq(model: onnx.ModelProto) -> int: return awq_num @staticmethod - def get_awq_model() -> onnx.ModelProto: + def get_awq_model(is_3d_weights) -> onnx.ModelProto: """ Builds a model to be used in the following tests: - TemplateWeightCompression.test_awq_with_ignored_scope() @@ -709,7 +710,11 @@ def get_awq_model() -> onnx.ModelProto: x = mb.add_input("input", (1, None, 8)) output = mb.add_output("output", (1, None, 8)) - w_data = 0.01 * np.arange(0, 64, dtype=np.float32).reshape(8, 8) + 0.05 + weight_shape = (8, 8) + if is_3d_weights: + weight_shape = (2, 8, 8) + + w_data = 0.01 * np.arange(0, math.prod(weight_shape), dtype=np.float32).reshape(weight_shape) + 0.05 w_data = w_data.T num_blocks = 2 diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index c21e5b7d46f..9bf7ab9b3b6 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2083,7 +2083,7 @@ def get_moe_model_for_test_scale_estimation(): return SimpleMoEModel().ov_model @staticmethod - def get_awq_model() -> ov.Model: + def get_awq_model(is_3d_weights) -> ov.Model: return AWQMatmulModel().ov_model @staticmethod From b364b4def3ff39e89407e903883ff89254105d9f Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 08:10:45 +0400 Subject: [PATCH 09/30] fix onnx model test --- tests/onnx/quantization/test_weights_compression.py | 11 ++++++----- tests/torch2/fx/test_compress_weights.py | 5 ++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index 2121f6a0e54..e572dec13eb 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -707,12 +707,13 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto: """ mb = ModelBuilder() - x = mb.add_input("input", (1, None, 8)) - output = mb.add_output("output", (1, None, 8)) - weight_shape = (8, 8) if is_3d_weights: - weight_shape = (2, 8, 8) + # The first and last dimension are later transposed + weight_shape = (8, 8, 2) + + x = mb.add_input("input", (2, None, 8)) + output = mb.add_output("output", (2, None, 8)) w_data = 0.01 * np.arange(0, math.prod(weight_shape), dtype=np.float32).reshape(weight_shape) + 0.05 w_data = w_data.T @@ -765,7 +766,7 @@ def get_num_int4_group_sizes(model: onnx.ModelProto) -> dict[int, int]: return num @staticmethod - def get_ignored_scope_name() -> str: + def 
get_ignored_scope_name(is_3d_weights) -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) @staticmethod diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index 70bf272752d..ab2870f9754 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -366,9 +366,8 @@ def get_moe_model_for_test_scale_estimation(): @staticmethod def get_awq_model(is_3d_weights) -> torch.fx.GraphModule: - if not is_3d_weights: - model = AWQLinearModel() - else: + model = AWQLinearModel() + if is_3d_weights: model = AWQLinearModel3D() dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]] ex_input = torch.ones([2, 4, 8], dtype=torch.float32) From 42496265a41a690fc17e545884f53d9d436e6569 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 08:30:48 +0400 Subject: [PATCH 10/30] fix some tests --- .../template_test_weights_compression.py | 4 +- .../quantization/test_weights_compression.py | 49 +++++++++++--- .../quantization/test_weights_compression.py | 66 ++++++++++--------- tests/torch2/fx/test_compress_weights.py | 66 ++++++++++--------- 4 files changed, 112 insertions(+), 73 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index b2bcee4da2e..b75cc11ba3e 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -479,7 +479,7 @@ def test_sam_pe_weight_compression(self): @staticmethod @abstractmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: "Returns reference for test_awq_scale_reference." @pytest.mark.parametrize("is_3d_weights", [True, False]) @@ -502,7 +502,7 @@ def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights): ) assert spy_instance is not None for node_name, scales in spy_instance._scale_per_target_node.items(): - assert fns.allclose(scales, self.get_reference_for_test_awq_scale_reference()[node_name]) + assert fns.allclose(scales, self.get_reference_for_test_awq_scale_reference(is_3d_weights)[node_name]) @pytest.mark.parametrize( ["group_size", "fallback_mode", "min_adjusted_group_size", "expected_outcome"], diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index e572dec13eb..bf3e4c3b539 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -770,15 +770,46 @@ def get_ignored_scope_name(is_3d_weights) -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) 
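A note on the `(8, 8, 2)` weight shape used in the ONNX builders above: numpy's `.T` reverses all axes of an N-D array, so the transposed constant comes out in the `[2, 8, 8]` batch-first layout the 3D MatMuls expect. A quick illustration:

```python
import numpy as np

w = 0.01 * np.arange(2 * 8 * 8, dtype=np.float32).reshape(8, 8, 2) + 0.05
assert w.T.shape == (2, 8, 8)  # .T reverses every axis, not just the last two
```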
@staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "MatMul_3": Tensor( - np.array( - [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], - dtype=np.float32, - ).T - ) - } + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "MatMul_3": Tensor( + np.array( + [[1.4228648, 1.3474456, 1.1335096, 1.001522, 0.90938693, 0.84022623, 0.78575736, 0.7413683]], + dtype=np.float32, + ).T + ) + }, + { + "MatMul_3": Tensor( + np.array( + [ + [ + [1.119726], + [1.1012304], + [1.0438583], + [1.006067], + [0.97812414], + [0.95607865], + [0.9379444], + [0.922586], + ], + [ + [0.99698645], + [0.9808075], + [0.9307146], + [0.8974796], + [0.87281394], + [0.8533093], + [0.8372402], + [0.82361573], + ], + ], + dtype=np.float32, + ) + ) + }, + ][is_3d_weights] @staticmethod def get_transform_func() -> Optional[Callable[..., Any]]: diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index d73c2f41a9b..0ede375f739 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -778,38 +778,42 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "linear3/linear/0": Tensor( - torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) - ), - "/bmm/2": Tensor( - torch.tensor( - [ + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "linear3/linear/0": Tensor( + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + }, + { + "/bmm/2": Tensor( + torch.tensor( [ - [1.109999], - [1.108342], - [1.102878], - [1.097587], - [1.092457], - [1.087481], - [1.082649], - [1.077955], - ], - [ - [0.130212], - [0.129630], - [0.127712], - [0.125842], - [0.124017], - [0.122236], - [0.120498], - [0.118800], - ], - ] - ) - ), - } + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), + }, + ][is_3d_weights] @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index ab2870f9754..e7788ec365d 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -616,35 +616,39 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "linear_2": Tensor( - torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) - ), - "bmm_2": Tensor( - torch.tensor( - [ + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "linear_2": Tensor( + torch.tensor([[1.422865, 1.347446, 1.133510, 1.001522, 0.909387, 0.840226, 0.785757, 0.741368]]) + ), + }, + { + "bmm_2": Tensor( + torch.tensor( [ - [1.109999], - [1.108342], - [1.102878], - [1.097587], - [1.092457], - [1.087481], - [1.082649], - [1.077955], - ], - [ - [0.130212], - [0.129630], - [0.127712], - [0.125842], - [0.124017], 
- [0.122236], - [0.120498], - [0.118800], - ], - ] - ) - ), - } + [ + [1.109999], + [1.108342], + [1.102878], + [1.097587], + [1.092457], + [1.087481], + [1.082649], + [1.077955], + ], + [ + [0.130212], + [0.129630], + [0.127712], + [0.125842], + [0.124017], + [0.122236], + [0.120498], + [0.118800], + ], + ] + ) + ), + }, + ][is_3d_weights] From cc441791a1b407508778ff15ce2bc86b7a85933b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 14:23:22 +0400 Subject: [PATCH 11/30] add ov model --- .../quantization/test_weights_compression.py | 43 +++++++++++++++---- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 9bf7ab9b3b6..b4092cd85f4 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -64,6 +64,7 @@ from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel from tests.openvino.native.models import AWQMatmulModel +from tests.openvino.native.models import AWQMatmulModel3D from tests.openvino.native.models import AWQModel_fp16_overlow from tests.openvino.native.models import DifferentChannelSizeMatmulModel from tests.openvino.native.models import GatherAndMatmulShareData @@ -2084,6 +2085,8 @@ def get_moe_model_for_test_scale_estimation(): @staticmethod def get_awq_model(is_3d_weights) -> ov.Model: + if is_3d_weights: + return AWQMatmulModel3D().ov_model return AWQMatmulModel().ov_model @staticmethod @@ -2322,12 +2325,36 @@ def get_num_multiply_from_awq(model): return awq_num @staticmethod - def get_reference_for_test_awq_scale_reference() -> dict[str, Tensor]: - return { - "MatMul_3": Tensor( - np.array( - [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], - dtype=np.float32, + def get_reference_for_test_awq_scale_reference(is_3d_weights) -> dict[str, Tensor]: + return [ + { + "MatMul_3": Tensor( + np.array( + [[1.4228648, 1.3474456, 1.1335096, 1.001522, 0.90938693, 0.84022623, 0.78575736, 0.7413683]], + dtype=np.float32, + ) ) - ) - } + }, + { + "MatMul_3": Tensor( + np.array( + [ + [[1.2264546, 1.2054994, 1.1413403, 1.0974358, 1.0643553, 1.0379708, 1.0161183, 0.9975262]], + [ + [ + 0.46889508, + 0.4599662, + 0.4321173, + 0.40815368, + 0.387274, + 0.36888793, + 0.35255024, + 0.33791822, + ] + ], + ], + dtype=np.float32, + ) + ) + }, + ][is_3d_weights] From a934313c2c941a747ffbe40f8cffd87cbb42682e Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 15:27:13 +0400 Subject: [PATCH 12/30] add model --- tests/openvino/native/models.py | 43 ++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 0e0ef99b40a..d2651491761 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1004,7 +1004,7 @@ def get_weights(weights_data, is_int8, name): return (qw - zp) * scale def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): - input_node = opset.parameter([1] * n_extra_dims + [-1, 8], name="Input_1") + input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1") weights_data1 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05 weights1 = self.get_weights(weights_data1, is_int8, name="weights_1") @@ -1040,6 +1040,47 @@ def _create_ov_model(self, n_extra_dims: int = 1, 
is_int8=False): return model +class AWQMatmulModel3D(OVReferenceModel): + """ + 3D-weights version of AWQMatmulModel. + All weight tensors are [2, 8, 8]; input is [2, L, 8]. + """ + + def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): + input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1") + + def make_weights(name): + w = 0.01 * np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) + 0.05 + return opset.constant(w, dtype=np.float32, name=name) + + weights1 = make_weights("weights_1") + node1 = opset.matmul(input_node, weights1, transpose_a=False, transpose_b=True, name="MatMul_1") + + weights2 = make_weights("weights_2") + node2 = opset.matmul(input_node, weights2, transpose_a=False, transpose_b=True, name="MatMul_2") + + node_multiply = opset.multiply(node1, node2, name="Multiply") + + weights3 = make_weights("weights_3") + node3 = opset.matmul(node_multiply, weights3, transpose_a=False, transpose_b=True, name="MatMul_3") + + weights4 = make_weights("weights_4") + node4 = opset.matmul(node3, weights4, transpose_a=False, transpose_b=True, name="MatMul_4") + + weights5 = make_weights("weights_5") + node5 = opset.matmul(node3, weights5, transpose_a=False, transpose_b=True, name="MatMul_5") + + node_multiply_2 = opset.multiply(node4, node5, name="Multiply_2") + + weights6 = make_weights("weights_6") + node6 = opset.matmul(node_multiply_2, weights6, transpose_a=False, transpose_b=True, name="MatMul_6") + + result = opset.result(node6, name="Result") + result.get_output_tensor(0).set_names(set(["Result"])) + model = ov.Model([result], [input_node]) + return model + + class AWQActMatmulModel(OVReferenceModel): """ Model for testing AWQ algorithm. Contains MatMul->Multiply->MatMul pattern. From 48c499c61c085d261cbf3a97c9dd29101fa99067 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 15:45:04 +0400 Subject: [PATCH 13/30] xfail openvino test --- .../quantization/test_weights_compression.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b4092cd85f4..c8942436276 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2272,7 +2272,19 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): @pytest.mark.parametrize("is_moe", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]) @pytest.mark.parametrize("check_sampling_activation_stats_flow", [False, True]) def test_scale_estimation(self, mocker, is_moe, check_sampling_activation_stats_flow): - super().test_scale_estimation(mocker, is_moe, check_sampling_activation_stats_flow) + return super().test_scale_estimation(mocker, is_moe, check_sampling_activation_stats_flow) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + def test_awq_with_ignored_scope(self, mocker, is_3d_weights): + return super().test_awq_with_ignored_scope(mocker, is_3d_weights) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights): + return super().test_awq_scale_reference(monkeypatch, mocker, is_3d_weights) @staticmethod def get_orig_weight(model: ov.Model) -> Tensor: From ba8b725b5f392b16d76e791a5265c7e0a5f21cad Mon Sep 
17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 16:18:07 +0400 Subject: [PATCH 14/30] fix condition for is_mergeable --- .../algorithms/weight_compression/awq.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 0c7645841a1..8a2def2d8f9 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -159,17 +159,25 @@ def apply( weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm continue - is_mergeable = self._backend_entity.is_node_with_weights(merge_node, graph) - - nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") - _, weight_port_id = weight_data[0] + weight = self._backend_entity.get_weight( wp.node_with_weight, weight_port_id, model, graph ) # get_const_value(wp.weight_node) weight_dtype = weight.dtype weight = weight.astype(TensorDataType.float32) + # returns an empty list if no weights are present + mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph) + merge_node_weight_dims = [ + len(self._backend_entity.get_weight_shape(merge_node, port_id, graph)) + for _, port_id in mergeable_node_weight_data + ] + # if no weights are present, it checks membership with empty list which is False. + is_mergeable = len(weight.shape) in merge_node_weight_dims + + nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") + if is_data_free: scale = self._data_free_step(weight, 1 - wp.reduction_axes[0]) else: From 667716e8c5fc364863593425d4988679dd6022a2 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 16:41:38 +0400 Subject: [PATCH 15/30] fix mergeable issue --- .../algorithms/weight_compression/awq.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 8a2def2d8f9..8a4fcbd64c7 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -167,14 +167,14 @@ def apply( weight_dtype = weight.dtype weight = weight.astype(TensorDataType.float32) - # returns an empty list if no weights are present - mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph) - merge_node_weight_dims = [ - len(self._backend_entity.get_weight_shape(merge_node, port_id, graph)) - for _, port_id in mergeable_node_weight_data - ] - # if no weights are present, it checks membership with empty list which is False. 
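For context on the mergeability condition this hunk and the next one iterate on: the activation scale can only be folded into the preceding node when that node actually owns a weight of the same rank, and testing rank membership against an empty list naturally yields False when there are no weights. A standalone sketch of that predicate (hypothetical `can_merge` helper, not the backend API):

```python
def can_merge(weight_shape: tuple, merge_node_weight_shapes: list) -> bool:
    # Fold the scale into the producer only if it has a weight of the
    # same rank; an empty list (no weights) makes the membership test False.
    return len(weight_shape) in [len(shape) for shape in merge_node_weight_shapes]

assert can_merge((2, 8, 8), [(2, 8, 8)])  # 3D weight, 3D producer weight
assert not can_merge((2, 8, 8), [(8, 8)])  # rank mismatch: insert an extra scale node instead
assert not can_merge((8, 8), [])  # producer has no weights at all
```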
- is_mergeable = len(weight.shape) in merge_node_weight_dims + is_mergeable = False + if self._backend_entity.is_node_with_weights(merge_node, graph): + mergeable_node_weight_data = self._backend_entity.get_weight_names_and_port_ids(merge_node, graph) + merge_node_weight_dims = [ + len(self._backend_entity.get_weight_shape(merge_node, port_id, graph)) + for _, port_id in mergeable_node_weight_data + ] + is_mergeable = len(weight.shape) in merge_node_weight_dims nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") From 2122b11d186e957f7e6967c540fb5ddd745f272b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 16:45:25 +0400 Subject: [PATCH 16/30] add act model for openvino; include data free test and call max variance test --- .../template_test_weights_compression.py | 20 +++++---- tests/openvino/native/models.py | 45 ++++++++++++++++++- .../quantization/test_weights_compression.py | 5 ++- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index b75cc11ba3e..6866178d160 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -354,7 +354,7 @@ def test_scale_estimation_outlier_channel_has_lowest_error(self, mocker): # AWQ Tests @staticmethod @abstractmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): "Returns a backend model for test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul." @staticmethod @@ -366,13 +366,16 @@ def get_num_multiply_from_awq(model: TModel) -> int: def int4_mode(self, request): return None + @pytest.mark.parametrize("is_3d_weights", [True, False]) @pytest.mark.parametrize("with_multiply", (True, False)) - def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(self, int4_mode, with_multiply, mocker): + def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul( + self, int4_mode, with_multiply, is_3d_weights, mocker + ): n_layers = 8 n_awq_target = n_layers - 1 # first MatMul is always int8 - model = self.get_awq_act_model(with_multiply, n_layers) + model = self.get_awq_act_model(is_3d_weights, with_multiply, n_layers) - dataset = Dataset([self.to_tensor(np.ones([1, 8, 8], dtype=np.float32))], self.get_transform_func()) + dataset = Dataset([self.to_tensor(np.ones([2, 8, 8], dtype=np.float32))], self.get_transform_func()) with SpyWeightCompressionStatisticsContext(mocker): model = compress_weights(model, mode=int4_mode, ratio=1.0, group_size=2, dataset=dataset, awq=True) @@ -626,14 +629,15 @@ def test_group_size_fallback_modes( f"Expected {ref_num_group_sizes} group size values, but got {num_group_sizes}." 
) - @pytest.mark.parametrize("dataset", [None, np.ones([1, 8, 8], dtype=np.float32)]) + @pytest.mark.parametrize("is_3d_weights", [True, False]) + @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)]) @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False]) - def test_data_free_awq(self, dataset, prefer_data_aware_scaling, mocker): - input_data = np.ones([1, 8, 8], dtype=np.float32) + def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker): + input_data = np.ones([2, 8, 8], dtype=np.float32) n_layers = 8 n_awq_target = n_layers - 1 # first MatMul is always int8 - model = self.get_awq_act_model(True, n_layers) + model = self.get_awq_act_model(is_3d_weights, True, n_layers) model = self.wrap_model(model, input_data) if dataset is not None: diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index d2651491761..28405cb04ae 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -1046,7 +1046,7 @@ class AWQMatmulModel3D(OVReferenceModel): All weight tensors are [2, 8, 8]; input is [2, L, 8]. """ - def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False): + def _create_ov_model(self, n_extra_dims=1, is_int8=False): input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1") def make_weights(name): @@ -1087,7 +1087,7 @@ class AWQActMatmulModel(OVReferenceModel): """ def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): - input_node = opset.parameter([1, 8, 8], name="Input_1") + input_node = opset.parameter([2, 8, 8], name="Input_1") weights_data = np.arange(0, 64).reshape(8, 8) - 32 weights = AWQMatmulModel.get_weights(weights_data, is_int8, name="weights_emb") out_node = opset.matmul(input_node, weights, transpose_a=False, transpose_b=True, name="MatMul_emb") @@ -1125,6 +1125,47 @@ def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): return model +class AWQActMatmulModel3D(OVReferenceModel): + """ + Model for testing AWQ algorithm with 3D weights. Contains MatMul->Multiply->MatMul pattern. 
+ """ + + def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8): + input_node = opset.parameter([2, 8, 8], name="Input_1") + + def make_weights(name: str): + w = np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) - 32 + return opset.constant(w, dtype=np.float32, name=name) + + weights_emb = make_weights("weights_emb") + out_node = opset.matmul(input_node, weights_emb, transpose_a=False, transpose_b=True, name="MatMul_emb") + + for i in range(n_layers): + weights1 = make_weights(f"weights_1_{i}") + mm1 = opset.matmul(out_node, weights1, transpose_a=False, transpose_b=True, name=f"MatMul_1_{i}") + node1 = opset.relu(mm1, name=f"ReLU_{i}") + + if with_multiply: + weights2 = make_weights(f"weights_2_{i}") + mm2 = opset.matmul(out_node, weights2, transpose_a=False, transpose_b=True, name=f"MatMul_2_{i}") + alpha = opset.constant(np.array([1.5], dtype=np.float32), dtype=np.float32) + lambda_value = opset.constant(np.array([1.5], dtype=np.float32), dtype=np.float32) + node2 = opset.selu(mm2, alpha, lambda_value, name=f"SeLU_{i}") + node_multiply = opset.multiply(node1, node2, name=f"Multiply_{i}") + else: + node_multiply = node1 + + out_node = node_multiply + + weights_lm_head = make_weights("weights_lm_head") + out_node = opset.matmul(out_node, weights_lm_head, transpose_a=False, transpose_b=True, name="MatMul_lm_head") + + result = opset.result(out_node, name="Result") + result.get_output_tensor(0).set_names(set(["Result"])) + model = ov.Model([result], [input_node]) + return model + + class AWQModel_fp16_overlow(OVReferenceModel): """ Model for testing AWQ algorithm with fp16 overflow fix. diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index c8942436276..dd9f90d4bc7 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -63,6 +63,7 @@ from tests.cross_fw.test_templates.template_test_weights_compression import TemplateWeightCompression from tests.openvino.native.common import get_actual_reference_for_current_openvino from tests.openvino.native.models import AWQActMatmulModel +from tests.openvino.native.models import AWQActMatmulModel3D from tests.openvino.native.models import AWQMatmulModel from tests.openvino.native.models import AWQMatmulModel3D from tests.openvino.native.models import AWQModel_fp16_overlow @@ -2094,7 +2095,9 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> ov.Model: return DifferentChannelSizeMatmulModel(channel_sizes=channel_sizes).ov_model @staticmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): + if is_3d_weights: + return AWQActMatmulModel3D(with_multiply=with_multiply, n_layers=n_layers).ov_model return AWQActMatmulModel(with_multiply=with_multiply, n_layers=n_layers).ov_model @staticmethod From 3e92f47bc93b68e32fa9b9e5dbab3b8bb6ff8f74 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 18:42:35 +0400 Subject: [PATCH 17/30] add torch and torch fx act linear model tests --- .../quantization/test_weights_compression.py | 31 ++++++++++++++++++- tests/torch2/fx/test_compress_weights.py | 7 +++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 0ede375f739..e735f24c1cb 100644 --- 
a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -173,6 +173,33 @@ def forward(self, x): return out +class AWQActLinearModel3D(nn.Module): + def __init__(self, with_multiply=False, n_layers=8): + super().__init__() + self.with_multiply = with_multiply + self.n_layers = n_layers + + base_w = torch.arange(0, 2 * 8 * 8, dtype=torch.float32).reshape(2, 8, 8) - 32.0 + self.emb_weight = nn.Parameter(base_w.clone()) + self.lm_head_weight = nn.Parameter(base_w.clone()) + n_params = 2 * n_layers if with_multiply else n_layers + self.layer_weights = nn.ParameterList(nn.Parameter(base_w) for _ in range(n_params)) + + def forward(self, x): + out = torch.bmm(x, self.emb_weight) + + for i in range(self.n_layers): + node1 = F.relu(torch.bmm(out, self.layer_weights[i])) + if self.with_multiply: + node2 = torch.selu(torch.bmm(out, self.layer_weights[i])) + out = node1 * node2 + else: + out = node1 + + out = torch.bmm(out, self.lm_head_weight) + return out + + class AWQLinearModel(nn.Module): def __init__(self, is_int8=False): super().__init__() @@ -554,7 +581,9 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Modul return DifferentChannelSizeMatmulModel(channel_sizes=channel_sizes) @staticmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): + if is_3d_weights: + return AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers) return AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers) @staticmethod diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index e7788ec365d..cd1317f0a85 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -39,6 +39,7 @@ from tests.torch2.function_hook.quantization.test_weights_compression import SUPPORTED_MODES from tests.torch2.function_hook.quantization.test_weights_compression import UNSUPPORTED_MODES from tests.torch2.function_hook.quantization.test_weights_compression import AWQActLinearModel +from tests.torch2.function_hook.quantization.test_weights_compression import AWQActLinearModel3D from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel from tests.torch2.function_hook.quantization.test_weights_compression import AWQLinearModel3D from tests.torch2.function_hook.quantization.test_weights_compression import ConvolutionModel @@ -382,9 +383,11 @@ def get_different_channel_size_model(channel_sizes: list[int]) -> torch.nn.Modul return exported_model @staticmethod - def get_awq_act_model(with_multiply, n_layers): + def get_awq_act_model(is_3d_weights, with_multiply, n_layers): model = AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers) - ex_input = torch.ones([1, 8, 8], dtype=torch.float32) + if is_3d_weights: + model = AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers) + ex_input = torch.ones([2, 8, 8], dtype=torch.float32) exported_model = get_torch_fx_model(model, ex_input) return exported_model From d44f6d5686f446072d4287e8a1da16904b1da5a8 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 19:38:30 +0400 Subject: [PATCH 18/30] fix data shape for OV model --- .../native/quantization/test_weights_compression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py 
b/tests/openvino/native/quantization/test_weights_compression.py index dd9f90d4bc7..17cd67579ba 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -930,7 +930,7 @@ def test_call_max_var_criterion_with_dataset_by_default(mocker, mode): @pytest.mark.parametrize("mode", INT4_MODES) def test_call_max_var_criterion_with_dataset_by_default_awq(mode): model = AWQMatmulModel().ov_model - dataset = Dataset([np.ones([1, 8, 8])]) + dataset = Dataset([np.ones([2, 8, 8])]) compress_weights(model, mode=mode, ratio=1.0, group_size=2, dataset=dataset, awq=True) @@ -938,7 +938,7 @@ def test_call_max_var_criterion_with_dataset_by_default_awq(mode): @pytest.mark.parametrize("mode", INT4_NF4_MODES) def test_call_max_var_criterion_with_dataset_awq_for_compressed_model(mode): model = AWQMatmulModel(is_int8=True).ov_model - dataset = Dataset([np.ones([1, 8, 8])]) + dataset = Dataset([np.ones([2, 8, 8])]) compress_weights(model, mode=mode, ratio=1.0, group_size=2, dataset=dataset, awq=True) @@ -946,7 +946,7 @@ def test_call_max_var_criterion_with_dataset_awq_for_compressed_model(mode): @pytest.mark.parametrize("mode", INT4_NF4_MODES) def test_call_max_var_criterion_with_dataset_awq_neg_group_size(mode): model = AWQMatmulModel().ov_model - dataset = Dataset([np.ones([1, 8, 8])]) + dataset = Dataset([np.ones([2, 8, 8])]) compress_weights(model, mode=mode, ratio=1.0, group_size=-1, dataset=dataset, awq=True) From 090f0f509fdc405e6e57e254cee5685c57a342e3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 19:49:27 +0400 Subject: [PATCH 19/30] fix awq data free --- src/nncf/quantization/algorithms/weight_compression/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index 8a4fcbd64c7..54dcdccde1c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -179,7 +179,7 @@ def apply( nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") if is_data_free: - scale = self._data_free_step(weight, 1 - wp.reduction_axes[0]) + scale = self._data_free_step(weight, -(len(weight.shape) - wp.reduction_axes[0])) else: prev_weight, prev_statistics = None, None if is_mergeable: From 7222304f756a172f3577746c78eca0d6ee796ac8 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Dec 2025 20:09:41 +0400 Subject: [PATCH 20/30] add check for opset version when weights are 3D --- .../weight_compression/onnx_backend.py | 4 ++++ .../quantization/test_weights_compression.py | 22 +++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 735ba9a2a3e..c99a259f275 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -259,6 +259,10 @@ def transform_model( # For opsets earlier than 21, we use the `MatMulNBits` operation from ONNX Runtime contrib operators. # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md if opset_version < 21 and dequantize_block_size > 0: + if len(weight.shape) == 3: + msg = """ONNX does not support 3D weights for opset version < 21. 
From 7222304f756a172f3577746c78eca0d6ee796ac8 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 20:09:41 +0400
Subject: [PATCH 20/30] add check for opset version when weights are 3D

---
 .../weight_compression/onnx_backend.py | 4 ++++
 .../quantization/test_weights_compression.py | 22 +++++++++++++------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
index 735ba9a2a3e..c99a259f275 100644
--- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
+++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
@@ -259,6 +259,10 @@ def transform_model(
         # For opsets earlier than 21, we use the `MatMulNBits` operation from ONNX Runtime contrib operators.
         # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md
         if opset_version < 21 and dequantize_block_size > 0:
+            if len(weight.shape) == 3:
+                msg = """ONNX does not support 3D weights for opset version < 21.
+                Please use a higher opset version or per-channel quantization"""
+                raise nncf.ParameterNotSupportedError(msg)
             compressed_weight, scale, zero_point = self._preprocess_compressed_weight(
                 compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True
             )
diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py
index bf3e4c3b539..34c7faf39e9 100644
--- a/tests/onnx/quantization/test_weights_compression.py
+++ b/tests/onnx/quantization/test_weights_compression.py
@@ -659,7 +659,7 @@ def get_decompressed_weight(compressed_model: onnx.ModelProto, input: np.ndarray
         return Tensor(outputs["W_0_dequantized"])
 
     @staticmethod
-    def get_awq_act_model(with_multiply: bool, n_layers: int) -> onnx.ModelProto:
+    def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -> onnx.ModelProto:
         """
         Builds a model to be used in the following tests:
         - TemplateWeightCompression.test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul()
@@ -668,7 +668,12 @@ def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -
         """
         mb = ModelBuilder()
 
-        data = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05
+        weight_shape = (8, 8)
+        if is_3d_weights:
+            # The first and last dimension are later transposed
+            weight_shape = (8, 8, 2)
+
+        data = 0.01 * np.arange(0, math.prod(weight_shape)).reshape(weight_shape) + 0.05
         data = data.astype(np.float32).T
 
         x = mb.add_input("input", (1, 8, 8))
         output = mb.add_output("output", (1, 8, 8))
 
-        x = mb.add_matmul(x, shape=(8, 8), data=data)
+        x = mb.add_matmul(x, shape=data.shape, data=data)
         for _ in range(n_layers):
-            a = mb.add_matmul(x, shape=(8, 8), data=data)
+            a = mb.add_matmul(x, shape=data.shape, data=data)
             a = mb.add_relu(a)
             if with_multiply:
-                b = mb.add_matmul(x, shape=(8, 8), data=data)
+                b = mb.add_matmul(x, shape=data.shape, data=data)
                 b = mb.add_selu(b)
                 x = mb.add_mul(a, b)
             else:
                 x = a
-        mb.add_matmul(x, shape=(8, 8), output=output, data=data)
+        mb.add_matmul(x, shape=data.shape, output=output, data=data)
 
-        return mb.build()
+        return mb.build(opset_version=21)
 
     @staticmethod
     def get_num_multiply_from_awq(model: onnx.ModelProto) -> int:
@@ -708,9 +713,12 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto:
         mb = ModelBuilder()
 
         weight_shape = (8, 8)
+        opset_version = 13
         if is_3d_weights:
             # The first and last dimension are later transposed
             weight_shape = (8, 8, 2)
+            # 3D weights do not work due to missing support in MatMulNBits, which is used for opset_version < 21
+            opset_version = 21
 
         x = mb.add_input("input", (2, None, 8))
         output = mb.add_output("output", (2, None, 8))
@@ -725,7 +733,7 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto:
             x = mb.add_mul(a, b)
             x = mb.add_matmul(x, shape=w_data.shape, output=output if i == num_blocks - 1 else None, data=w_data)
 
-        return mb.build()
+        return mb.build(opset_version=opset_version)
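The guard in [PATCH 20/30] hinges on the model's declared opset. A minimal sketch of how such a version can be read from an ONNX model (illustrative only; the backend's actual lookup may differ):

    import onnx

    def get_default_opset(model: onnx.ModelProto) -> int:
        # The default ("" or "ai.onnx") domain entry carries the opset compared against 21.
        return next(e.version for e in model.opset_import if e.domain in ("", "ai.onnx"))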
From 211c806e15413bb01c8ef1fbb6b0c7f28675ffc1 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 20:56:24 +0400
Subject: [PATCH 21/30] xfail openvino case

---
 .../native/quantization/test_weights_compression.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 17cd67579ba..1327c5619d9 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -2289,6 +2289,14 @@ def test_awq_with_ignored_scope(self, mocker, is_3d_weights):
     def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights):
         return super().test_awq_scale_reference(monkeypatch, mocker, is_3d_weights)
 
+    @pytest.mark.parametrize(
+        "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]
+    )
+    @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)])
+    @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False])
+    def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
+        return super().test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
+
     @staticmethod
     def get_orig_weight(model: ov.Model) -> Tensor:
         for op in model.get_ordered_ops():

From 0d9651617bf3ce06d98ec15377f6d76185020e13 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:01:05 +0400
Subject: [PATCH 22/30] fix test

---
 tests/onnx/quantization/test_weights_compression.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py
index 34c7faf39e9..be361d30397 100644
--- a/tests/onnx/quantization/test_weights_compression.py
+++ b/tests/onnx/quantization/test_weights_compression.py
@@ -676,10 +676,10 @@ def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -
         data = 0.01 * np.arange(0, math.prod(weight_shape)).reshape(weight_shape) + 0.05
         data = data.astype(np.float32).T
 
-        x = mb.add_input("input", (1, 8, 8))
-        output = mb.add_output("output", (1, 8, 8))
+        x = mb.add_input("input", (2, 8, 8))
+        output = mb.add_output("output", (2, 8, 8))
 
-        x = mb.add_matmul(x, shape=(8, 8), data=data)
+        x = mb.add_matmul(x, shape=data.shape, data=data)
         for _ in range(n_layers):
             a = mb.add_matmul(x, shape=data.shape, data=data)
             a = mb.add_relu(a)

From 0b1019cec83dfb2d92ba390c9993a6d9e7a19da7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:14:07 +0400
Subject: [PATCH 23/30] remove extra comments

---
 .../quantization/algorithms/weight_compression/awq.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py
index 54dcdccde1c..47f96226572 100644
--- a/src/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/src/nncf/quantization/algorithms/weight_compression/awq.py
@@ -240,12 +240,10 @@ def _data_aware_step(self, wp, weight, statistics, prev_weight=None, prev_statis
         reduction_axis = wp.reduction_axes[0]
 
         if is_2d_weight:
-            s = fns.unsqueeze(s, 0)  # [hidden_dim] -> [1, hidden_dim]
-            X = fns.unsqueeze(X, 0)  # [hidden_dim, samples] -> [1, hidden_dim, samples]
-            weight = fns.unsqueeze(weight, 0)  # [out_features, hidden_dim] -> [1, out_features, hidden_dim]
-            prev_weight = (
-                fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None
-            )  # [out_features, hidden_dim] -> [1, out_features, hidden_dim]
+            s = fns.unsqueeze(s, 0)
+            X = fns.unsqueeze(X, 0)
+            weight = fns.unsqueeze(weight, 0)
+            prev_weight = fns.unsqueeze(prev_weight, 0) if prev_weight is not None else None
             reduction_axis += 1
 
         prev_s, prev_w = None, None

From 80c1ec2b54b04d83194c29e00811106b6b593 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:19:15 +0400
Subject: [PATCH 24/30] fix

---
 tests/openvino/native/models.py | 8 ++++----
 .../native/quantization/test_weights_compression.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py
index 28405cb04ae..755d6f6ccdc 100644
--- a/tests/openvino/native/models.py
+++ b/tests/openvino/native/models.py
@@ -1004,7 +1004,7 @@ def get_weights(weights_data, is_int8, name):
         return (qw - zp) * scale
 
     def _create_ov_model(self, n_extra_dims: int = 1, is_int8=False):
-        input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1")
+        input_node = opset.parameter([-1] * n_extra_dims + [-1, 8], name="Input_1")
 
         weights_data1 = 0.01 * np.arange(0, 64).reshape(8, 8) + 0.05
         weights1 = self.get_weights(weights_data1, is_int8, name="weights_1")
@@ -1047,7 +1047,7 @@ class AWQMatmulModel3D(OVReferenceModel):
     """
 
     def _create_ov_model(self, n_extra_dims=1, is_int8=False):
-        input_node = opset.parameter([2] * n_extra_dims + [-1, 8], name="Input_1")
+        input_node = opset.parameter([-1] * n_extra_dims + [-1, 8], name="Input_1")
 
         def make_weights(name):
             w = 0.01 * np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) + 0.05
@@ -1087,7 +1087,7 @@ class AWQActMatmulModel(OVReferenceModel):
     """
 
     def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8):
-        input_node = opset.parameter([2, 8, 8], name="Input_1")
+        input_node = opset.parameter([-1, 8, 8], name="Input_1")
         weights_data = np.arange(0, 64).reshape(8, 8) - 32
         weights = AWQMatmulModel.get_weights(weights_data, is_int8, name="weights_emb")
         out_node = opset.matmul(input_node, weights, transpose_a=False, transpose_b=True, name="MatMul_emb")
@@ -1131,7 +1131,7 @@ class AWQActMatmulModel3D(OVReferenceModel):
     """
 
     def _create_ov_model(self, is_int8=False, with_multiply=False, n_layers=8):
-        input_node = opset.parameter([2, 8, 8], name="Input_1")
+        input_node = opset.parameter([-1, 8, 8], name="Input_1")
 
         def make_weights(name: str):
             w = np.arange(0, 2 * 8 * 8).reshape(2, 8, 8) - 32
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 1327c5619d9..ddb5e0fbfa0 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -2295,7 +2295,7 @@ def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights):
     @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)])
     @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False])
     def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
-        return super().test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
+        return super().test_data_free_awq(dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
 
     @staticmethod
     def get_orig_weight(model: ov.Model) -> Tensor:
@@ -2316,7 +2316,7 @@ def get_decompressed_weight(compressed_model: ov.Model, input: np.ndarray) -> Te
         return Tensor(weight_output)
 
     @staticmethod
-    def get_ignored_scope_name() -> str:
+    def get_ignored_scope_name(is_3d_weights) -> str:
         return "MatMul_5"
 
     @staticmethod

From 1c76e5fdb03971cbf282d550481ddc1ccd9be2d7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:20:48 +0400
Subject: [PATCH 25/30] fix dynamic shapes

---
 tests/torch2/fx/test_compress_weights.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py
index cd1317f0a85..6297a50e036 100644
--- a/tests/torch2/fx/test_compress_weights.py
+++ b/tests/torch2/fx/test_compress_weights.py
@@ -370,7 +370,7 @@ def get_awq_model(is_3d_weights) -> torch.fx.GraphModule:
         model = AWQLinearModel()
         if is_3d_weights:
             model = AWQLinearModel3D()
-        dynamic_shapes = [[None, torch.export.Dim("dynamic_shape"), None]]
+        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 4, 8], dtype=torch.float32)
         exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model
@@ -387,8 +387,9 @@ def get_awq_act_model(is_3d_weights, with_multiply, n_layers):
         model = AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers)
         if is_3d_weights:
             model = AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers)
+        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 8, 8], dtype=torch.float32)
-        exported_model = get_torch_fx_model(model, ex_input)
+        exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model

From b51d878b68375b04d83194c29e00811106b6b593 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 21:52:06 +0400
Subject: [PATCH 26/30] add xfail for last test

---
 .../native/quantization/test_weights_compression.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index ddb5e0fbfa0..8824b898daf 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -2297,6 +2297,17 @@ def test_awq_scale_reference(self, monkeypatch, mocker, is_3d_weights):
     def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker):
         return super().test_data_free_awq(dataset, prefer_data_aware_scaling, is_3d_weights, mocker)
 
+    @pytest.mark.parametrize(
+        "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]
+    )
+    @pytest.mark.parametrize("with_multiply", (True, False))
+    def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(
+        self, int4_mode, with_multiply, is_3d_weights, mocker
+    ):
+        return super().test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul(
+            int4_mode, with_multiply, is_3d_weights, mocker
+        )
+
     @staticmethod
     def get_orig_weight(model: ov.Model) -> Tensor:
         for op in model.get_ordered_ops():

From db8417bc0832f835864d581b38ba9e31d430 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 22:06:27 +0400
Subject: [PATCH 27/30] check dynamic dimensions correctly

---
 tests/torch2/fx/test_compress_weights.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py
index 6297a50e036..c228201e92e 100644
--- a/tests/torch2/fx/test_compress_weights.py
+++ b/tests/torch2/fx/test_compress_weights.py
@@ -370,7 +370,7 @@ def get_awq_model(is_3d_weights) -> torch.fx.GraphModule:
         model = AWQLinearModel()
         if is_3d_weights:
             model = AWQLinearModel3D()
-        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
+        dynamic_shapes = [[torch.export.Dim.AUTO, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 4, 8], dtype=torch.float32)
         exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model
@@ -387,7 +387,7 @@ def get_awq_act_model(is_3d_weights, with_multiply, n_layers):
         model = AWQActLinearModel(with_multiply=with_multiply, n_layers=n_layers)
         if is_3d_weights:
             model = AWQActLinearModel3D(with_multiply=with_multiply, n_layers=n_layers)
-        dynamic_shapes = [[torch.export.Dim.DYNAMIC, torch.export.Dim.DYNAMIC, None]]
+        dynamic_shapes = [[torch.export.Dim.AUTO, torch.export.Dim.DYNAMIC, None]]
         ex_input = torch.ones([2, 8, 8], dtype=torch.float32)
        exported_model = get_torch_fx_model(model, ex_input, dynamic_shapes=dynamic_shapes)
         return exported_model
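Review note on [PATCH 25/30] and [PATCH 27/30]: torch.bmm pins the input's batch dimension to the weight's batch size, so the exporter specializes that dimension to a constant. torch.export.Dim.DYNAMIC raises on such specialization, while Dim.AUTO falls back to a static dimension, which is why dim 0 was switched to AUTO. A standalone sketch of the distinction (illustrative only, assuming a recent torch.export API):

    import torch

    class Tiny(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.w = torch.nn.Parameter(torch.ones(2, 8, 8))

        def forward(self, x):
            return torch.bmm(x, self.w)  # forces x.shape[0] == 2

    ex_input = torch.ones([2, 8, 8])
    # Dim.DYNAMIC on dim 0 would raise a constraint violation here; Dim.AUTO does not.
    dynamic_shapes = [[torch.export.Dim.AUTO, torch.export.Dim.DYNAMIC, None]]
    ep = torch.export.export(Tiny(), (ex_input,), dynamic_shapes=dynamic_shapes)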
From 9173dfb32e22943c97131f9c0e6bf8c115138381 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Dec 2025 22:25:45 +0400
Subject: [PATCH 28/30] fix onnx backend formatting of weights

---
 .../algorithms/weight_compression/onnx_backend.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
index c99a259f275..61d70821d27 100644
--- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
+++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py
@@ -109,15 +109,16 @@ def _preprocess_compressed_weight(
         scale = compressed_weight.scale
         zero_point = compressed_weight.zero_point
 
-        axis = 1 if dequantize_block_size else None
+        # For 3D weights, we need to squeeze at the next dimension compared to 2D because of the batch dim
+        axis = 1 + len(scale.shape) % 3 if dequantize_block_size else None
         scale = scale.squeeze(axis=axis)
         if zero_point is not None:
             zero_point = zero_point.squeeze(axis=axis)
 
         if apply_transpose:
-            scale = fns.transpose(scale)
+            scale = fns.moveaxis(scale, -1, -2)
             if zero_point is not None:
-                zero_point = fns.transpose(zero_point)
+                zero_point = fns.moveaxis(zero_point, -1, -2)
 
         if zero_point is not None:
             zero_point = zero_point.astype(tensor.dtype)

From 7a9a12825a12ded244f570d42a1d138f536e17a6 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 9 Dec 2025 19:42:46 +0400
Subject: [PATCH 29/30] fix reduction axes

---
 .../algorithms/weight_compression/awq.py | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py
index 47f96226572..ac9637b8a98 100644
--- a/src/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/src/nncf/quantization/algorithms/weight_compression/awq.py
@@ -178,8 +178,16 @@ def apply(
         nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}")
 
+        weight_dim = len(weight.shape)
         if is_data_free:
-            scale = self._data_free_step(weight, -(len(weight.shape) - wp.reduction_axes[0]))
+            # The channel axis is derived as `constant - reduction_axes`, where the constant
+            # for an n-D weight is the (n-1)th odd number: 2(n-1)-1 = 2n-3.
+            # Examples:
+            # 2D -> 1 - reduction_axes (reduction_axes=1 -> 1-1=0; reduction_axes=0 -> 1-0=1)
+            # 3D -> 3 - reduction_axes (reduction_axes=2 -> 3-2=1; reduction_axes=1 -> 3-1=2)
+            # 4D -> 5 - reduction_axes (reduction_axes=3 -> 5-3=2; reduction_axes=2 -> 5-2=3)
+            scale = self._data_free_step(weight, (weight_dim * 2) - 3 - wp.reduction_axes[0])
         else:
             prev_weight, prev_statistics = None, None
             if is_mergeable:
@@ -191,16 +199,7 @@ def apply(
             prev_statistics = statistics[merge_node.node_name]
 
         scale = self._data_aware_step(wp, weight, statistics[k], prev_weight, prev_statistics)
 
-        # For 3D weights, len(scale.shape)%2 == 0 whereas for 2D it is 1. This allows us to index
-        # from the last dim and not consider the batch dim in 3D case.
-        # Example:
-        # 3D weights: W shape = [B, M, N]; reduction_axes = 2
-        # scale_shape = [M, N] -> len(scale.shape) = 2 -> 2 % 2 = 0
-        # unsqueeze scale at -(0 + 2) = -2.
-        # 2D weights: W shape = [M, N]; reduction_axes = 1
-        # scale_shape = [M] -> len(scale.shape) = 1 -> 1 % 2 = 1
-        # unsqueeze scale at -(1 + 1) = -2.
-        w_scale = fns.unsqueeze(scale, -(len(scale.shape) % 2 + wp.reduction_axes[0]))
+        w_scale = fns.unsqueeze(scale, (weight_dim * 2) - 3 - wp.reduction_axes[0])
         a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0])
         scaled_weight = (weight * w_scale).astype(weight_dtype)
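A standalone self-check of the (2n - 3) constant introduced in [PATCH 29/30], assuming the reduction axis is one of the two trailing (non-batch) axes (illustrative only, not part of the patch):

    # (2 * ndim - 3) - reduction_axis yields the remaining trailing (channel) axis.
    for ndim, reduction_axis, expected in [
        (2, 1, 0), (2, 0, 1),  # 2D: constant 1
        (3, 2, 1), (3, 1, 2),  # 3D: constant 3
        (4, 3, 2), (4, 2, 3),  # 4D: constant 5
    ]:
        assert (2 * ndim - 3) - reduction_axis == expected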
From e605827d3f9222511c77959c1ea53dac8eb39485 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 9 Dec 2025 19:44:10 +0400
Subject: [PATCH 30/30] fix onnx test

---
 tests/onnx/quantization/test_weights_compression.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py
index be361d30397..4463abb9bcc 100644
--- a/tests/onnx/quantization/test_weights_compression.py
+++ b/tests/onnx/quantization/test_weights_compression.py
@@ -9,9 +9,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from collections import defaultdict
 from dataclasses import dataclass
+from functools import reduce
+from operator import mul
 from typing import Any, Callable, Optional
 
 import numpy as np
@@ -673,7 +674,7 @@ def get_awq_act_model(is_3d_weights: bool, with_multiply: bool, n_layers: int) -
             # The first and last dimension are later transposed
             weight_shape = (8, 8, 2)
 
-        data = 0.01 * np.arange(0, math.prod(weight_shape)).reshape(weight_shape) + 0.05
+        data = 0.01 * np.arange(0, reduce(mul, weight_shape, 1)).reshape(weight_shape) + 0.05
         data = data.astype(np.float32).T
 
         x = mb.add_input("input", (2, 8, 8))
@@ -723,7 +724,7 @@ def get_awq_model(is_3d_weights) -> onnx.ModelProto:
         x = mb.add_input("input", (2, None, 8))
         output = mb.add_output("output", (2, None, 8))
 
-        w_data = 0.01 * np.arange(0, math.prod(weight_shape), dtype=np.float32).reshape(weight_shape) + 0.05
+        w_data = 0.01 * np.arange(0, reduce(mul, weight_shape, 1), dtype=np.float32).reshape(weight_shape) + 0.05
         w_data = w_data.T
         num_blocks = 2
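The math.prod -> reduce(mul, ..., 1) swap in [PATCH 30/30] is behavior-preserving, presumably to drop the reliance on math.prod (Python 3.8+). A quick equivalence check (illustrative only):

    import math
    from functools import reduce
    from operator import mul

    for shape in [(8, 8), (8, 8, 2), ()]:
        assert reduce(mul, shape, 1) == math.prod(shape)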