From 488cacc2be70b7ae7e417c555d2aeea29163f5b6 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Mon, 10 Jun 2024 19:17:08 +0400 Subject: [PATCH 01/68] Support scale estimation inside GPTQ --- .../algorithms/layerwise/scheduler.py | 34 +- .../weight_compression/activation_stats.py | 7 +- .../weight_compression/algorithm.py | 59 ++-- .../algorithms/weight_compression/gptq.py | 41 ++- .../weight_compression/scale_estimation.py | 316 ++++++++++-------- nncf/quantization/quantize_model.py | 5 - .../openvino/native/quantization/test_gptq.py | 5 +- .../quantization/test_weights_compression.py | 5 +- 8 files changed, 271 insertions(+), 201 deletions(-) diff --git a/nncf/quantization/algorithms/layerwise/scheduler.py b/nncf/quantization/algorithms/layerwise/scheduler.py index 8eee99fad28..8abc03400c0 100644 --- a/nncf/quantization/algorithms/layerwise/scheduler.py +++ b/nncf/quantization/algorithms/layerwise/scheduler.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass from dataclasses import field @@ -177,26 +178,31 @@ def schedule( old_input_nodes = set() new_input_nodes = set() for p in paths: - target_output_nodes = set() + target_outputs = [] additional_output_nodes = set() for output_node in p.output_nodes: - if output_node in target_nodes: - target_output_nodes.add(output_node) - elif output_node in p.input_nodes: - reuse_input_nodes.add(output_node) - else: - # filter additional output nodes - for prev_node in inference_graph.get_previous_nodes(output_node): - if prev_node not in p.output_nodes: - additional_output_nodes.add(output_node) - break - if not target_output_nodes: + try: + target_node_index = target_nodes.index(output_node) + target_outputs.append((target_node_index, output_node)) + except ValueError: + if output_node in p.input_nodes: + reuse_input_nodes.add(output_node) + else: + # filter additional output nodes + for prev_node in inference_graph.get_previous_nodes(output_node): + if prev_node not in p.output_nodes: + additional_output_nodes.add(output_node) + break + if not target_outputs: continue + target_outputs.sort(key=lambda target_output: target_output[0]) + target_output_nodes = [output[1] for output in target_outputs] + old_input_nodes |= p.input_nodes - new_input_nodes |= target_output_nodes | additional_output_nodes + new_input_nodes |= set(target_output_nodes) | additional_output_nodes subgraph_inputs = list(p.inputs) - step_target_nodes = {} + step_target_nodes = OrderedDict() subgraph_outputs = [] for node in target_output_nodes: target_edge = {} diff --git a/nncf/quantization/algorithms/weight_compression/activation_stats.py b/nncf/quantization/algorithms/weight_compression/activation_stats.py index eb8286e6383..359887e7769 100644 --- a/nncf/quantization/algorithms/weight_compression/activation_stats.py +++ b/nncf/quantization/algorithms/weight_compression/activation_stats.py @@ -9,14 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Tuple, TypeVar +from typing import List, Tuple +from nncf.tensor import Tensor from nncf.tensor import functions as fns -TTensor = TypeVar("TTensor") - -def process_stats(stats: List[TTensor], subset_size: int) -> Tuple[TTensor, TTensor]: +def process_stats(stats: List[Tensor], subset_size: int) -> Tuple[Tensor, Tensor]: """ It's a processing of activations shared between AWQ, Scale Estimation and LoRA Correction algorithms. diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 3499521bce3..1b2af0fd9a3 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -124,7 +124,12 @@ def __init__( if self._gptq: gptq_params = self._advanced_parameters.gptq_params - self._gptq_algo = GPTQ(gptq_params.damp_percent, gptq_params.block_size, gptq_params.subset_size) + self._gptq_algo = GPTQ( + damp_percent=gptq_params.damp_percent, + block_size=gptq_params.block_size, + subset_size=gptq_params.subset_size, + scale_estimation=self._scale_estimation, + ) self._gptq_statistics = None @property @@ -379,25 +384,8 @@ def apply( scales = {} zero_points = {} - if ( - self._scale_estimation - and activations is not None - and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] - ): - scale_estimation_params = self._advanced_parameters.scale_estimation_params - scale_algo = ScaleEstimation( - model, - self._backend_entity.name_to_node_mapping, - all_weight_params, - nodes_to_compress, - activations, - scale_estimation_params.subset_size, - scale_estimation_params.initial_steps, - scale_estimation_params.scale_steps, - scale_estimation_params.weight_penalty, - ) - scales = scale_algo.apply(model, graph) - + lora_correction_algo = None + description = "Applying Weight Compression" if self._gptq: model, scales, zero_points = self._gptq_algo.apply( model=model, @@ -407,13 +395,30 @@ def apply( statistic_points=self._gptq_statistics, backend_entity=self._backend_entity, ) + else: + if ( + self._scale_estimation + and activations is not None + and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + ): + scale_estimation_params = self._advanced_parameters.scale_estimation_params + scale_algo = ScaleEstimation( + model, + self._backend_entity.name_to_node_mapping, + all_weight_params, + nodes_to_compress, + activations, + scale_estimation_params.subset_size, + scale_estimation_params.initial_steps, + scale_estimation_params.scale_steps, + scale_estimation_params.weight_penalty, + ) + scales = scale_algo.apply(model, graph) - lora_correction_algo = None - description = "Applying Weight Compression" - if self._lora_correction: - lora_correction_params = self._advanced_parameters.lora_correction_params - lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) - description += " with correction of low-rank adapters" + if self._lora_correction: + lora_correction_params = self._advanced_parameters.lora_correction_params + lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) + description += " with correction of low-rank adapters" # Sort weight params to start compression with the bigger constants. This lowers peak memory footprint. 
all_weight_params = sorted(all_weight_params, key=lambda wp: wp.num_weights, reverse=True) @@ -542,7 +547,7 @@ def _get_activations( statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset) statistics_aggregator.register_statistic_points(statistic_container) - if self._gptq: + if self._gptq and not self._awq: self._gptq_statistics = self._gptq_algo.get_statistic_points( model, graph, nodes_to_compress, self._backend_entity ) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b595e080533..b1101916da3 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -25,6 +25,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight @@ -44,10 +45,7 @@ class GPTQ: """ def __init__( - self, - damp_percent: float = 0.1, - block_size: int = 128, - subset_size: int = 128, + self, damp_percent: float = 0.1, block_size: int = 128, subset_size: int = 128, scale_estimation: bool = False ): """ :param damp_percent: The percent of the average Hessian diagonal to use for dampening, @@ -58,6 +56,7 @@ def __init__( self._damp_percent = damp_percent self._block_size = block_size self._subset_size = subset_size + self._scale_estimation = scale_estimation self._backend = None self._backend_entity = None @@ -124,10 +123,9 @@ def apply( CompressWeightsMode.INT8_SYM, ]: continue - assert len(inputs) == 1 _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) - scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian) + scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) scales[wc_params.weight_name] = scale zero_points[wc_params.weight_name] = zero_point @@ -193,7 +191,12 @@ def _calculate_hessian(self, node: NNCFNode, inputs: List[Tensor]) -> Tensor: return hessian def _quantize_weights( - self, model: TModel, graph: NNCFGraph, wc_params: WeightCompressionParameters, hessian: Tensor + self, + model: TModel, + graph: NNCFGraph, + wc_params: WeightCompressionParameters, + hessian: Tensor, + inputs: List[Tensor], ): """ Quantizes the weights of the model based on the calculated Hessian matrix. 
@@ -260,11 +263,25 @@ def _quantize_weights( scale = calculate_nf4_scale(weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes) scales.append(scale) else: - scale, zero_point = calculate_integer_quantization_params( - weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config - ) - scales.append(scale) - zero_points.append(zero_point) + if self._scale_estimation and block_compression_config.num_bits == 4: + activations = [inp.squeeze()[:, (i1 + i) : (i1 + i + group_size)] for inp in inputs] + scale, zero_point = ScaleEstimation.calculate_quantization_params( + self._backend_entity, + activations, + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + wc_params.compression_config, + ) + scales.append(scale.squeeze(axis=1)) + zero_points.append(zero_point) + else: + scale, zero_point = calculate_integer_quantization_params( + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + block_compression_config, + ) + scales.append(scale) + zero_points.append(zero_point) if block_compression_config.mode == CompressWeightsMode.NF4: compressed_weights = do_nf4_quantization( fns.unsqueeze(weight_col, 1), scales[-1], is_normalized_weight=False diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6d1110c108f..712c5fd955d 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,16 +20,17 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns TModel = TypeVar("TModel") -TTensor = TypeVar("TTensor") -TWeightType = TypeVar("TWeightType") class ScaleEstimation: @@ -37,13 +38,15 @@ class ScaleEstimation: Scale estimation algorithm implementation. """ + compress_decompress_cache = {} + def __init__( self, model: TModel, name_to_node_mapping: Dict[str, Any], all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode], - activations: Optional[Dict[str, TTensor]] = None, + activations: Optional[Dict[str, List[Tensor]]] = None, subset_size: int = 32, initial_steps: int = 5, scale_steps: int = 10, @@ -103,7 +106,7 @@ def apply( graph: NNCFGraph, statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, - ) -> Dict[str, TTensor]: + ) -> Dict[str, Tensor]: """ Estimates better scale for the int4 nodes in the model. Minimizes per-group difference between floating point MatMul and @@ -118,8 +121,7 @@ def apply( :return: Dict with pairs (weight name, estimated scale). 
""" - compress_decompress_cache = {} - res = dict() + scales = dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -127,11 +129,10 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in self._activations: - res[weight_name] = None + scales[weight_name] = None continue - s, X = process_stats(self._activations[node_name], self._subset_size) - reduction_axis = wp.reduction_axes[0] + stats = self._activations[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm @@ -139,162 +140,211 @@ def apply( _, weight_port_id = weight_data[0] weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - weight = weight.astype(TensorDataType.float32) - eps = fns.finfo(weight).eps - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 + scales[weight_name], _ = self.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size + return scales - original_weight = fns.zeros_like(weight) + weight + @staticmethod + def calculate_quantization_params( + backend_entity: WeightCompressionAlgoBackend, + activations: List[Tensor], + weight: Tensor, + reduction_axes: Tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + """ + Calculates the quantization parameters for a given set of weights and activations. + This function estimates the optimal quantization scale for weight compression by + minimizing the difference between floating-point operations and operations with + quantized weights. + + The function uses an iterative process: + 1. Initial scale rectification based on activation statistics. + 2. A grid search to further refine the scale parameters. + + :param backend_entity: The backend-specific implementation of the weight compression algorithm. + :param activations: List of activation tensors corresponding to the layers being quantized. + :param weight: The weight tensor that is being quantized. + :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. + :param config: Configuration parameters for the weight compression, including quantization settings. + :param subset_size: The number of samples to use for scale estimation. Defaults to 32. + :param initial_steps: The number of steps for initial scale rectification using activation statistics. + Defaults to 5. + :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. + :param weight_penalty: Penalty coefficient applied to the difference between floating-point + and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. + :return: A tensor containing the calculated quantization scales and zero points if applicable. 
+ """ + reduction_axis = reduction_axes[0] - compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) - if zp is not None: - zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) + s, X = process_stats(activations, subset_size) - s = fns.unsqueeze(s, 0) - s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) + weight = weight.astype(TensorDataType.float32) + eps = fns.finfo(weight).eps - original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 - # all weight in group has importance based on corresponding input activations - importance = fns.ones_like(original_weight) - importance = importance * s + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - importance = fns.where(zero_mask, 0.0, importance) - - # normalize importances for every group of weights to make sum of them equal to 1.0 - denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) - - X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) - best_diffs = None - result_scale = None - - fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) - q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - - # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE - min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - min_max_scale_diffs += self._weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - - zp_shape = zp.shape if zp is not None else None - key = [(wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape] - if zp is not None: - key += zp_shape - key = tuple(key) - if key in compress_decompress_cache: - compress_decompress_model = compress_decompress_cache[key]["compress_decompress_model"] - compress_model = compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = self._backend_entity.get_compress_decompress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = self._backend_entity.get_compress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } - - scale_sign = scale / fns.abs(scale) - zero_scale = 0.001 - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + original_weight = fns.zeros_like(weight) + weight - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) - # iterative rectification of initial scale - for i in range(self._initial_steps): - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data + compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) + if zp is not None: + zp = 
zp.astype(scale.dtype) + q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + s = fns.unsqueeze(s, 0) + s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) - if best_diffs is None: - best_diffs = min_max_scale_diffs + # all weight in group has importance based on corresponding input activations + importance = fns.ones_like(original_weight) + importance = importance * s - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + importance = fns.where(zero_mask, 0.0, importance) - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + # normalize importances for every group of weights to make sum of them equal to 1.0 + denum = fns.sum(importance, axis=2, keepdims=True) + importance = importance / (denum + eps) - mask = fns.unsqueeze(mask, axis=2) + X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) + best_diffs = None + result_scale = None - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data + fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) + q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - if i < self._initial_steps - 1: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE + min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - # iterative rectification of scale based on grid search - for scale_steps in range(self._scale_steps): - factor = 1.0 - 0.05 * scale_steps - scaled_scale = factor * scale + zp_shape = zp.shape if zp is not None else None + key = (config.mode, config.num_bits) + q_weights.shape + scale.shape + if zp is not None: + key += zp_shape + if key in ScaleEstimation.compress_decompress_cache: + compress_decompress_model = ScaleEstimation.compress_decompress_cache[key]["compress_decompress_model"] + compress_model = ScaleEstimation.compress_decompress_cache[key]["compress_model"] + else: + compress_decompress_model = backend_entity.get_compress_decompress_pipeline( + config, q_weights.shape, scale.shape, zp_shape + ) + compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) + ScaleEstimation.compress_decompress_cache[key] = { + 
"compress_decompress_model": compress_decompress_model, + "compress_model": compress_model, + } + scale_sign = scale / fns.abs(scale) + zero_scale = 0.001 + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) + # iterative rectification of initial scale + for i in range(initial_steps): + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + input_tensors[1] = near_to_ideal_scale.data + + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + if best_diffs is None: + best_diffs = min_max_scale_diffs + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data - input_tensors[1] = scaled_scale.data + if i < initial_steps - 1: out = compress_model(input_tensors) compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out + # iterative rectification of scale based on grid search + for scale_steps in range(scale_steps): + factor = 1.0 - 0.05 * scale_steps + scaled_scale = factor * scale + + input_tensors[1] = scaled_scale.data + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + + input_tensors[1] = near_to_ideal_scale.data + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) 
- best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - mask = fns.unsqueeze(mask, axis=2) + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale - if config.group_size == -1: - result_scale = fns.squeeze(result_scale, axis=1) - res[weight_name] = result_scale + if config.group_size == -1: + result_scale = fns.squeeze(result_scale, axis=1) - return res + return result_scale, zp -def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = None) -> Tuple[TTensor, TTensor]: +def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: """ Computes the target values and a mask indicating zero values in the target. @@ -310,7 +360,7 @@ def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = No return target, zero_mask -def estimate_scales(weight: TTensor, target: TTensor, zero_mask: TTensor, importance: TTensor) -> TTensor: +def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: """ Estimates scales for the given weight, target, zero mask, and importance. diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index e96c4526c51..60baeacc48e 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -482,11 +482,6 @@ def compress_weights( if any((gptq, lora_correction)) and (dataset is None or mode == CompressWeightsMode.E2M1): raise AttributeError("GPTQ or Lora Correction algorithm is defined, but dataset is None or mode is E2M1.") - if gptq and scale_estimation: - raise AttributeError( - "Simultaneous use of Scale estimation and GPTQ algorithms is not supported. Select one of them." - ) - if gptq and lora_correction: raise AttributeError( "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them." 
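With the gptq/scale_estimation exclusivity check removed above, both algorithms can now be requested together; scale estimation is then performed per weight group inside GPTQ instead of as a separate pass. A minimal usage sketch (nncf's public compress_weights API as referenced in this patch; model and calibration_dataset are placeholders):

    import nncf

    compressed_model = nncf.compress_weights(
        model,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        dataset=calibration_dataset,  # calibration data is required for GPTQ
        gptq=True,
        scale_estimation=True,  # previously rejected in combination with gptq=True
    )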
diff --git a/tests/openvino/native/quantization/test_gptq.py b/tests/openvino/native/quantization/test_gptq.py index 1202b216ec7..ad19990eac0 100644 --- a/tests/openvino/native/quantization/test_gptq.py +++ b/tests/openvino/native/quantization/test_gptq.py @@ -341,7 +341,8 @@ def test_calculate_scale_linear(): gptq._set_backend_entity(ov_model) nodes = graph.get_all_nodes() - H = gptq._calculate_hessian(nodes[1], [Tensor(inp) for inp in inputs]) + wrapped_inputs = [Tensor(inp) for inp in inputs] + H = gptq._calculate_hessian(nodes[1], wrapped_inputs) ref_H = ref_gptq.H.numpy() assert np.all(np.isclose(ref_H, H.data)) @@ -351,7 +352,7 @@ def test_calculate_scale_linear(): ) wc_params.compression_config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_SYM, group_size=16) - scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H) + scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H, wrapped_inputs) ref_scale = ref_scale.numpy() scale = scale.reshape(ref_scale.shape) assert np.all(np.isclose(ref_scale, scale.data)) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index bb9b5c373c7..c51cf667ca2 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -713,10 +713,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize("mode", INT4_MODES) @pytest.mark.parametrize( "params", - ( - {"dataset": "anything", "scale_estimation": True, "gptq": True}, - {"dataset": "anything", "lora_correction": True, "gptq": True}, - ), + ({"dataset": "anything", "lora_correction": True, "gptq": True},), ) def test_raise_error_with_unsupported_params_for_int4(mode, params): with pytest.raises(AttributeError): From ee648777dcb951f4c7bdadd3997680a5083645a7 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Wed, 4 Sep 2024 13:25:22 +0400 Subject: [PATCH 02/68] fix for INT4_ASYM --- nncf/quantization/algorithms/weight_compression/gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b1101916da3..bd6518c86ad 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -273,7 +273,7 @@ def _quantize_weights( wc_params.compression_config, ) scales.append(scale.squeeze(axis=1)) - zero_points.append(zero_point) + zero_points.append(zero_point if zero_point is None else zero_point.squeeze(axis=1)) else: scale, zero_point = calculate_integer_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], From 2fc8f9cd1f93b43ccb65e31a4795cd93761bbeba Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 9 May 2025 18:48:42 +0200 Subject: [PATCH 03/68] Draft. 
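This draft replaces the separate precomputed_scales and precomputed_zero_points dictionaries with a single mapping from weight name to CompressedWeight: GPTQ and ScaleEstimation now return dict[str, CompressedWeight], the dataclass itself moves from weight_lowering.py to common.py, and transform_model/compress_weight consume the combined object. A rough sketch of the intended flow, using only the signatures visible in the hunks below (the new common.py module is not part of this diff, so its exact field list is assumed from the call sites):

    # weight name -> CompressedWeight(tensor, scale, zero_point, ...)
    compressed_weights = scale_estimation_algo.apply(model, graph, all_weight_params, statistics)
    backend_entity.transform_model(
        model,
        graph,
        all_weight_params,
        compressed_weights=compressed_weights,
    )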
--- .../weight_compression/algorithm.py | 10 ++++---- .../algorithms/weight_compression/backend.py | 4 ++-- .../algorithms/weight_compression/gptq.py | 11 ++++----- .../weight_compression/onnx_backend.py | 7 +++--- .../weight_compression/openvino_backend.py | 4 ++-- .../weight_compression/scale_estimation.py | 12 ++++++---- .../weight_compression/weight_lowering.py | 23 ++++--------------- 7 files changed, 27 insertions(+), 44 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 9ac54e144b9..05ffe54b725 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -649,13 +649,12 @@ def apply( # del is used to prematurely mark non-necessary data as free for garbage collection del self.awq_algo - scales = {} - zero_points = {} + compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" if self._gptq: del statistics - model, scales, zero_points = self._gptq_algo.apply( + model, compressed_weights = self._gptq_algo.apply( model=model, graph=graph, dataset=dataset, @@ -664,7 +663,7 @@ def apply( ) else: if self._scale_estimation: - scales, zero_points = self._scale_estimation_algo.apply( + compressed_weights = self._scale_estimation_algo.apply( model=model, graph=graph, all_weight_params=all_weight_params, @@ -687,8 +686,7 @@ def apply( model, graph, track(all_weight_params, description=description, weights=all_weight_sizes), - scales, - zero_points, + compressed_weights, lora_correction_algo, self._compression_format, self._advanced_parameters, diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 62d0745a0f4..2d928ff2908 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -26,6 +26,7 @@ from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic from nncf.parameters import CompressionFormat from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.tensor import Tensor @@ -148,8 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 76674fd9288..bcca525bd75 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -23,6 +23,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine from nncf.quantization.algorithms.weight_compression.backend import 
WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation @@ -85,7 +86,7 @@ def apply( weight_compression_parameters: list[WeightCompressionParameters], statistic_points: Optional[StatisticPointsContainer] = None, backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> tuple[TModel, dict[str, Tensor], dict[str, Tensor]]: + ) -> tuple[TModel, dict[str, CompressedWeight]]: """ Applies the GPTQ algorithm to quantize the weights of the given model. @@ -101,8 +102,7 @@ def apply( if self._backend_entity is None: self._set_backend_entity(model) - scales = {} - zero_points = {} + res = {} target_nodes = [] target_nodes_wc_params_map = {} @@ -125,10 +125,9 @@ def apply( _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) - scales[wc_params.weight_name] = scale - zero_points[wc_params.weight_name] = zero_point + res[wc_params.weight_name] = CompressedWeight(None, scale, zero_point, None) - return model, scales, zero_points + return model, res def get_statistic_points( self, diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index a962cf163bc..bef2160aa7e 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -43,6 +43,7 @@ from nncf.parameters import CompressionFormat from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -191,8 +192,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -206,8 +206,7 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization compressed_weight, scale, zero_point = self._preprocess_compressed_weight_shapes( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py 
b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 7c1838eb8d2..5df95ff40aa 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -53,6 +53,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error @@ -282,8 +283,7 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: LoraCorrectionAlgorithm = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 4aea4633ebb..3ee49a2bd83 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -21,6 +21,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error @@ -99,7 +100,7 @@ def apply( all_weight_params: list[WeightCompressionParameters], statistics: dict[str, WCTensorStatistic], backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> tuple[dict[str, Tensor], dict[str, Tensor]]: + ) -> dict[str, CompressedWeight]: """ Estimates better scale for the int4 nodes in the model. 
Minimizes per-group difference between floating point MatMul and @@ -119,7 +120,7 @@ def apply( self._backend_entity = backend_entity if self._backend_entity is None: self._set_backend_entity(model) - scales, zero_points = dict(), dict() + res = dict() invalid_node_names = [] first_caught_error = None @@ -129,7 +130,7 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in statistics: - scales[weight_name] = None + res[weight_name] = CompressedWeight() continue stats = statistics[node_name] @@ -142,7 +143,7 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) try: - scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( + scale, zero_point = self.calculate_quantization_params( stats, weight, wp.reduction_axes, @@ -152,6 +153,7 @@ def apply( self._scale_steps, self._weight_penalty, ) + res[weight_name] = CompressedWeight(None, scale, zero_point, None) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) @@ -159,7 +161,7 @@ def apply( if first_caught_error: handle_invalid_group_size_error(first_caught_error, invalid_node_names) - return scales, zero_points + return res @staticmethod def calculate_quantization_params( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index f72a05193b9..0ce1c316746 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -9,7 +9,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from dataclasses import dataclass from typing import Optional, Union import numpy as np @@ -19,6 +18,7 @@ from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor @@ -72,22 +72,6 @@ ) -@dataclass -class CompressedWeight: - """ - Compressed weight and decompression parameters. - - :param tensor: The tensor with compressed weight. - :param scale: The decompression scale, in practice it is dequantization scale for the INT quantization. - :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 - in the non-compression realm. Applicable for INT quantization. - """ - - tensor: Tensor - scale: Tensor - zero_point: Optional[Tensor] = None - - def reshape_weight_for_grouped_quantization( weight: Tensor, reduction_axes: ReductionAxes, group_size: int ) -> tuple[Tensor, int]: @@ -386,8 +370,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, + compressed_weight: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight using compression configuration. @@ -399,6 +382,8 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. 
:return: The compressed weight and decompression parameters as instance of CompressedWeight """ + + precomputed_scale, precomputed_zero_point = compressed_weight.scale, compressed_weight.zero_point if compressed_weight else (None, None) if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() From 7c6795e00d4ba2d3f55c8060f4cd8c1e160bb43d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 9 May 2025 19:46:36 +0200 Subject: [PATCH 04/68] Draft. --- .../algorithms/weight_compression/onnx_backend.py | 4 ++-- .../weight_compression/openvino_backend.py | 14 ++++---------- .../algorithms/weight_compression/torch_backend.py | 8 +++----- .../weight_compression/torch_fx_backend.py | 7 +++---- .../weight_compression/weight_lowering.py | 6 ++++-- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index bef2160aa7e..6d3633284cc 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -43,8 +43,8 @@ from nncf.parameters import CompressionFormat from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight @@ -206,7 +206,7 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization compressed_weight, scale, zero_point = self._preprocess_compressed_weight_shapes( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5df95ff40aa..c85c7ea8b1f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -218,8 +218,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - layer_scales: Optional[Tensor] = None, - layer_zero_points: Optional[Tensor] = None, + compressed_weight: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -245,8 +244,7 @@ def _create_compression_subgraph( weight, reduction_axes, compression_config, - layer_scales, - layer_zero_points, + compressed_weight, ) compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name @@ -308,10 +306,7 @@ def transform_model( should_add_convert_node = True break - layer_scales = None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name) - layer_zero_points = ( - None if 
precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) - ) + compressed_weight = None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -321,8 +316,7 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - layer_scales=layer_scales, - layer_zero_points=layer_zero_points, + compressed_weight=compressed_weight, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 50f765c35c3..79869c49d46 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -45,10 +45,10 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType @@ -432,8 +432,7 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: LoraCorrectionAlgorithm = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -470,8 +469,7 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 2650f16600c..80597096346 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -40,6 +40,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from 
nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -189,8 +190,7 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: LoraCorrectionAlgorithm = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -218,8 +218,7 @@ def transform_model( weight, wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0ce1c316746..045dbc418eb 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -382,8 +382,10 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - - precomputed_scale, precomputed_zero_point = compressed_weight.scale, compressed_weight.zero_point if compressed_weight else (None, None) + precomputed_scale, precomputed_zero_point = ( + compressed_weight.scale, + compressed_weight.zero_point if compressed_weight else (None, None), + ) if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() From 1dcdd7598e7b13a1bc94fac9eb649f004dfc5a75 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 12 May 2025 13:01:30 +0200 Subject: [PATCH 05/68] Draft for codebook. --- nncf/parameters.py | 1 + .../algorithms/weight_compression/codebook.py | 169 ++++++++++++++++++ .../weight_compression/weight_lowering.py | 28 +-- 3 files changed, 187 insertions(+), 11 deletions(-) create mode 100644 nncf/quantization/algorithms/weight_compression/codebook.py diff --git a/nncf/parameters.py b/nncf/parameters.py index 92b158fa9a6..6a6e6883ab4 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -94,6 +94,7 @@ class CompressWeightsMode(StrEnum): NF4 = "nf4" INT8 = "int8" # Deprecated mode E2M1 = "e2m1" + CODEBOOK = "codebook" @api(canonical_alias="nncf.CompressionFormat") diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py new file mode 100644 index 00000000000..a4ad22498cb --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -0,0 +1,169 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import Optional, TypeVar
+
+import nncf
+from nncf.common.graph.graph import NNCFGraph
+from nncf.common.logging.track_progress import track
+from nncf.common.utils.backend import BackendType
+from nncf.common.utils.backend import get_backend
+from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
+from nncf.quantization.algorithms.weight_compression.common import CompressedWeight
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_scale
+from nncf.tensor import Tensor
+from nncf.tensor import TensorDataType
+from nncf.tensor import functions as fns
+
+TModel = TypeVar("TModel")
+
+
+class Codebook:
+    """
+    Codebook estimation algorithm implementation.
+    """
+
+    def __init__(
+        self,
+        initial_codebook: Tensor,
+    ):
+        """
+        :param initial_codebook: codebook for compression.
+        """
+        super().__init__()
+        self._initial_codebook = initial_codebook.flatten()
+
+    @property
+    def available_backends(self) -> list[BackendType]:
+        return [BackendType.OPENVINO]
+
+    def _set_backend_entity(self, model: TModel) -> None:
+        """
+        Creates a helper class with a backend-specific logic of the algorithm.
+
+        :param model: Backend-specific input model.
+        """
+        model_backend = get_backend(model)
+        if model_backend == BackendType.OPENVINO:
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+
+            self._backend_entity = OVWeightCompressionAlgoBackend(model)
+        else:
+            msg = (
+                "Cannot return backend-specific Codebook entity because"
+                f" {model_backend.value} is not supported!"
+            )
+            raise nncf.UnsupportedBackendError(msg)
+
+    def apply(
+        self,
+        model: TModel,
+        graph: NNCFGraph,
+        all_weight_params: list[WeightCompressionParameters],
+        backend_entity: Optional[WeightCompressionAlgoBackend] = None,
+    ) -> dict[str, CompressedWeight]:
+        """
+        Applies codebook (look-up table) compression to the weights of the supported nodes.
+        Each weight is normalized by a per-group scale, and every normalized value is
+        replaced by the index of the nearest codebook entry, so the weight is stored as
+        a tensor of indexes together with the shared codebook and the scales required
+        for decompression.
+
+        :param model: Model for applying algorithm.
+        :param graph: Model graph.
+        :param all_weight_params: List of all weight parameters.
+        :param backend_entity: Weight compression algorithm backend.
+        :return: A dictionary mapping weight names to compressed weights (codebook indexes, scale and codebook).
+        """
+        self._backend_entity = backend_entity
+        if self._backend_entity is None:
+            self._set_backend_entity(model)
+
+        res = {}
+        invalid_node_names = []
+        first_caught_error = None
+        for wp in track(all_weight_params, description="Applying Codebook Compression"):
+            weight_name = wp.weight_name
+            config = wp.compression_config
+
+            weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph)
+            if len(weight_data) != 1:  # not supported by the algorithm
+                continue
+            _, weight_port_id = weight_data[0]
+
+            weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)
+
+            try:
+                indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config)
+                res[weight_name] = CompressedWeight(indexes, scale, None, codebook)
+            except nncf.InvalidGroupSizeError as error:
+                first_caught_error = error
+                invalid_node_names.append(wp.node_with_weight.node_name)
+
+        if first_caught_error:
+            handle_invalid_group_size_error(first_caught_error, invalid_node_names)
+
+        return res
+
+    def calculate_quantization_params(
+        self,
+        weight: Tensor,
+        reduction_axes: tuple[int, ...],
+        config: WeightCompressionConfig,
+    ) -> tuple[Tensor, Tensor, Tensor]:
+        """
+        Calculates the codebook quantization parameters for the given weight tensor.
+        The weight is normalized by a per-group scale derived from the weight values
+        and the codebook range, and every normalized value is then mapped to the
+        index of the closest codebook entry. The returned indexes, the shared
+        codebook and the per-group scales are sufficient to reconstruct an
+        approximation of the original weight.
+
+        The computation consists of two steps:
+        1. Per-group scale calculation and weight normalization.
+        2. Nearest-neighbor search of every normalized value in the codebook.
+
+        :param weight: The weight tensor that is being quantized.
+        :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization.
+        :param config: Configuration parameters for the weight compression, including quantization settings.
+        :return: A tuple of the codebook indexes, the per-group scales and the codebook itself.
+ """ + reduction_axis = reduction_axes[0] + + weight = weight.astype(TensorDataType.float32) + + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 + + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size + + max_val = fns.max(fns.abs(weight)) + norm_weight, scale = calculate_normalized_weight_and_scale( + weight, reduction_axis, cur_config.group_size, max_val=max_val + ) + + orig_shape = norm_weight.shape + + norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) + + dist = (norm_weight - fns.unsqueeze(self._initial_codebook, 0)) ** 2 + + indexes = fns.argmin(dist, axis=1)[0] + indexes = fns.reshape(indexes, orig_shape) + + return indexes, scale, self._initial_codebook diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 045dbc418eb..b209e3a93a7 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -104,7 +104,7 @@ def reshape_weight_for_grouped_quantization( return reshaped_weight, reduction_axes -def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes) -> Tensor: +def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val=1.0) -> Tensor: """ Calculates the scale for nf4 quantization. @@ -115,7 +115,7 @@ def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes) -> Tensor if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) + scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) / max_val # NOTE: adding machine epsilon to avoid division by zero eps = fns.finfo(weight).eps @@ -134,7 +134,7 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= :param to_e8m0: Defines convert scale to e8m0 or not. :return: Scale tensor of float32 type for e2m1 quantization. """ - scale = calculate_nf4_scale(weight, reduction_axes) / max_val + scale = calculate_nf4_scale(weight, reduction_axes, max_val) scale = fns.log2(scale) scale = fns.ceil(scale) @@ -219,12 +219,13 @@ def do_nf4_dequantization(nf4_weight: Tensor, scale: Tensor, reduction_axis: int return decompressed_weight -def calculate_normalized_weight_and_fp4_scale( +def calculate_normalized_weight_and_scale( weight: Tensor, reduction_axes: ReductionAxes, group_size: int = -1, precomputed_scale: Tensor = None, mode: CompressWeightsMode = CompressWeightsMode.NF4, + max_val=1.0, ) -> tuple[Tensor, Tensor]: """ Calculates scale for fp4 (nf4, e2m1) quantization and normalizes weights by the scale. @@ -235,9 +236,10 @@ def calculate_normalized_weight_and_fp4_scale( :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. :param precomputed_scale: Precomputed scale. + :parm max_val: Max value of compressed type for normalization. :return: Normalized weight tensor of float32 type and nf4 scale tensor of float32 type. 
""" - assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) @@ -245,10 +247,14 @@ def calculate_normalized_weight_and_fp4_scale( # weights are reshaped: [a1, r, a2] -> [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - if mode == CompressWeightsMode.NF4: - scale = calculate_nf4_scale(weight, reduction_axes) if precomputed_scale is None else precomputed_scale - if mode == CompressWeightsMode.E2M1: + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK]: + scale = calculate_nf4_scale(weight, reduction_axes, max_val) if precomputed_scale is None else precomputed_scale + elif mode == CompressWeightsMode.E2M1: scale = calculate_e2m1_scale(weight, reduction_axes) if precomputed_scale is None else precomputed_scale + else: + msg = f"Unsupported mode {mode} for weight compression." + raise ValueError(msg) + norm_weight = calculate_normalized_weight(weight, scale) return norm_weight, scale @@ -383,14 +389,14 @@ def compress_weight( :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( - compressed_weight.scale, - compressed_weight.zero_point if compressed_weight else (None, None), + (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) ) + if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() - compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( + compressed_weight, scale = calculate_normalized_weight_and_scale( weight, reduction_axes, config.group_size, precomputed_scale, config.mode ) return CompressedWeight(compressed_weight, scale) From b870d8d9b242df262afe0859f6b7e0778ed8652b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 13 May 2025 14:00:13 +0200 Subject: [PATCH 06/68] Compression for default codebook. --- .ci/cspell_dict.txt | 3 +- nncf/openvino/graph/node_utils.py | 23 +++++++++ nncf/quantization/advanced_parameters.py | 37 ++++++++++++++ .../weight_compression/algorithm.py | 15 ++++++ .../algorithms/weight_compression/codebook.py | 24 ++++++--- .../weight_compression/openvino_backend.py | 50 ++++++++++++------- .../weight_compression/scale_estimation.py | 4 +- 7 files changed, 129 insertions(+), 27 deletions(-) diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt index 8d7bf519804..2dd19aafa41 100644 --- a/.ci/cspell_dict.txt +++ b/.ci/cspell_dict.txt @@ -72,6 +72,7 @@ ckpt clusterization cmap cnode +codebook coeffs concr confs @@ -492,4 +493,4 @@ yolov yscale yujie yury -zfnet \ No newline at end of file +zfnet diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 32ed821b7d1..96f4958f959 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -685,3 +685,26 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = return opset.constant(x.data, name=name, shared_memory=True) const = opset.constant(x.data, dtype=dtype, name=name) return const + + +def create_ov_codebook_subgraph( + codebook: Tensor, indexes: Tensor, dtype: ov.Type, codebook_dtype: ov.Type, name: Optional[str] = None +) -> op.Constant: + """ + Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. 
+ :param codebook: Codebook tensor. + :param indexes: Indexes tensor. + :param dtype: Data type of the indexes. + :param codebook_dtype: Data type of the codebook. + :param name: Optional name of the constant. + :return: OpenVINO subgraph. + """ + cobebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + if codebook_dtype != ov.Type.f16: + cobebook_const = opset.convert(cobebook_const, destination_type=ov.Type.f16) + codebook_indexes = opset.constant(indexes.data, dtype=dtype) + if dtype == ov.Type.u4: + codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) + + const = opset.gather(cobebook_const, codebook_indexes, 0, name=name) + return const diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 10f18b34eae..a041c8da25c 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -20,6 +20,8 @@ from enum import Enum from typing import Any, Optional, Union +import openvino.runtime as ov + import nncf from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule from nncf.common.quantization.structs import QuantizationScheme as QuantizationMode @@ -359,6 +361,40 @@ class AdvancedLoraCorrectionParameters: use_int8_adapters: bool = True +@api() +@dataclass +class AdvancedCodebookParameters: + """ + Contains advanced parameters for codebook compression algorithm. + :param codebook: The codebook (LUT) for the weight compression. + Applicable for vector quantization. + :type codebook: list[Any] + :param dts_type: The type of the codebook. + """ + + codebook: list[Any] = field( + default_factory=lambda: [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ] + ) + dst_type: Any = ov.Type.f8e4m3 + + @api() @dataclass class AdvancedCompressionParameters: @@ -390,6 +426,7 @@ class AdvancedCompressionParameters: lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) lora_adapter_rank: int = 256 backend_params: dict[str, Any] = field(default_factory=dict) + codebook_params: AdvancedCodebookParameters = field(default_factory=AdvancedCodebookParameters) @api() diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 05ffe54b725..631be6f17e2 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -38,6 +38,7 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ +from nncf.quantization.algorithms.weight_compression.codebook import Codebook from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -288,6 +289,7 @@ def __init__( self._advanced_parameters = ( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) + self._codebook = mode == CompressWeightsMode.CODEBOOK primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) @@ -320,6 
+322,12 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) + if self._codebook: + codebook_params = self._advanced_parameters.codebook_params + self._codebook_algo = Codebook( + initial_codebook=codebook_params.codebook, + dst_type=codebook_params.dst_type, + ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -652,6 +660,13 @@ def apply( compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" + if self._codebook: + compressed_weights = self._codebook_algo.apply( + model=model, + graph=graph, + all_weight_params=all_weight_params, + backend_entity=self._backend_entity, + ) if self._gptq: del statistics model, compressed_weights = self._gptq_algo.apply( diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index a4ad22498cb..17f30a34cd7 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -10,13 +10,14 @@ # limitations under the License. from copy import deepcopy -from typing import Optional, TypeVar +from typing import Any, Optional, TypeVar import nncf from nncf.common.graph.graph import NNCFGraph from nncf.common.logging.track_progress import track from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend +from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig @@ -38,12 +39,14 @@ class Codebook: def __init__( self, initial_codebook: Tensor, + dst_type: Any, ): """ :param initial_codebook: codebook for compression. 
""" super().__init__() - self._initial_codebook = initial_codebook.flatten() + self._initial_codebook = initial_codebook + self._dst_type = dst_type @property def available_backends(self) -> list[BackendType]: @@ -95,7 +98,10 @@ def apply( invalid_node_names = [] first_caught_error = None for wp in track(all_weight_params, description="Applying Codebook Compression"): + if wp.compression_config.mode != CompressWeightsMode.CODEBOOK: + continue weight_name = wp.weight_name + print(weight_name) config = wp.compression_config weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) @@ -107,7 +113,7 @@ def apply( try: indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config) - res[weight_name] = CompressedWeight(indexes, scale, None, codebook) + res[weight_name] = CompressedWeight(indexes, scale, None, (codebook, self._dst_type)) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) @@ -144,6 +150,10 @@ def calculate_quantization_params( weight = weight.astype(TensorDataType.float32) + codebook = fns.tensor( + self._initial_codebook, backend=weight.backend, dtype=TensorDataType.float32, device=weight.device + ) + if reduction_axis == 0: weight = fns.transpose(weight) reduction_axis = 1 @@ -152,7 +162,7 @@ def calculate_quantization_params( cur_config = deepcopy(config) cur_config.group_size = group_size - max_val = fns.max(fns.abs(weight)) + max_val = fns.max(fns.abs(codebook)) norm_weight, scale = calculate_normalized_weight_and_scale( weight, reduction_axis, cur_config.group_size, max_val=max_val ) @@ -161,9 +171,9 @@ def calculate_quantization_params( norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - dist = (norm_weight - fns.unsqueeze(self._initial_codebook, 0)) ** 2 + dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 - indexes = fns.argmin(dist, axis=1)[0] + indexes = dist.data.argmin(-1) indexes = fns.reshape(indexes, orig_shape) - return indexes, scale, self._initial_codebook + return indexes, scale, codebook diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c85c7ea8b1f..f18dc045537 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -32,6 +32,7 @@ from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer from nncf.openvino.graph.node_utils import convert_op +from nncf.openvino.graph.node_utils import create_ov_codebook_subgraph from nncf.openvino.graph.node_utils import create_ov_const_from_tensor from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor @@ -234,31 +235,46 @@ def _create_compression_subgraph( compression_dtype = ov.Type.i8 elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 + elif compression_config.mode == CompressWeightsMode.CODEBOOK: + if compressed_weight is None or not compressed_weight.is_codebook(): + msg = "Codebook compression requires pre-computed codebook." + raise nncf.ValidationError(msg) + compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 4 else ov.Type.u4 else: msg = f"{compression_config.mode.value} is not supported." 
raise nncf.ParameterNotSupportedError(msg) original_shape = weight.shape - with disable_results_caching(OV_MODEL_CACHE): - compressed_weight = compress_weight( - weight, - reduction_axes, - compression_config, - compressed_weight, - ) - compressed_const = create_ov_const_from_tensor( - compressed_weight.tensor, compression_dtype, name=const_node_name - ) - converted_const = opset.convert(compressed_const, ov.Type.f16) - if compressed_weight.zero_point is not None: - zero_point_const = create_ov_const_from_tensor( - compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" + if compression_config.mode == CompressWeightsMode.CODEBOOK: + converted_const = create_ov_codebook_subgraph( + codebook=compressed_weight.codebook[0], + indexes=compressed_weight.tensor, + dtype=compression_dtype, + codebook_dtype=compressed_weight.codebook[1], + name=const_node_name, ) - zero_point_const = opset.convert(zero_point_const, ov.Type.f16) - converted_const = opset.subtract( - converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" + else: + with disable_results_caching(OV_MODEL_CACHE): + compressed_weight = compress_weight( + weight, + reduction_axes, + compression_config, + compressed_weight, + ) + compressed_const = create_ov_const_from_tensor( + compressed_weight.tensor, compression_dtype, name=const_node_name ) + converted_const = opset.convert(compressed_const, ov.Type.f16) + + if compressed_weight.zero_point is not None: + zero_point_const = create_ov_const_from_tensor( + compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" + ) + zero_point_const = opset.convert(zero_point_const, ov.Type.f16) + converted_const = opset.subtract( + converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" + ) scale_const = create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") scale_const = convert_op(scale_const, ov.Type.f16) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 3ee49a2bd83..5f05db2cfd8 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -25,7 +25,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization @@ -215,7 +215,7 @@ def calculate_quantization_params( original_weight = fns.zeros_like(weight) + weight if config.mode == CompressWeightsMode.NF4: - norm_weight, scale = calculate_normalized_weight_and_fp4_scale( + norm_weight, scale = calculate_normalized_weight_and_scale( original_weight, reduction_axis, cur_config.group_size ) compressed_weights = do_nf4_quantization(norm_weight, scale, 
is_normalized_weight=True) From ac26b8aec3d6f74a65669d36942cb9b1d7d089d0 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 13 May 2025 14:21:50 +0200 Subject: [PATCH 07/68] Reverted change in spell check. --- .ci/cspell_dict.txt | 1 - nncf/openvino/graph/node_utils.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt index 2dd19aafa41..74d2f7ca9ce 100644 --- a/.ci/cspell_dict.txt +++ b/.ci/cspell_dict.txt @@ -72,7 +72,6 @@ ckpt clusterization cmap cnode -codebook coeffs concr confs diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 96f4958f959..db6f344fc23 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,12 +699,12 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. """ - cobebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) if codebook_dtype != ov.Type.f16: - cobebook_const = opset.convert(cobebook_const, destination_type=ov.Type.f16) + codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype) if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) - const = opset.gather(cobebook_const, codebook_indexes, 0, name=name) + const = opset.gather(codebook_const, codebook_indexes, 0, name=name) return const From 16d7a9e5ea3487fda500d8989c015ab9af3b75f9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 13 May 2025 16:53:42 +0200 Subject: [PATCH 08/68] Fixed compression to 4bit for codebook indexes. --- nncf/openvino/graph/node_utils.py | 4 ++-- .../quantization/algorithms/weight_compression/algorithm.py | 4 ++-- nncf/quantization/algorithms/weight_compression/codebook.py | 5 +++-- .../algorithms/weight_compression/openvino_backend.py | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index db6f344fc23..1a4fcb06303 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -700,11 +700,11 @@ def create_ov_codebook_subgraph( :return: OpenVINO subgraph. 
""" codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) - if codebook_dtype != ov.Type.f16: - codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype) if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) const = opset.gather(codebook_const, codebook_indexes, 0, name=name) + if codebook_dtype != ov.Type.f16: + const = opset.convert(const, destination_type=ov.Type.f16) return const diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 631be6f17e2..30d0ae719ed 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -38,7 +38,7 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ -from nncf.quantization.algorithms.weight_compression.codebook import Codebook +from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -324,7 +324,7 @@ def __init__( ) if self._codebook: codebook_params = self._advanced_parameters.codebook_params - self._codebook_algo = Codebook( + self._codebook_algo = CodebookCompression( initial_codebook=codebook_params.codebook, dst_type=codebook_params.dst_type, ) diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index 17f30a34cd7..880062c04e6 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -19,6 +19,7 @@ from nncf.common.utils.backend import get_backend from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import Codebook from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters @@ -31,7 +32,7 @@ TModel = TypeVar("TModel") -class Codebook: +class CodebookCompression: """ Codebook estimation algorithm implementation. 
""" @@ -113,7 +114,7 @@ def apply( try: indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config) - res[weight_name] = CompressedWeight(indexes, scale, None, (codebook, self._dst_type)) + res[weight_name] = CompressedWeight(indexes, scale, None, Codebook(codebook, self._dst_type)) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index f18dc045537..f48bff5519b 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -239,7 +239,7 @@ def _create_compression_subgraph( if compressed_weight is None or not compressed_weight.is_codebook(): msg = "Codebook compression requires pre-computed codebook." raise nncf.ValidationError(msg) - compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 4 else ov.Type.u4 + compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 else: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) @@ -248,10 +248,10 @@ def _create_compression_subgraph( if compression_config.mode == CompressWeightsMode.CODEBOOK: converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook[0], + codebook=compressed_weight.codebook.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=compressed_weight.codebook[1], + codebook_dtype=compressed_weight.codebook.dst_type, name=const_node_name, ) else: From 87280cc4067c65a81106f3b6c22fa25311281c8a Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 15 May 2025 14:01:39 +0200 Subject: [PATCH 09/68] Added tests and example. --- .../openvino/smollm2_360m_codebook/main.py | 114 ++++++++++++++++++ nncf/parameters.py | 1 + nncf/quantization/quantize_model.py | 19 +-- .../quantization/test_weights_compression.py | 75 ++++++++++++ 4 files changed, 201 insertions(+), 8 deletions(-) create mode 100644 examples/llm_compression/openvino/smollm2_360m_codebook/main.py diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py new file mode 100644 index 00000000000..7a37c0e3d42 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import openvino as ov +from datasets import load_dataset +from optimum.intel.openvino import OVModelForCausalLM +from transformers import AutoTokenizer + +import nncf + + +def transform_fn(data, model, tokenizer): + tokenized_text = tokenizer(data["text"], return_tensors="np") + input_ids = tokenized_text["input_ids"] + attention_mask = tokenized_text["attention_mask"] + + inputs = {} + inputs["input_ids"] = input_ids + inputs["attention_mask"] = tokenized_text["attention_mask"] + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + + # The magic forms KV cache as model inputs + batch_size = input_ids.shape[0] + for input_name in model.key_value_input_names: + model_inputs = model.model.input(input_name) + shape = model_inputs.get_partial_shape() + shape[0] = batch_size + if shape[2].is_dynamic: + shape[2] = 0 + else: + shape[1] = 0 + inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) + + inputs["position_ids"] = position_ids + return inputs + + +def generate_answers(questions, model, tokenizer, max_new_tokens=50): + messages = [ + {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, + {"role": "user", "content": "What is the capital of Spain?"}, + {"role": "assistant", "content": "Madrid."}, + ] + answers_by_questions = {} + model.request = None + + for question in questions: + messages.append({"role": "user", "content": question}) + input_ids = tokenizer.apply_chat_template( + messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" + ).to(device=model.device) + input_len = len(input_ids[0]) + + output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] + answer = tokenizer.decode(output[input_len:], skip_special_tokens=True) + answers_by_questions[question] = answer + messages.append({"role": "assistant", "content": answer}) + + model.request = None + return answers_by_questions + + +def main(): + MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" + OUTPUT_DIR = "smollm2_360m_compressed_codebook" + + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + # Filtering to remove empty samples from the dataset + dataset = dataset.filter(lambda example: len(example["text"]) > 1) + + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + model = OVModelForCausalLM.from_pretrained( + MODEL_ID, + export=True, + load_in_8bit=False, + compile=False, + stateful=False, + ov_config={"INFERENCE_PRECISION_HINT": "f32"}, + ) + + questions = [ + "What is the capital of France?", + "What is the highest peak in the Alps?", + "What is the largest city in Canada?", + "What is the most visited city in Japan?", + ] + + answers_by_questions = generate_answers(questions, model, tokenizer) + print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + + model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + model.save_pretrained(OUTPUT_DIR) + tokenizer.save_pretrained(OUTPUT_DIR) + + model = OVModelForCausalLM.from_pretrained( + OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + ) + answers_by_questions = generate_answers(questions, model, tokenizer) + print(f"Optimized model outputs:\n{answers_by_questions}\n") + return answers_by_questions + + +if __name__ == "__main__": + main() diff --git a/nncf/parameters.py b/nncf/parameters.py index 6a6e6883ab4..f1bf44dcb91 100644 --- a/nncf/parameters.py +++ 
b/nncf/parameters.py @@ -85,6 +85,7 @@ class CompressWeightsMode(StrEnum): :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + :param CODEBOOK: Codebook (LUT) quantization format. """ INT8_SYM = "int8_sym" diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index c8921a07063..a22ea74dec8 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -516,8 +516,8 @@ def compress_weights( from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "Torch backend does not support NF4 and E2M1 modes for weight compression." + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -560,8 +560,8 @@ def compress_weights( compress_weights_impl as fx_compression_weights_impl, ) - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "Torch backend does not support NF4 and E2M1 modes for weight compression." + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { @@ -597,8 +597,11 @@ def compress_weights( msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) - if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: - msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is E2M1." + if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + ]: + msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: @@ -614,8 +617,8 @@ def compress_weights( elif backend == BackendType.ONNX: from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "ONNX backend does not support NF4 and E2M1 modes for weight compression." + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + msg = "ONNX backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." 
raise nncf.ParameterNotSupportedError(msg) options = { diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5935b2265b7..fd48852a19e 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -36,6 +36,7 @@ from nncf.parameters import BackupMode from nncf.parameters import CompressionFormat from nncf.quantization import compress_weights +from nncf.quantization.advanced_parameters import AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams @@ -696,6 +697,20 @@ def test_raise_error_with_unsupported_params_for_e2m1(algo): compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.E2M1, **{algo: True}) +@pytest.mark.parametrize( + "algo", + ( + "lora_correction", + "awq", + "scale_estimation", + "gptq", + ), +) +def test_raise_error_with_unsupported_params_for_codebook(algo): + with pytest.raises(nncf.ParameterNotSupportedError): + compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.CODEBOOK, **{algo: True}) + + @pytest.mark.parametrize("mode", INT4_NF4_MODES) @pytest.mark.parametrize( "algo", @@ -1023,6 +1038,66 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): assert ref_e8m0_nodes == names_e8m0 +@pytest.mark.parametrize( + ("mode", "all_layers", "ratio", "ref_ids"), + ( + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), + ), +) +def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): + model = SequentialMatmulModel().ov_model + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + ratio=ratio, + group_size=1, + all_layers=all_layers, + sensitivity_metric=mode, + ) + names_codebook = { + op.get_friendly_name() + for op in compressed_model.get_ordered_ops() + if op.get_element_type() == ov.Type.f8e4m3 and not op.get_friendly_name().startswith("Const") + } + ref_codebook_nodes = {f"weights_{i}" for i in ref_ids} + + assert ref_codebook_nodes == names_codebook + + +@pytest.mark.parametrize( + ("codebook", "dst_type", "n_layers"), + ( + ([i for i in range(-8, 8)], ov.Type.i4, 2 * 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 2 * 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 2 * 5), + ), +) +def test_codebook(codebook, dst_type, n_layers): + model = SequentialMatmulModel().ov_model + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + ratio=1.0, + group_size=1, + all_layers=True, + advanced_parameters=AdvancedCompressionParameters( + codebook_params=AdvancedCodebookParameters(codebook=codebook, dst_type=dst_type) + ), + ) + names_codebook = [ + op.get_friendly_name() for op in compressed_model.get_ordered_ops() if 
op.get_element_type() == dst_type + ] + + assert len(names_codebook) == n_layers + + @pytest.mark.parametrize( ("mode", "data"), ( From 4ab1470c68035ffedd64214fc5c2efdc93816459 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 15 May 2025 16:13:28 +0200 Subject: [PATCH 10/68] Added file with compression data structures. --- .../algorithms/weight_compression/common.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 nncf/quantization/algorithms/weight_compression/common.py diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py new file mode 100644 index 00000000000..ff1c737ff19 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Any +from nncf.tensor import Tensor + + +@dataclass +class Codebook: + """ + Codebook parameters for weight compression. + :param codebook: The initial codebook for compression. + :param dst_type: The destination type for the codebook. + """ + codebook: Optional[Tensor] = None + dst_type: Optional[Any] = None + + +@dataclass +class CompressedWeight: + """ + Compressed weight and decompression parameters. + + :param tensor: The tensor with compressed weight. + :param scale: The decompression scale, in practice it is dequantization scale for the quantization. + :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 + in the non-compression realm. Applicable for INT quantization. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization + """ + + tensor: Optional[Tensor] = None + scale: Optional[Tensor] = None + zero_point: Optional[Tensor] = None + codebook: Optional[Codebook] = None + + def is_codebook(self): + """ + Check if the compressed weight is a codebook. + + :return: True if the compressed weight is a codebook, False otherwise. + """ + return not (self.codebook is None or self.tensor is None or self.scale is None) From 6ccd252b343523f6d0db7e0efe12bbfdb5604755 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 15 May 2025 16:37:27 +0200 Subject: [PATCH 11/68] Removed debug information. 
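Besides dropping the debug print, the codebook path now exchanges its results through the `common.py` containers added in the previous patch. A minimal sketch of how they fit together (the tensors below are invented illustration values, not taken from the patches):

```python
import numpy as np

from nncf.quantization.algorithms.weight_compression.common import Codebook
from nncf.quantization.algorithms.weight_compression.common import CompressedWeight
from nncf.tensor import Tensor

codebook_values = Tensor(np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32))
indexes = Tensor(np.array([[0, 4], [2, 3]], dtype=np.uint8))   # positions in the codebook
scale = Tensor(np.array([[1.5], [0.25]], dtype=np.float32))    # per-group dequantization scale

compressed = CompressedWeight(
    tensor=indexes,      # codebook indexes instead of quantized values
    scale=scale,
    zero_point=None,     # not used for codebook compression
    codebook=Codebook(codebook=codebook_values, dst_type=None),
)
assert compressed.is_codebook()  # True only when tensor, scale and codebook are all set
```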
--- nncf/quantization/advanced_parameters.py | 2 +- nncf/quantization/algorithms/weight_compression/codebook.py | 1 - nncf/quantization/algorithms/weight_compression/common.py | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index a041c8da25c..91f95d60303 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -20,7 +20,7 @@ from enum import Enum from typing import Any, Optional, Union -import openvino.runtime as ov +import openvino as ov import nncf from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index 903bbbb47ba..b1dec46275a 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -102,7 +102,6 @@ def apply( if wp.compression_config.mode != CompressWeightsMode.CODEBOOK: continue weight_name = wp.weight_name - print(weight_name) config = wp.compression_config weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py index ff1c737ff19..a172899374f 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -10,7 +10,8 @@ # limitations under the License. from dataclasses import dataclass -from typing import Optional, Any +from typing import Any, Optional + from nncf.tensor import Tensor @@ -21,6 +22,7 @@ class Codebook: :param codebook: The initial codebook for compression. :param dst_type: The destination type for the codebook. """ + codebook: Optional[Tensor] = None dst_type: Optional[Any] = None From 22308e931b5db4a6b2acda4a5c274a913815b75e Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 10:32:21 +0200 Subject: [PATCH 12/68] Added custom codebook to example. --- .../openvino/smollm2_360m_codebook/README.md | 26 ++++++ .../openvino/smollm2_360m_codebook/main.py | 85 +++++++++++-------- .../smollm2_360m_codebook/requirements.txt | 4 + nncf/__init__.py | 1 + nncf/version.py | 2 +- 5 files changed, 80 insertions(+), 38 deletions(-) create mode 100644 examples/llm_compression/openvino/smollm2_360m_codebook/README.md create mode 100644 examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/README.md b/examples/llm_compression/openvino/smollm2_360m_codebook/README.md new file mode 100644 index 00000000000..c82045d6261 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/README.md @@ -0,0 +1,26 @@ +# Large Language Models FP8 Compression Example + +This example demonstrates how to apply codebook compression to [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes. 
+ +## Prerequisites + +To use this example: + +- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate` +- Install dependencies: + +```bash +pip install -U pip +pip install -r requirements.txt +pip install ../../../../ +``` + +## Run Example + +To run example: + +```bash +python main.py +``` + +It will automatically download the dataset and baseline model and save the resulting model. diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 7a37c0e3d42..67fbfccb26c 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -9,42 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import openvino as ov -from datasets import load_dataset from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer import nncf -def transform_fn(data, model, tokenizer): - tokenized_text = tokenizer(data["text"], return_tensors="np") - input_ids = tokenized_text["input_ids"] - attention_mask = tokenized_text["attention_mask"] - - inputs = {} - inputs["input_ids"] = input_ids - inputs["attention_mask"] = tokenized_text["attention_mask"] - position_ids = np.cumsum(attention_mask, axis=1) - 1 - position_ids[attention_mask == 0] = 1 - - # The magic forms KV cache as model inputs - batch_size = input_ids.shape[0] - for input_name in model.key_value_input_names: - model_inputs = model.model.input(input_name) - shape = model_inputs.get_partial_shape() - shape[0] = batch_size - if shape[2].is_dynamic: - shape[2] = 0 - else: - shape[1] = 0 - inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) - - inputs["position_ids"] = position_ids - return inputs - - def generate_answers(questions, model, tokenizer, max_new_tokens=50): messages = [ {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, @@ -70,14 +41,39 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions -def main(): - MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" - OUTPUT_DIR = "smollm2_360m_compressed_codebook" +def default_codebook_example(MODEL_ID, OUTPUT_DIR): + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + model = OVModelForCausalLM.from_pretrained( + MODEL_ID, + export=True, + load_in_8bit=False, + compile=False, + stateful=False, + ov_config={"INFERENCE_PRECISION_HINT": "f32"}, + ) - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - # Filtering to remove empty samples from the dataset - dataset = dataset.filter(lambda example: len(example["text"]) > 1) + questions = [ + "What is the capital of France?", + "What is the highest peak in the Alps?", + "What is the largest city in Canada?", + "What is the most visited city in Japan?", + ] + answers_by_questions = generate_answers(questions, model, tokenizer) + print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + + model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + model.save_pretrained(OUTPUT_DIR) + tokenizer.save_pretrained(OUTPUT_DIR) + + model = OVModelForCausalLM.from_pretrained( + OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + ) + answers_by_questions = generate_answers(questions, 
model, tokenizer) + print(f"Optimized model outputs:\n{answers_by_questions}\n") + + +def custom_codebook_example(MODEL_ID, OUTPUT_DIR): tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = OVModelForCausalLM.from_pretrained( MODEL_ID, @@ -98,7 +94,15 @@ def main(): answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + codebook_params = nncf.AdvancedCodebookParameters([-8, -4, -2, -1, 0, 1, 2, 4, 8], ov.Type.i8) + + model.model = nncf.compress_weights( + model.model, + mode=nncf.CompressWeightsMode.CODEBOOK, + ratio=1.0, + group_size=64, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + ) model.save_pretrained(OUTPUT_DIR) tokenizer.save_pretrained(OUTPUT_DIR) @@ -107,7 +111,14 @@ def main(): ) answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") - return answers_by_questions + + +def main(): + MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" + OUTPUT_DIR = "smollm2_360m_compressed_codebook" + + default_codebook_example(MODEL_ID, OUTPUT_DIR) + custom_codebook_example(MODEL_ID, OUTPUT_DIR + "_custom") if __name__ == "__main__": diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt new file mode 100644 index 00000000000..feab3bfd695 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt @@ -0,0 +1,4 @@ +openvino==2025.1 +optimum-intel[openvino]>=1.22.0 +transformers>=4.48.0 +onnx==1.17.0 diff --git a/nncf/__init__.py b/nncf/__init__.py index 77cd6fbb09a..a0f9a45183f 100644 --- a/nncf/__init__.py +++ b/nncf/__init__.py @@ -52,6 +52,7 @@ ) from nncf.quantization.advanced_parameters import AdvancedAWQParameters as AdvancedAWQParameters from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters +from nncf.quantization.advanced_parameters import AdvancedCodebookParameters as AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters diff --git a/nncf/version.py b/nncf/version.py index 3769834a0b7..cec4ea22fb5 100644 --- a/nncf/version.py +++ b/nncf/version.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.17.0" +__version__ = "2.17.0.dev0+6ccd252b3dirty" BKC_TORCH_SPEC = "==2.7.*" From fb259fc03beabe0e36c2c82d2ed21361c6c5dfee Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 12:30:38 +0200 Subject: [PATCH 13/68] Fixed bug with group_size=-1. 
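The fix leans on the lowering helpers treating `group_size=-1` as a single group spanning the whole reduced dimension, so the codebook pass no longer has to expand it manually. A rough numpy sketch of the grouping (shapes and values are invented; the real helpers additionally normalize by the codebook's maximum absolute value and add a machine epsilon to avoid zero scales):

```python
import numpy as np

w = np.arange(32, dtype=np.float32).reshape(4, 8) - 16.0  # toy [out_ch, in_ch] weight

group_size = 4  # -1 would mean "one group per row", i.e. group_size = w.shape[1]
gs = w.shape[1] if group_size == -1 else group_size

grouped = w.reshape(w.shape[0], w.shape[1] // gs, gs)  # [out_ch, in_ch // gs, gs]
scale = np.abs(grouped).max(axis=-1, keepdims=True)    # one max-abs scale per group
norm_weight = grouped / scale                          # normalized values, ready for codebook lookup
```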
--- .../quantization/algorithms/weight_compression/codebook.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index b1dec46275a..55dc9f1a583 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -9,7 +9,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from copy import deepcopy from typing import Any, Optional, TypeVar import nncf @@ -158,12 +157,8 @@ def calculate_quantization_params( weight = fns.transpose(weight) reduction_axis = 1 - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size - max_val = fns.max(fns.abs(codebook)) - norm_weight, scale = do_float_quantization(weight, cur_config, reduction_axis, max_val=max_val) + norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val) orig_shape = norm_weight.shape From 86acc8ee55f1d687d76073ea4340bfcfa3602361 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 12:57:00 +0200 Subject: [PATCH 14/68] Moved convert before gather. --- .../llm_compression/openvino/smollm2_360m_codebook/main.py | 2 +- nncf/openvino/graph/node_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 67fbfccb26c..b53bc5433d8 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -100,7 +100,7 @@ def custom_codebook_example(MODEL_ID, OUTPUT_DIR): model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, - group_size=64, + group_size=-1, advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), ) model.save_pretrained(OUTPUT_DIR) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 1a4fcb06303..75e5208ac43 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -700,11 +700,12 @@ def create_ov_codebook_subgraph( :return: OpenVINO subgraph. """ codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + if codebook_dtype != ov.Type.f16: + codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) + codebook_indexes = opset.constant(indexes.data, dtype=dtype) if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) const = opset.gather(codebook_const, codebook_indexes, 0, name=name) - if codebook_dtype != ov.Type.f16: - const = opset.convert(const, destination_type=ov.Type.f16) return const From b54606c856999037415b834e5405f43d3011bb21 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 13:05:33 +0200 Subject: [PATCH 15/68] Removed backend specific parameter from advanced parameters. 
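With `dst_type` now defaulting to `None`, the OpenVINO backend falls back to `f8e4m3` for the codebook constant. Numerically, the gather-based subgraph built by `create_ov_codebook_subgraph` plus the per-group scale multiplication reconstructs the weight as in this sketch (values are illustrative):

```python
import numpy as np

codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.float32)  # stored in dst_type, f8e4m3 by default
indexes = np.array([[0, 8], [4, 5]], dtype=np.uint8)  # stored as u4 for codebooks with at most 16 entries, u8 otherwise
scale = np.array([[0.02], [0.5]], dtype=np.float32)   # per-group scale constant

# Gather(codebook, indexes) followed by a multiply with the scale.
decompressed = codebook[indexes] * scale
```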
--- nncf/quantization/advanced_parameters.py | 4 +--- .../algorithms/weight_compression/openvino_backend.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 91f95d60303..ba2bcccfad6 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -20,8 +20,6 @@ from enum import Enum from typing import Any, Optional, Union -import openvino as ov - import nncf from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule from nncf.common.quantization.structs import QuantizationScheme as QuantizationMode @@ -392,7 +390,7 @@ class AdvancedCodebookParameters: 3.5, ] ) - dst_type: Any = ov.Type.f8e4m3 + dst_type: Any = None @api() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index f48bff5519b..109fb7fcbc1 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -251,7 +251,9 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=compressed_weight.codebook.dst_type, + codebook_dtype=compressed_weight.codebook.dst_type + if compressed_weight.codebook.dst_type + else ov.Type.f8e4m3, name=const_node_name, ) else: From 72b803e9a3957e41be2b8657c1021669b34dfc0f Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 13:26:55 +0200 Subject: [PATCH 16/68] Fixed tests. --- .../quantization/test_weights_compression.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index dab7125e1d6..54f1cb2124f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1041,14 +1041,14 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): @pytest.mark.parametrize( ("mode", "all_layers", "ratio", "ref_ids"), ( - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, 5), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, 3), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, 1), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, 0), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, 4), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, 3), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, 1), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, 0), ), ) def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): @@ -1064,28 +1064,28 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): names_codebook = { op.get_friendly_name() for op in 
compressed_model.get_ordered_ops() - if op.get_element_type() == ov.Type.f8e4m3 and not op.get_friendly_name().startswith("Const") + if op.get_element_type() == ov.Type.f8e4m3 and op.get_friendly_name().startswith("Const") } - ref_codebook_nodes = {f"weights_{i}" for i in ref_ids} - assert ref_codebook_nodes == names_codebook + assert ref_ids == len(names_codebook) @pytest.mark.parametrize( ("codebook", "dst_type", "n_layers"), ( - ([i for i in range(-8, 8)], ov.Type.i4, 2 * 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 2 * 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 2 * 5), + ([i for i in range(-8, 8)], ov.Type.i4, 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 5), ), ) -def test_codebook(codebook, dst_type, n_layers): +@pytest.mark.parametrize("group_size", (1, -1)) +def test_codebook(codebook, dst_type, n_layers, group_size): model = SequentialMatmulModel().ov_model compressed_model = compress_weights( model, mode=CompressWeightsMode.CODEBOOK, ratio=1.0, - group_size=1, + group_size=group_size, all_layers=True, advanced_parameters=AdvancedCompressionParameters( codebook_params=AdvancedCodebookParameters(codebook=codebook, dst_type=dst_type) From 79f34a78b41b0343993296964f8701b3c41703c1 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 20 May 2025 11:58:04 +0200 Subject: [PATCH 17/68] Fix for prevent Gather from low-precision types be recognized as input for graph. --- nncf/openvino/graph/metatypes/openvino_metatypes.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nncf/openvino/graph/metatypes/openvino_metatypes.py b/nncf/openvino/graph/metatypes/openvino_metatypes.py index c7726276e00..fe433739237 100644 --- a/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -817,7 +817,16 @@ def _is_embedding(node: ov.Node) -> bool: allowed_types_list = ["f16", "f32", "f64"] const_port_id = 0 input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: + input_type = input_tensor.get_element_type().get_type_name() + + try: + input_node = node.input(const_port_id).get_source_output().node + if input_node.get_type_info().name == "Convert": + input_type = input_node.input_value(0).get_element_type().get_type_name() + except AttributeError: + # Handle the case where input_node is not available + pass + if input_type in allowed_types_list: const_node = get_operation_const_op(node, const_port_id) if const_node is not None: return True From 93233815815c4a5e91908ecca91b0d13b8f81c1b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 21 May 2025 16:24:19 +0200 Subject: [PATCH 18/68] Extend test for codebook. 
--- .../quantization/test_weights_compression.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 54f1cb2124f..0a14c3e4cbb 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,6 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams +from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -1119,6 +1120,30 @@ def test_compressed_weighs_range(mode, data): assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data)) +@pytest.mark.parametrize( + ("data"), + ( + ([-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0]), + ([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]), + ([-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]), + ([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5]), + ), +) +def test_codebook_weighs_range(data): + data = np.array(data).astype(np.float32) + max_diff = 0.1 + w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK) + codebook_compression = CodebookCompression(initial_codebook=data, dst_type=None) + indexes, scale, codebook = codebook_compression.calculate_quantization_params(w, [-1], config) + uncompressed_data = codebook[indexes] * scale + + indexes = indexes.flatten() + target = np.arange(indexes.shape[0]) + assert np.allclose(indexes.data, target) + assert np.all(np.abs(uncompressed_data.data - data) <= max_diff) + + @pytest.mark.parametrize( ("config", "precompute_scale", "precompute_zero_point", "raises"), [ From 464c0974b4c144e16303d03127ffbd55ab804754 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 23 May 2025 13:47:55 +0200 Subject: [PATCH 19/68] Refactoring. --- nncf/openvino/graph/node_utils.py | 2 +- .../openvino/optimized_functions/functions.py | 2 +- .../weight_compression/algorithm.py | 26 ++++++------ .../algorithms/weight_compression/codebook.py | 37 +++++++++++++--- .../algorithms/weight_compression/config.py | 2 +- .../weight_compression/openvino_backend.py | 23 ++++++---- .../weight_compression/weight_lowering.py | 42 +++++++++++++++---- 7 files changed, 99 insertions(+), 35 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 75e5208ac43..0ba2ab1b970 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,7 +699,7 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. 
""" - codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) #create_ov_const_from_tensor(codebook, codebook_dtype)# if codebook_dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 2a11e4c3608..282a43f9d2b 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -151,7 +151,7 @@ def do_float_quantization( compressed_weight = model([weight, precomputed_scale])[0] scale = precomputed_scale - return compressed_weight, scale + return compressed_weight, scale, None def integer_quantize_dequantize_weight( diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index aa962be7f90..b5826678d3b 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -322,12 +322,12 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) - if self._codebook: - codebook_params = self._advanced_parameters.codebook_params - self._codebook_algo = CodebookCompression( - initial_codebook=codebook_params.codebook, - dst_type=codebook_params.dst_type, - ) + # if self._codebook: + # codebook_params = self._advanced_parameters.codebook_params + # self._codebook_algo = CodebookCompression( + # initial_codebook=codebook_params.codebook, + # dst_type=codebook_params.dst_type, + # ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -660,13 +660,13 @@ def apply( compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" - if self._codebook: - compressed_weights = self._codebook_algo.apply( - model=model, - graph=graph, - all_weight_params=all_weight_params, - backend_entity=self._backend_entity, - ) + # if self._codebook: + # compressed_weights = self._codebook_algo.apply( + # model=model, + # graph=graph, + # all_weight_params=all_weight_params, + # backend_entity=self._backend_entity, + # ) if self._gptq: del statistics model, compressed_weights = self._gptq_algo.apply( diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index 55dc9f1a583..bcb463e4d67 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -158,15 +158,42 @@ def calculate_quantization_params( reduction_axis = 1 max_val = fns.max(fns.abs(codebook)) - norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val) + if True: + norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - orig_shape = norm_weight.shape + orig_shape = norm_weight.shape - norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) + # norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 + # dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 + + # indexes = dist.data.argmin(-1) + else: + norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) + + orig_shape = norm_weight.shape + + norm_weight = 
fns.unsqueeze(norm_weight.flatten(), 1) + + dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 + + indexes = dist.data.argmin(-1) + + + # norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) + + # norm_weight_, scale_ = do_float_quantization(weight, config, reduction_axis, max_val=max_val) + + # orig_shape = norm_weight_.shape + + # norm_weight_ = fns.unsqueeze(norm_weight_.flatten(), 1) + + # dist = (norm_weight_ - fns.unsqueeze(codebook, 0)) ** 2 + + # indexes_ = dist.data.argmin(-1) + # import numpy as np + # print(np.count_nonzero(indexes_ != indexes.flatten())) - indexes = dist.data.argmin(-1) indexes = fns.reshape(indexes, orig_shape) return indexes, scale, codebook diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 63ed892c472..36879412d45 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -49,7 +49,7 @@ def is_integer(self): """ :return: True if compression type in integer, else False. """ - return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 109fb7fcbc1..432a7eeee32 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -220,6 +220,7 @@ def _create_compression_subgraph( const_dtype, should_add_convert_node: bool, compressed_weight: Optional[CompressedWeight] = None, + advanced_parameters: Optional[AdvancedCompressionParameters] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -236,10 +237,7 @@ def _create_compression_subgraph( elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 elif compression_config.mode == CompressWeightsMode.CODEBOOK: - if compressed_weight is None or not compressed_weight.is_codebook(): - msg = "Codebook compression requires pre-computed codebook." - raise nncf.ValidationError(msg) - compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 + compression_dtype = None #ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 else: msg = f"{compression_config.mode.value} is not supported." 
raise nncf.ParameterNotSupportedError(msg) @@ -247,12 +245,22 @@ def _create_compression_subgraph( original_shape = weight.shape if compression_config.mode == CompressWeightsMode.CODEBOOK: + codebook_params = advanced_parameters.codebook_params + if compressed_weight is None: + compressed_weight = CompressedWeight(codebook=codebook_params.codebook) + compressed_weight = compress_weight( + weight, + reduction_axes, + compression_config, + compressed_weight, + ) + converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook.codebook, + codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=compressed_weight.codebook.dst_type - if compressed_weight.codebook.dst_type + codebook_dtype=codebook_params.dst_type + if codebook_params.dst_type else ov.Type.f8e4m3, name=const_node_name, ) @@ -335,6 +343,7 @@ def transform_model( const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, compressed_weight=compressed_weight, + advanced_parameters=advanced_parameters ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 70affaa9745..a22d1e3f871 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -159,8 +159,9 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, + quantiles: Optional[Tensor] = None, max_val: float = 6.0, -) -> tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. @@ -172,7 +173,7 @@ def do_float_quantization( :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. :param max_val: Maximal value of destination type. - :return: Returns quantized (for codebook and e2m1 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized (for codebook and e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. 
""" assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] @@ -194,6 +195,8 @@ def do_float_quantization( scale = precomputed_scale if scale is None: + if quantiles is not None: + max_val = max(quantiles) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -202,10 +205,13 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) + elif config.mode == CompressWeightsMode.CODEBOOK and quantiles is not None: + compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=quantiles) + return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved compressed_weight = norm_weight - return compressed_weight, scale + return compressed_weight, scale, None def float_quantize_dequantize_weight( @@ -346,13 +352,16 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - precomputed_scale, precomputed_zero_point = ( - (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) + precomputed_scale, precomputed_zero_point, quantiles = ( + (compressed_weight.scale, compressed_weight.zero_point, compressed_weight.codebook) if compressed_weight else (None, None, None) ) if not config.is_integer: - compressed_weight, scale = do_float_quantization(weight, config, reduction_axes, precomputed_scale) - return CompressedWeight(compressed_weight, scale) + compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale, quantiles=quantiles) + if quantiles is not None and indexes is not None: + return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(quantiles), backend=compressed_weight.backend)) + else: + return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) @@ -524,6 +533,25 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: return quantized_weight +def _calculate_codebook_quantized_weight(norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None) -> Tensor: + """ + Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to "round" or "quantize" to the closest quant. + + :param norm_weight: Weight tensor to quantize already normalized to quantiles range. + :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. 
+ """ + assert quantiles is not None or center_of_quantiles is not None, "Either quantiles or center_of_quantiles should be provided" + + if center_of_quantiles is None: + quantiles = np.array(quantiles) + center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) + center_of_quantiles = fns.from_numpy(center_of_quantiles, backend=norm_weight.backend) + indexes = fns.searchsorted(center_of_quantiles, norm_weight) + quantiles = fns.from_numpy(quantiles, backend=indexes.backend) + quantized_weight = quantiles[indexes] + return quantized_weight, indexes + + def _calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: """ Normalizes the weight tensor using the provided scale. From b964c0c5f264037db67e1aa01098451f22e8368c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 23 May 2025 14:30:47 +0200 Subject: [PATCH 20/68] Delete codebook algo. --- .../weight_compression/algorithm.py | 16 +- .../algorithms/weight_compression/codebook.py | 199 ------------------ 2 files changed, 1 insertion(+), 214 deletions(-) delete mode 100644 nncf/quantization/algorithms/weight_compression/codebook.py diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index b5826678d3b..6cec4d1f6d0 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -38,7 +38,6 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ -from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -289,7 +288,6 @@ def __init__( self._advanced_parameters = ( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) - self._codebook = mode == CompressWeightsMode.CODEBOOK primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) @@ -322,12 +320,6 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) - # if self._codebook: - # codebook_params = self._advanced_parameters.codebook_params - # self._codebook_algo = CodebookCompression( - # initial_codebook=codebook_params.codebook, - # dst_type=codebook_params.dst_type, - # ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -660,13 +652,7 @@ def apply( compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" - # if self._codebook: - # compressed_weights = self._codebook_algo.apply( - # model=model, - # graph=graph, - # all_weight_params=all_weight_params, - # backend_entity=self._backend_entity, - # ) + if self._gptq: del statistics model, compressed_weights = self._gptq_algo.apply( diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py deleted file mode 100644 index bcb463e4d67..00000000000 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ /dev/null @@ -1,199 +0,0 @@ 
-# Copyright (c) 2025 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Optional, TypeVar - -import nncf -from nncf.common.graph.graph import NNCFGraph -from nncf.common.logging.track_progress import track -from nncf.common.utils.backend import BackendType -from nncf.common.utils.backend import get_backend -from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import Codebook -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization -from nncf.tensor import Tensor -from nncf.tensor import TensorDataType -from nncf.tensor import functions as fns - -TModel = TypeVar("TModel") - - -class CodebookCompression: - """ - Codebook estimation algorithm implementation. - """ - - def __init__( - self, - initial_codebook: Tensor, - dst_type: Any, - ): - """ - :param initial_codebook: codebook for compression. - """ - super().__init__() - self._initial_codebook = initial_codebook - self._dst_type = dst_type - - @property - def available_backends(self) -> list[BackendType]: - return [BackendType.OPENVINO] - - def _set_backend_entity(self, model: TModel) -> None: - """ - Creates a helper class with a backed-specific logic of the algorithm. - - :param model: Backend-specific input model. - """ - model_backend = get_backend(model) - if model_backend == BackendType.OPENVINO: - from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend - - self._backend_entity = OVWeightCompressionAlgoBackend(model) - else: - msg = ( - "Cannot return backend-specific Scale Estimation entity because" - f" {model_backend.value} is not supported!" - ) - raise nncf.UnsupportedBackendError(msg) - - def apply( - self, - model: TModel, - graph: NNCFGraph, - all_weight_params: list[WeightCompressionParameters], - backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> dict[str, CompressedWeight]: - """ - Estimates better scale for the int4 nodes in the model. - Minimizes per-group difference between floating point MatMul and - MatMul with compressed weights. - The algorithm computes weighted scale for the group of weights in MatMul, which - shared the same scale. - - :param model: Model for applying algorithm. - :param graph: Model graph. - :param all_weight_params: List of all weight parameters. - :param backend_entity: Weight compression algorithm backend. - :return: Two dictionaries for estimated scales and zero points for each weight name. 
- """ - self._backend_entity = backend_entity - if self._backend_entity is None: - self._set_backend_entity(model) - - res = {} - invalid_node_names = [] - first_caught_error = None - for wp in track(all_weight_params, description="Applying Codebook Compression"): - if wp.compression_config.mode != CompressWeightsMode.CODEBOOK: - continue - weight_name = wp.weight_name - config = wp.compression_config - - weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) - if len(weight_data) != 1: # not supported by the algorithm - continue - _, weight_port_id = weight_data[0] - - weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - - try: - indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config) - res[weight_name] = CompressedWeight(indexes, scale, None, Codebook(codebook, self._dst_type)) - except nncf.InvalidGroupSizeError as error: - first_caught_error = error - invalid_node_names.append(wp.node_with_weight.node_name) - - if first_caught_error: - handle_invalid_group_size_error(first_caught_error, invalid_node_names) - - return res - - def calculate_quantization_params( - self, - weight: Tensor, - reduction_axes: tuple[int, ...], - config: WeightCompressionConfig, - ) -> Tensor: - """ - Calculates the quantization parameters for a given set of weights and activations. - This function estimates the optimal quantization scale for weight compression by - minimizing the difference between floating-point operations and operations with - quantized weights. - - The function uses an iterative process: - 1. Initial scale rectification based on activation statistics. - 2. A grid search to further refine the scale parameters. - - :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, - together with original activation tensor shapes. - :param weight: The weight tensor that is being quantized. - :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. - :param config: Configuration parameters for the weight compression, including quantization settings. - :return: A tensor containing the calculated quantization scales and zero points if applicable. 
- """ - reduction_axis = reduction_axes[0] - - weight = weight.astype(TensorDataType.float32) - - codebook = fns.tensor( - self._initial_codebook, backend=weight.backend, dtype=TensorDataType.float32, device=weight.device - ) - - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 - - max_val = fns.max(fns.abs(codebook)) - if True: - norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - - orig_shape = norm_weight.shape - - # norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - - # dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 - - # indexes = dist.data.argmin(-1) - else: - norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - - orig_shape = norm_weight.shape - - norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - - dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 - - indexes = dist.data.argmin(-1) - - - # norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - - # norm_weight_, scale_ = do_float_quantization(weight, config, reduction_axis, max_val=max_val) - - # orig_shape = norm_weight_.shape - - # norm_weight_ = fns.unsqueeze(norm_weight_.flatten(), 1) - - # dist = (norm_weight_ - fns.unsqueeze(codebook, 0)) ** 2 - - # indexes_ = dist.data.argmin(-1) - # import numpy as np - # print(np.count_nonzero(indexes_ != indexes.flatten())) - - indexes = fns.reshape(indexes, orig_shape) - - return indexes, scale, codebook From 145fbf3a984f581e6d73533afef593fdbc3373c5 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 09:44:33 +0200 Subject: [PATCH 21/68] Refactoring. --- .../weight_compression/algorithm.py | 2 +- .../algorithms/weight_compression/config.py | 3 +- .../weight_compression/openvino_backend.py | 28 ++++++++----------- .../weight_compression/scale_estimation.py | 4 +-- .../weight_compression/weight_lowering.py | 21 +++++++------- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 4a932b9774a..1c0a6d93336 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -445,7 +445,7 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) + primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook) if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 36879412d45..80709f53d95 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -10,7 +10,7 @@ # limitations under the License. 
from dataclasses import dataclass from dataclasses import field -from typing import Optional, TypeVar +from typing import Optional, TypeVar, Any import numpy as np @@ -32,6 +32,7 @@ class WeightCompressionConfig: mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 + user_data: Optional[Any] = None @property def num_bits(self): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 432a7eeee32..8db1a463cb7 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -237,24 +237,25 @@ def _create_compression_subgraph( elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 elif compression_config.mode == CompressWeightsMode.CODEBOOK: - compression_dtype = None #ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 + compression_dtype = None else: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) original_shape = weight.shape - if compression_config.mode == CompressWeightsMode.CODEBOOK: - codebook_params = advanced_parameters.codebook_params - if compressed_weight is None: - compressed_weight = CompressedWeight(codebook=codebook_params.codebook) + with disable_results_caching(OV_MODEL_CACHE): compressed_weight = compress_weight( - weight, - reduction_axes, - compression_config, - compressed_weight, - ) + weight, + reduction_axes, + compression_config, + compressed_weight, + ) + if compression_config.mode == CompressWeightsMode.CODEBOOK: + n_quants = compressed_weight.tensor.max() + compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) + codebook_params = advanced_parameters.codebook_params converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, @@ -265,13 +266,6 @@ def _create_compression_subgraph( name=const_node_name, ) else: - with disable_results_caching(OV_MODEL_CACHE): - compressed_weight = compress_weight( - weight, - reduction_axes, - compression_config, - compressed_weight, - ) compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index a772dd107b2..8701250bace 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -301,7 +301,7 @@ def calculate_quantization_params( if i < initial_steps - 1: if config.mode == CompressWeightsMode.NF4: - out, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) + out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out, _, _ = do_integer_quantization( original_weight, @@ -319,7 +319,7 @@ def calculate_quantization_params( scaled_scale = factor * scale if config.mode == CompressWeightsMode.NF4: - out, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) + out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) else: out, _, _ = do_integer_quantization( original_weight, diff --git 
a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index a22d1e3f871..97d21bbdd05 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -159,7 +159,6 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, - quantiles: Optional[Tensor] = None, max_val: float = 6.0, ) -> tuple[Tensor, Tensor, Tensor]: """ @@ -195,8 +194,8 @@ def do_float_quantization( scale = precomputed_scale if scale is None: - if quantiles is not None: - max_val = max(quantiles) + if config.mode == CompressWeightsMode.CODEBOOK: + max_val = max(config.user_data) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -205,8 +204,8 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) - elif config.mode == CompressWeightsMode.CODEBOOK and quantiles is not None: - compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=quantiles) + elif config.mode == CompressWeightsMode.CODEBOOK: + compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.user_data) return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved @@ -250,7 +249,7 @@ def float_quantize_dequantize_weight( ) # Reference implementation - compressed_weight, scale = do_float_quantization(weight, config, reduction_axes, precomputed_scale) + compressed_weight, scale, _ = do_float_quantization(weight, config, reduction_axes, precomputed_scale) decompressed_weight = do_float_dequantization(compressed_weight, scale) if return_compressed_weight: return decompressed_weight, compressed_weight, scale @@ -352,14 +351,14 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - precomputed_scale, precomputed_zero_point, quantiles = ( - (compressed_weight.scale, compressed_weight.zero_point, compressed_weight.codebook) if compressed_weight else (None, None, None) + precomputed_scale, precomputed_zero_point = ( + (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) ) if not config.is_integer: - compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale, quantiles=quantiles) - if quantiles is not None and indexes is not None: - return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(quantiles), backend=compressed_weight.backend)) + compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) + if indexes is not None: + return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend)) else: return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( From d4e8578c8db4c66c32a015a04c66d60cf40fcdb4 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 13:44:15 +0200 Subject: [PATCH 22/68] Data aware codebook. 
--- nncf/openvino/graph/node_utils.py | 2 +- .../weight_compression/algorithm.py | 4 +++- .../algorithms/weight_compression/awq.py | 3 +-- .../algorithms/weight_compression/config.py | 2 +- .../algorithms/weight_compression/gptq.py | 8 ++++--- .../weight_compression/openvino_backend.py | 6 ++--- .../weight_compression/scale_estimation.py | 11 +++++----- .../weight_compression/weight_lowering.py | 22 +++++++++++++------ nncf/quantization/quantize_model.py | 1 - 9 files changed, 33 insertions(+), 26 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 0ba2ab1b970..75e5208ac43 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,7 +699,7 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. """ - codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) #create_ov_const_from_tensor(codebook, codebook_dtype)# + codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) if codebook_dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 1c0a6d93336..89eb99d8fa8 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -445,7 +445,9 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook) + primary_config = WeightCompressionConfig( + mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook + ) if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py index fbab09a1fdf..fa423828fc1 100644 --- a/nncf/quantization/algorithms/weight_compression/awq.py +++ b/nncf/quantization/algorithms/weight_compression/awq.py @@ -25,7 +25,6 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend @@ -250,7 +249,7 @@ def _data_aware_step(self, wp, weight, statistics): for _ in range(self._steps): cur_scale = gscale**alpha weights_to_fake_quantize = gweight * cur_scale - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: g_decompressed_weighs = float_quantize_dequantize_weight( weights_to_fake_quantize, awq_config, reduction_axis ) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 80709f53d95..2939e2af609 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -10,7 +10,7 @@ # limitations under the License. 
from dataclasses import dataclass from dataclasses import field -from typing import Optional, TypeVar, Any +from typing import Any, Optional, TypeVar import numpy as np diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 70a340b36b2..015b2628cee 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -235,7 +235,9 @@ def _quantize_weights( else weight_tensor.shape[1] ) reduction_axes = wc_params.reduction_axes - block_compression_config = WeightCompressionConfig(mode=wc_params.compression_config.mode) + block_compression_config = WeightCompressionConfig( + mode=wc_params.compression_config.mode, user_data=wc_params.compression_config.user_data + ) damp = self._damp_percent * fns.mean(fns.diag(hessian)) diag_indices = fns.arange(columns, backend=hessian.backend, device=hessian.device) @@ -260,7 +262,7 @@ def _quantize_weights( hessian_diag_val = hessian_inv_block[i, i] if (i1 + i) % group_size == 0: - if block_compression_config.mode == CompressWeightsMode.NF4: + if not block_compression_config.is_integer: scale = calculate_float_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config ) @@ -289,7 +291,7 @@ def _quantize_weights( # optimized OV compression performs worse than numpy compression. # TODO(nikita-savelyevv): Remove this workaround by introducing logic that will control whether to # execute optimized compression based on input size. - if block_compression_config.mode == CompressWeightsMode.NF4: + if not block_compression_config.is_integer: quantized_col = float_quantize_dequantize_weight( fns.unsqueeze(weight_col, 1), block_compression_config, diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 8db1a463cb7..447db197855 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -260,9 +260,7 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=codebook_params.dst_type - if codebook_params.dst_type - else ov.Type.f8e4m3, + codebook_dtype=codebook_params.dst_type if codebook_params.dst_type else ov.Type.f8e4m3, name=const_node_name, ) else: @@ -337,7 +335,7 @@ def transform_model( const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, compressed_weight=compressed_weight, - advanced_parameters=advanced_parameters + advanced_parameters=advanced_parameters, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 8701250bace..020a42c8f16 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -18,7 +18,6 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend 
import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.common import CompressedWeight @@ -213,7 +212,7 @@ def calculate_quantization_params( cur_config.group_size = group_size original_weight = fns.zeros_like(weight) + weight - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: q_weights, compressed_weights, scale = float_quantize_dequantize_weight( original_weight, cur_config, reduction_axis, return_compressed_weight=True ) @@ -262,7 +261,7 @@ def calculate_quantization_params( near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out = float_quantize_dequantize_weight( original_weight, config, @@ -300,7 +299,7 @@ def calculate_quantization_params( result_scale = near_to_ideal_scale if i < initial_steps - 1: - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out, _, _ = do_integer_quantization( @@ -318,7 +317,7 @@ def calculate_quantization_params( factor = 1.0 - 0.05 * scale_step scaled_scale = factor * scale - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) else: out, _, _ = do_integer_quantization( @@ -334,7 +333,7 @@ def calculate_quantization_params( near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out = float_quantize_dequantize_weight(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out = integer_quantize_dequantize_weight( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 97d21bbdd05..774d720b65b 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -172,7 +172,8 @@ def do_float_quantization( :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. :param max_val: Maximal value of destination type. - :return: Returns quantized (for codebook and e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. + :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and + optional indexes for codebook. """ assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] @@ -231,11 +232,11 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK] # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved # Optimized implementation - if _can_run_optimized(weight.backend): + if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight.backend): from nncf.openvino.optimized_functions import ( float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov, ) @@ -358,7 +359,9 @@ def compress_weight( if not config.is_integer: compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) if indexes is not None: - return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend)) + return CompressedWeight( + indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend) + ) else: return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( @@ -532,14 +535,19 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: return quantized_weight -def _calculate_codebook_quantized_weight(norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None) -> Tensor: +def _calculate_codebook_quantized_weight( + norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None +) -> Tensor: """ - Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to "round" or "quantize" to the closest quant. + Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to + "round" or "quantize" to the closest quant. :param norm_weight: Weight tensor to quantize already normalized to quantiles range. :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. """ - assert quantiles is not None or center_of_quantiles is not None, "Either quantiles or center_of_quantiles should be provided" + assert quantiles is not None or center_of_quantiles is not None, ( + "Either quantiles or center_of_quantiles should be provided" + ) if center_of_quantiles is None: quantiles = np.array(quantiles) diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 6dc78e45f4a..0595a46d5f0 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -599,7 +599,6 @@ def compress_weights( if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ CompressWeightsMode.E2M1, - CompressWeightsMode.CODEBOOK, ]: msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." raise nncf.ParameterNotSupportedError(msg) From ac0346dea67b4854e6ccdf2e022eabbb9fc5c034 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 14:10:24 +0200 Subject: [PATCH 23/68] Fixed test. 
--- .../openvino/smollm2_360m_codebook/main.py | 32 +++++++++---------- .../test_compression_functions.py | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index b53bc5433d8..923829f08b2 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -41,10 +41,10 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions -def default_codebook_example(MODEL_ID, OUTPUT_DIR): - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +def default_codebook_example(model_id, output_dir): + tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( - MODEL_ID, + model_id, export=True, load_in_8bit=False, compile=False, @@ -63,20 +63,20 @@ def default_codebook_example(MODEL_ID, OUTPUT_DIR): print(f"Non-optimized model outputs:\n{answers_by_questions}\n") model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) - model.save_pretrained(OUTPUT_DIR) - tokenizer.save_pretrained(OUTPUT_DIR) + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) model = OVModelForCausalLM.from_pretrained( - OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") -def custom_codebook_example(MODEL_ID, OUTPUT_DIR): - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +def custom_codebook_example(model_id, output_dir): + tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( - MODEL_ID, + model_id, export=True, load_in_8bit=False, compile=False, @@ -103,22 +103,22 @@ def custom_codebook_example(MODEL_ID, OUTPUT_DIR): group_size=-1, advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), ) - model.save_pretrained(OUTPUT_DIR) - tokenizer.save_pretrained(OUTPUT_DIR) + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) model = OVModelForCausalLM.from_pretrained( - OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") def main(): - MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" - OUTPUT_DIR = "smollm2_360m_compressed_codebook" + model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" + output_dir = "smollm2_360m_compressed_codebook" - default_codebook_example(MODEL_ID, OUTPUT_DIR) - custom_codebook_example(MODEL_ID, OUTPUT_DIR + "_custom") + default_codebook_example(model_id, output_dir) + custom_codebook_example(model_id, output_dir + "_custom") if __name__ == "__main__": diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 77eaaf9364c..b57c3bb9281 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -215,7 +215,7 @@ 
def test_quantization_alignment(weight_shape, config, quantization_task, tensor_ if config.is_integer: compressed_weight, scale, zero_point = outputs else: - compressed_weight, scale = outputs + compressed_weight, scale, _ = outputs elif quantization_task == QuantizationTask.Q_DQ: decompressed_weight = outputs else: From 5fb55e42fc6a0dd846ce4203c0f607facff002cd Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 15:21:23 +0200 Subject: [PATCH 24/68] Fixed tests. --- .../openvino/smollm2_360m_codebook/main.py | 30 ++++++++----------- .../weight_compression/algorithm.py | 11 ++++--- .../weight_compression/weight_lowering.py | 2 +- .../quantization/test_weights_compression.py | 24 ++++----------- 4 files changed, 25 insertions(+), 42 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 923829f08b2..5c797f8c691 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -41,6 +41,14 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions +QUESTIONS = [ + "What is the capital of France?", + "What is the highest peak in the Alps?", + "What is the largest city in Canada?", + "What is the most visited city in Japan?", +] + + def default_codebook_example(model_id, output_dir): tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( @@ -52,14 +60,7 @@ def default_codebook_example(model_id, output_dir): ov_config={"INFERENCE_PRECISION_HINT": "f32"}, ) - questions = [ - "What is the capital of France?", - "What is the highest peak in the Alps?", - "What is the largest city in Canada?", - "What is the most visited city in Japan?", - ] - - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) @@ -69,7 +70,7 @@ def default_codebook_example(model_id, output_dir): model = OVModelForCausalLM.from_pretrained( output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") @@ -84,14 +85,7 @@ def custom_codebook_example(model_id, output_dir): ov_config={"INFERENCE_PRECISION_HINT": "f32"}, ) - questions = [ - "What is the capital of France?", - "What is the highest peak in the Alps?", - "What is the largest city in Canada?", - "What is the most visited city in Japan?", - ] - - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") codebook_params = nncf.AdvancedCodebookParameters([-8, -4, -2, -1, 0, 1, 2, 4, 8], ov.Type.i8) @@ -109,7 +103,7 @@ def custom_codebook_example(model_id, output_dir): model = OVModelForCausalLM.from_pretrained( output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, 
tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 89eb99d8fa8..a24222f9581 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -293,7 +293,7 @@ def __init__( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) + primary_config = self._get_primary_config() criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) self._mixed_precision_algo = criterion_cls(primary_config, self._ratio, self._subset_size) self._statistics_path = self._advanced_parameters.statistics_path @@ -429,6 +429,11 @@ def _get_ratio_defining_params( return ratio_defining_params + def _get_primary_config(self): + return WeightCompressionConfig( + mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook + ) + def _set_weight_compression_config( self, ratio_defining_params: list[WeightCompressionParameters], @@ -445,9 +450,7 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig( - mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook - ) + primary_config = self._get_primary_config() if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index a7edf80cab0..c18abe82f8a 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -199,7 +199,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: if config.mode == CompressWeightsMode.CODEBOOK: - max_val = max(config.user_data) + max_val = max(np.abs(np.array(config.user_data))) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index dece9e25e8f..44cc8ca62c1 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,7 +41,6 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams -from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -49,6 +48,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import 
MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization @@ -699,20 +699,6 @@ def test_raise_error_with_unsupported_params_for_e2m1(algo): compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.E2M1, **{algo: True}) -@pytest.mark.parametrize( - "algo", - ( - "lora_correction", - "awq", - "scale_estimation", - "gptq", - ), -) -def test_raise_error_with_unsupported_params_for_codebook(algo): - with pytest.raises(nncf.ParameterNotSupportedError): - compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.CODEBOOK, **{algo: True}) - - @pytest.mark.parametrize("mode", INT4_NF4_MODES) @pytest.mark.parametrize( "algo", @@ -1132,12 +1118,12 @@ def test_compressed_weighs_range(mode, data): ) def test_codebook_weighs_range(data): data = np.array(data).astype(np.float32) + codebook = data max_diff = 0.1 w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) - config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK) - codebook_compression = CodebookCompression(initial_codebook=data, dst_type=None) - indexes, scale, codebook = codebook_compression.calculate_quantization_params(w, [-1], config) - uncompressed_data = codebook[indexes] * scale + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, user_data=data) + _, scale, indexes = do_float_quantization(w, config, -1) + uncompressed_data = codebook[indexes.data] * scale.data indexes = indexes.flatten() target = np.arange(indexes.shape[0]) From bf94228a9f945f77200ddd04f32246c0155294c3 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 28 May 2025 10:21:39 +0200 Subject: [PATCH 25/68] Added CB4_F8E4M3 type. 
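CB4_F8E4M3 is a codebook mode with a fixed 16-entry lookup table of f8e4m3 values, so unlike CODEBOOK it needs no user-supplied table. A minimal usage sketch, mirroring the example change in this patch (the `model` object is assumed to be an optimum-intel OVModelForCausalLM):

    import nncf

    # The 16 fp8 codebook values are built in, so no advanced_parameters are required.
    model.model = nncf.compress_weights(
        model.model,
        mode=nncf.CompressWeightsMode.CB4_F8E4M3,
        ratio=1.0,
        group_size=64,
    )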
--- .../openvino/smollm2_360m_codebook/main.py | 6 ++-- nncf/parameters.py | 2 ++ .../weight_compression/algorithm.py | 7 +++- .../algorithms/weight_compression/config.py | 14 +++++++- .../weight_compression/openvino_backend.py | 4 +-- .../weight_compression/weight_lowering.py | 32 ++++++++++++++++--- 6 files changed, 54 insertions(+), 11 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 5c797f8c691..feaa3fe8fec 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -63,7 +63,7 @@ def default_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) @@ -88,7 +88,9 @@ def custom_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - codebook_params = nncf.AdvancedCodebookParameters([-8, -4, -2, -1, 0, 1, 2, 4, 8], ov.Type.i8) + codebook_params = nncf.AdvancedCodebookParameters( + [-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8 + ) model.model = nncf.compress_weights( model.model, diff --git a/nncf/parameters.py b/nncf/parameters.py index 50567733098..b8966210d75 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -86,6 +86,7 @@ class CompressWeightsMode(StrEnum): :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. :param CODEBOOK: Codebook (LUT) quantization format. + :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values. 
""" INT8_SYM = "int8_sym" @@ -96,6 +97,7 @@ class CompressWeightsMode(StrEnum): INT8 = "int8" # Deprecated mode E2M1 = "e2m1" CODEBOOK = "codebook" + CB4_F8E4M3 = "cb4_f8e4m3" @api(canonical_alias="nncf.CompressionFormat") diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index a24222f9581..bff4e9abc38 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -43,6 +43,7 @@ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation +from nncf.quantization.algorithms.weight_compression.weight_lowering import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig from nncf.scopes import IgnoredScope from nncf.scopes import get_ignored_node_names_from_ignored_scope @@ -431,7 +432,11 @@ def _get_ratio_defining_params( def _get_primary_config(self): return WeightCompressionConfig( - mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook + mode=self._mode, + group_size=self._group_size, + user_data=CB4_QUANTILES + if self._mode == CompressWeightsMode.CB4_F8E4M3 + else self._advanced_parameters.codebook_params.codebook, ) def _set_weight_compression_config( diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 2939e2af609..ff9b3eb10e9 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -50,7 +50,19 @@ def is_integer(self): """ :return: True if compression type in integer, else False. """ - return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] + return self.mode not in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ] + + @property + def is_codebook(self): + """ + :return: True if compression type is codebook, else False. + """ + return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 447db197855..cd848c4120a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -236,7 +236,7 @@ def _create_compression_subgraph( compression_dtype = ov.Type.i8 elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 - elif compression_config.mode == CompressWeightsMode.CODEBOOK: + elif compression_config.is_codebook: compression_dtype = None else: msg = f"{compression_config.mode.value} is not supported." 
@@ -252,7 +252,7 @@ def _create_compression_subgraph( compressed_weight, ) - if compression_config.mode == CompressWeightsMode.CODEBOOK: + if compression_config.is_codebook: n_quants = compressed_weight.tensor.max() compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) codebook_params = advanced_parameters.codebook_params diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c18abe82f8a..bb43205d276 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -50,6 +50,28 @@ dtype=np.float32, ) +CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, +) + CENTER_OF_NF4_QUANTILES = np.array( [ -0.84809643, @@ -119,13 +141,13 @@ def calculate_float_quantization_params( :param max_val: Maximal value of e2m1 type. :return: Scale tensor of float32 type for float quantization. """ - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] + assert not config.is_integer if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) - if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero @@ -178,7 +200,7 @@ def do_float_quantization( :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. """ - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] + assert not config.is_integer if config.group_size != -1 and reduction_axes is not None: # weights are reshaped: [a1, r, a2] -> [a1, r//gs, gs, a2] @@ -198,7 +220,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: - if config.mode == CompressWeightsMode.CODEBOOK: + if config.is_codebook: max_val = max(np.abs(np.array(config.user_data))) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) @@ -208,7 +230,7 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) - elif config.mode == CompressWeightsMode.CODEBOOK: + elif config.is_codebook: compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.user_data) return compressed_weight, scale, indexes else: From 37a7c590fd2cf11aca030a402deb627794ac5d8c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 28 May 2025 10:37:51 +0200 Subject: [PATCH 26/68] Fixed pre-commit. 
--- nncf/quantization/advanced_parameters.py | 1 + nncf/quantization/algorithms/weight_compression/onnx_backend.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index ba2bcccfad6..9d67d90a56f 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -364,6 +364,7 @@ class AdvancedLoraCorrectionParameters: class AdvancedCodebookParameters: """ Contains advanced parameters for codebook compression algorithm. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization. :type codebook: list[Any] diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 478ddd076c1..cdcef81d77c 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -50,7 +50,6 @@ from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType From 6006be654e065e0ce5058aabaa5fed6fd53c43c1 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 12:04:35 +0200 Subject: [PATCH 27/68] Applied suggestions. --- nncf/quantization/advanced_parameters.py | 21 +------------------ .../algorithms/weight_compression/backend.py | 2 +- .../algorithms/weight_compression/common.py | 2 +- .../weight_compression/onnx_backend.py | 2 +- .../weight_compression/openvino_backend.py | 4 ++-- .../weight_compression/torch_backend.py | 4 ++-- .../weight_compression/torch_fx_backend.py | 4 ++-- 7 files changed, 10 insertions(+), 29 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 9d67d90a56f..fa9746900d4 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -371,26 +371,7 @@ class AdvancedCodebookParameters: :param dts_type: The type of the codebook. 
""" - codebook: list[Any] = field( - default_factory=lambda: [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.2812, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ] - ) + codebook: list[Any] = None dst_type: Any = None diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 2d928ff2908..cee8763995b 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -149,7 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py index a172899374f..8c1d60fd400 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -50,4 +50,4 @@ def is_codebook(self): :return: True if the compressed weight is a codebook, False otherwise. """ - return not (self.codebook is None or self.tensor is None or self.scale is None) + return self.codebook is not None and self.tensor is not None and self.scale is not None diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index cdcef81d77c..c0a2ab73849 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -201,7 +201,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index cd848c4120a..6a3a3bf1c56 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -299,8 +299,8 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> ov.Model: diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 92306f0a24b..f4254bfb0c5 100644 --- 
a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -456,8 +456,8 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> NNCFNetwork: diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 80597096346..2172a6a5e37 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -190,8 +190,8 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> torch.fx.GraphModule: From caed8a8569451f1ea2648cea544ee6d6f32693ca Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 12:25:44 +0200 Subject: [PATCH 28/68] Fixed tests. --- nncf/quantization/advanced_parameters.py | 2 +- tests/openvino/native/quantization/test_weights_compression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index fa9746900d4..945ae9bd68c 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -371,7 +371,7 @@ class AdvancedCodebookParameters: :param dts_type: The type of the codebook. """ - codebook: list[Any] = None + codebook: Optional[list[Any]] = None dst_type: Any = None diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 44cc8ca62c1..cf07a96d486 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1043,7 +1043,7 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): model = SequentialMatmulModel().ov_model compressed_model = compress_weights( model, - mode=CompressWeightsMode.CODEBOOK, + mode=CompressWeightsMode.CB4_F8E4M3, ratio=ratio, group_size=1, all_layers=all_layers, From 0a36b5169e6a1e92f1da034d389397cf75442270 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 18:38:57 +0200 Subject: [PATCH 29/68] Added codebook parameters validation.
--- .../weight_compression/algorithm.py | 6 ++++++ nncf/quantization/quantize_model.py | 21 ++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index bff4e9abc38..6c90bda6f44 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -208,6 +208,12 @@ def check_user_compression_configuration( msg = "LoRA Correction algorithm is not compatible with FQ, FQ_LORA and FQ_LORA_NLS compression formats." raise nncf.ValidationError(msg) + if mode == CompressWeightsMode.CODEBOOK and ( + advanced_parameters is None or advanced_parameters.codebook_params.codebook is not None + ): + msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters." + raise nncf.ValidationError(msg) + class WeightCompression(Algorithm): """ diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 0595a46d5f0..5b69e52359d 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -516,7 +516,12 @@ def compress_weights( from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) @@ -560,7 +565,12 @@ def compress_weights( compress_weights_impl as fx_compression_weights_impl, ) - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) @@ -616,7 +626,12 @@ def compress_weights( elif backend == BackendType.ONNX: from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: msg = "ONNX backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) From 68d633bbf65a0414f54250f5fb128cd27f45564c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 19:05:25 +0200 Subject: [PATCH 30/68] Fixed bug. 
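The check introduced in the previous patch had its condition inverted and rejected exactly the configurations that did supply a codebook; after this fix the error fires only when CODEBOOK mode is requested without one. A sketch of the intended behaviour (using a hypothetical `ov_model`):

    import nncf

    # Raises nncf.ValidationError: CODEBOOK mode without a codebook in advanced_parameters.
    nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.CODEBOOK)

    # Passes validation: the codebook is provided explicitly.
    nncf.compress_weights(
        ov_model,
        mode=nncf.CompressWeightsMode.CODEBOOK,
        group_size=64,
        advanced_parameters=nncf.AdvancedCompressionParameters(
            codebook_params=nncf.AdvancedCodebookParameters(codebook=[-1.0, -0.5, 0.0, 0.5, 1.0]),
        ),
    )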
--- nncf/quantization/algorithms/weight_compression/algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 6c90bda6f44..d21a851e0eb 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -209,7 +209,7 @@ def check_user_compression_configuration( raise nncf.ValidationError(msg) if mode == CompressWeightsMode.CODEBOOK and ( - advanced_parameters is None or advanced_parameters.codebook_params.codebook is not None + advanced_parameters is None or advanced_parameters.codebook_params.codebook is None ): msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters." raise nncf.ValidationError(msg) From 508aec426b795ce1f8911015c0fa8b75607032c6 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 6 Jun 2025 10:26:00 +0200 Subject: [PATCH 31/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 17 +++++++++-------- .../algorithms/weight_compression/algorithm.py | 2 +- .../algorithms/weight_compression/config.py | 2 +- .../algorithms/weight_compression/gptq.py | 2 +- .../weight_compression/weight_lowering.py | 14 ++++++++++---- nncf/version.py | 2 +- tests/cross_fw/examples/example_scope.json | 17 +++++++++++++++++ tests/cross_fw/examples/run_example.py | 6 ++++++ .../quantization/test_weights_compression.py | 2 +- 9 files changed, 47 insertions(+), 17 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index feaa3fe8fec..4eb3ef98612 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -67,12 +67,12 @@ def default_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained( - output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} - ) + model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") + return list(answers_by_questions.values()) + def custom_codebook_example(model_id, output_dir): tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -102,19 +102,20 @@ def custom_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained( - output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} - ) + model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") + return list(answers_by_questions.values()) + def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" output_dir = "smollm2_360m_compressed_codebook" - default_codebook_example(model_id, output_dir) - custom_codebook_example(model_id, output_dir + "_custom") + res = default_codebook_example(model_id, output_dir) + res += custom_codebook_example(model_id, output_dir + "_custom") + return res if __name__ == "__main__": diff --git 
a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index d21a851e0eb..7070311f8e4 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -440,7 +440,7 @@ def _get_primary_config(self): return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - user_data=CB4_QUANTILES + codebook_values=CB4_QUANTILES if self._mode == CompressWeightsMode.CB4_F8E4M3 else self._advanced_parameters.codebook_params.codebook, ) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index ff9b3eb10e9..519be93dee4 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -32,7 +32,7 @@ class WeightCompressionConfig: mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 - user_data: Optional[Any] = None + codebook_values: Optional[Any] = None @property def num_bits(self): diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 963c9feb252..767fe2f5127 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -235,7 +235,7 @@ def _quantize_weights( ) reduction_axes = wc_params.reduction_axes block_compression_config = WeightCompressionConfig( - mode=wc_params.compression_config.mode, user_data=wc_params.compression_config.user_data + mode=wc_params.compression_config.mode, codebook_values=wc_params.compression_config.codebook_values ) damp = self._damp_percent * fns.mean(fns.diag(hessian)) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index bb43205d276..bd1031227ba 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -221,7 +221,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: if config.is_codebook: - max_val = max(np.abs(np.array(config.user_data))) + max_val = max(np.abs(np.array(config.codebook_values))) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -231,7 +231,7 @@ def do_float_quantization( else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) elif config.is_codebook: - compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.user_data) + compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.codebook_values) return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved @@ -257,7 +257,7 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK] + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved # Optimized implementation @@ -385,7 +385,10 @@ def compress_weight( compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) if indexes is not None: return CompressedWeight( - indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend) + indexes, + scale, + None, + fns.from_numpy(np.array(config.codebook_values), backend=compressed_weight.backend), ) else: return CompressedWeight(compressed_weight, scale) @@ -568,6 +571,9 @@ def _calculate_codebook_quantized_weight( "round" or "quantize" to the closest quant. :param norm_weight: Weight tensor to quantize already normalized to quantiles range. + :param quantiles: Quantiles to use for quantization. If None, the center_of_quantiles must be provided. + :param center_of_quantiles: Center of quantiles to use for quantization. If None, it is calculated as the average + of adjacent quantiles. :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. """ assert quantiles is not None or center_of_quantiles is not None, ( diff --git a/nncf/version.py b/nncf/version.py index cec4ea22fb5..3769834a0b7 100644 --- a/nncf/version.py +++ b/nncf/version.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.17.0.dev0+6ccd252b3dirty" +__version__ = "2.17.0" BKC_TORCH_SPEC = "==2.7.*" diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index 36e627b1f00..565a77d468c 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -275,6 +275,23 @@ ] } }, + "codebook_llm_compression": { + "backend": "openvino", + "requirements": "examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt", + "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz", + "accuracy_metrics": { + "answers": [ + "Paris.", + "Mont Blanc.", + "Toronto.", + "Tokyo.", + "Paris.", + "Mont Blanc.", + "Toronto.", + "Tokyo." 
+ ] + } + }, "llm_compression_qat_with_lora": { "backend": "torch", "device": "cuda", diff --git a/tests/cross_fw/examples/run_example.py b/tests/cross_fw/examples/run_example.py index b44e130b283..09dd29fb719 100644 --- a/tests/cross_fw/examples/run_example.py +++ b/tests/cross_fw/examples/run_example.py @@ -192,6 +192,12 @@ def fp8_llm_quantization() -> dict[str, float]: return {"answers": list(result.values())} +def codebook_llm_compression() -> list[str]: + from examples.llm_compression.openvino.smollm2_360m_codebook.main import main as codebook_llm_compression_main + + return codebook_llm_compression_main() + + def llm_compression_qat_with_lora() -> float: from examples.llm_compression.torch.qat_with_lora.main import main as qat_with_lora_main diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index cf07a96d486..4a364751e83 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1121,7 +1121,7 @@ def test_codebook_weighs_range(data): codebook = data max_diff = 0.1 w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) - config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, user_data=data) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=data) _, scale, indexes = do_float_quantization(w, config, -1) uncompressed_data = codebook[indexes.data] * scale.data From 79f93680d146b1fc74898f48b226c8e60d77462a Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 10 Jun 2025 10:51:21 +0200 Subject: [PATCH 32/68] Added description for codebook parameter. --- nncf/quantization/advanced_parameters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 945ae9bd68c..3b521a6144a 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -395,6 +395,8 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] + :param codebook_params: Advanced parameters for codebook compression. + :type codebook_params: AdvancedCodebookParameters """ statistics_path: Optional[str] = None From 8c9b7b527ae19aecddfdcc08349b46201fc4cfa3 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 10 Jun 2025 12:05:37 +0200 Subject: [PATCH 33/68] Renamed global parameter for codebook. 
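The public dataclass is now exported as nncf.CodebookParameters rather than nncf.AdvancedCodebookParameters; the fields are unchanged. The custom-codebook example in this patch now reads (sketch):

    import openvino as ov
    import nncf

    # Same custom codebook as before, built with the renamed class.
    codebook_params = nncf.CodebookParameters(
        [-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8
    )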
--- .../openvino/smollm2_360m_codebook/main.py | 4 +--- nncf/__init__.py | 2 +- nncf/openvino/graph/node_utils.py | 1 + nncf/quantization/advanced_parameters.py | 8 ++++---- .../native/quantization/test_weights_compression.py | 4 ++-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 4eb3ef98612..68a0ea50980 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -88,9 +88,7 @@ def custom_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - codebook_params = nncf.AdvancedCodebookParameters( - [-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8 - ) + codebook_params = nncf.CodebookParameters([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8) model.model = nncf.compress_weights( model.model, diff --git a/nncf/__init__.py b/nncf/__init__.py index a0f9a45183f..14e1c38740f 100644 --- a/nncf/__init__.py +++ b/nncf/__init__.py @@ -52,13 +52,13 @@ ) from nncf.quantization.advanced_parameters import AdvancedAWQParameters as AdvancedAWQParameters from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters -from nncf.quantization.advanced_parameters import AdvancedCodebookParameters as AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters as AdvancedSmoothQuantParameters +from nncf.quantization.advanced_parameters import CodebookParameters as CodebookParameters from nncf.quantization.advanced_parameters import OverflowFix as OverflowFix from nncf.scopes import IgnoredScope as IgnoredScope from nncf.scopes import Subgraph as Subgraph diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 75e5208ac43..f1c05fdb86e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -692,6 +692,7 @@ def create_ov_codebook_subgraph( ) -> op.Constant: """ Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. + :param codebook: Codebook tensor. :param indexes: Indexes tensor. :param dtype: Data type of the indexes. diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 3b521a6144a..0ae6762a475 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -361,7 +361,7 @@ class AdvancedLoraCorrectionParameters: @api() @dataclass -class AdvancedCodebookParameters: +class CodebookParameters: """ Contains advanced parameters for codebook compression algorithm. 
@@ -395,8 +395,8 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] - :param codebook_params: Advanced parameters for codebook compression. - :type codebook_params: AdvancedCodebookParameters + :param codebook_params: Parameters for codebook compression. + :type codebook_params: CodebookParameters """ statistics_path: Optional[str] = None @@ -408,7 +408,7 @@ class AdvancedCompressionParameters: lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) lora_adapter_rank: int = 256 backend_params: dict[str, Any] = field(default_factory=dict) - codebook_params: AdvancedCodebookParameters = field(default_factory=AdvancedCodebookParameters) + codebook_params: CodebookParameters = field(default_factory=CodebookParameters) @api() diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 4a364751e83..ee6f1bab7e4 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -36,11 +36,11 @@ from nncf.parameters import BackupMode from nncf.parameters import CompressionFormat from nncf.quantization import compress_weights -from nncf.quantization.advanced_parameters import AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams +from nncf.quantization.advanced_parameters import CodebookParameters from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -1076,7 +1076,7 @@ def test_codebook(codebook, dst_type, n_layers, group_size): group_size=group_size, all_layers=True, advanced_parameters=AdvancedCompressionParameters( - codebook_params=AdvancedCodebookParameters(codebook=codebook, dst_type=dst_type) + codebook_params=CodebookParameters(codebook=codebook, dst_type=dst_type) ), ) names_codebook = [ From 9bd8c4b54f830c160a2bb34c2d2d26ebf9f1464d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 11 Jun 2025 13:23:23 +0200 Subject: [PATCH 34/68] Removed tensor type. --- .../openvino/smollm2_360m_codebook/main.py | 8 +++++--- nncf/openvino/graph/node_utils.py | 7 +++---- nncf/quantization/advanced_parameters.py | 1 - .../algorithms/weight_compression/openvino_backend.py | 2 -- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 68a0ea50980..ab6268a8b7f 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import openvino as ov +import numpy as np from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer @@ -88,7 +88,9 @@ def custom_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - codebook_params = nncf.CodebookParameters([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8) + codebook_params = nncf.CodebookParameters( + np.array([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], dtype=np.int8) + ) model.model = nncf.compress_weights( model.model, @@ -109,7 +111,7 @@ def custom_codebook_example(model_id, output_dir): def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - output_dir = "smollm2_360m_compressed_codebook" + output_dir = "smollm2_360m_compressed_codebook_" res = default_codebook_example(model_id, output_dir) res += custom_codebook_example(model_id, output_dir + "_custom") diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index f1c05fdb86e..edccbcfff0e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -688,7 +688,7 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = def create_ov_codebook_subgraph( - codebook: Tensor, indexes: Tensor, dtype: ov.Type, codebook_dtype: ov.Type, name: Optional[str] = None + codebook: Tensor, indexes: Tensor, dtype: ov.Type, name: Optional[str] = None ) -> op.Constant: """ Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. @@ -696,12 +696,11 @@ def create_ov_codebook_subgraph( :param codebook: Codebook tensor. :param indexes: Indexes tensor. :param dtype: Data type of the indexes. - :param codebook_dtype: Data type of the codebook. :param name: Optional name of the constant. :return: OpenVINO subgraph. 
""" - codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) - if codebook_dtype != ov.Type.f16: + codebook_const = opset.constant(codebook.data) + if codebook.dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 0ae6762a475..7279d0d4ab3 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -372,7 +372,6 @@ class CodebookParameters: """ codebook: Optional[list[Any]] = None - dst_type: Any = None @api() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 6a3a3bf1c56..96e8ffe4f62 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -255,12 +255,10 @@ def _create_compression_subgraph( if compression_config.is_codebook: n_quants = compressed_weight.tensor.max() compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) - codebook_params = advanced_parameters.codebook_params converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=codebook_params.dst_type if codebook_params.dst_type else ov.Type.f8e4m3, name=const_node_name, ) else: From 8f6eb33dda4bda0f5a24230d4ebce626e0c600ff Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 12 Jun 2025 14:43:22 +0200 Subject: [PATCH 35/68] 1) Applied suggestions. 2) Removed codebook gather from input nodes by name given in compression. 
--- nncf/openvino/graph/metatypes/openvino_metatypes.py | 10 +++------- nncf/parameters.py | 2 +- nncf/quantization/advanced_parameters.py | 2 +- .../algorithms/weight_compression/openvino_backend.py | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/nncf/openvino/graph/metatypes/openvino_metatypes.py b/nncf/openvino/graph/metatypes/openvino_metatypes.py index fe433739237..e2b95afc241 100644 --- a/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -819,13 +819,9 @@ def _is_embedding(node: ov.Node) -> bool: input_tensor = node.input_value(const_port_id) input_type = input_tensor.get_element_type().get_type_name() - try: - input_node = node.input(const_port_id).get_source_output().node - if input_node.get_type_info().name == "Convert": - input_type = input_node.input_value(0).get_element_type().get_type_name() - except AttributeError: - # Handle the case where input_node is not available - pass + if node.friendly_name.endswith("nncf_codebook"): + return False + if input_type in allowed_types_list: const_node = get_operation_const_op(node, const_port_id) if const_node is not None: diff --git a/nncf/parameters.py b/nncf/parameters.py index b8966210d75..55ef80046de 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -94,10 +94,10 @@ class CompressWeightsMode(StrEnum): INT4_SYM = "int4_sym" INT4_ASYM = "int4_asym" NF4 = "nf4" + CB4_F8E4M3 = "cb4_f8e4m3" INT8 = "int8" # Deprecated mode E2M1 = "e2m1" CODEBOOK = "codebook" - CB4_F8E4M3 = "cb4_f8e4m3" @api(canonical_alias="nncf.CompressionFormat") diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 0ae6762a475..93b02f2adec 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -363,7 +363,7 @@ class AdvancedLoraCorrectionParameters: @dataclass class CodebookParameters: """ - Contains advanced parameters for codebook compression algorithm. + Contains parameters for codebook compression algorithm. :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization. diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 6a3a3bf1c56..6175cf9857e 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -261,7 +261,7 @@ def _create_compression_subgraph( indexes=compressed_weight.tensor, dtype=compression_dtype, codebook_dtype=codebook_params.dst_type if codebook_params.dst_type else ov.Type.f8e4m3, - name=const_node_name, + name=const_node_name + "_nncf_codebook", ) else: compressed_const = create_ov_const_from_tensor( From 8c7f42866b768d49a7e5e693a49ba30265b5dab1 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 15:42:51 +0200 Subject: [PATCH 36/68] Removed data type from codebook parameters. 
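The codebook is now carried on WeightCompressionConfig as an nncf Tensor (codebook_values) and read back as numpy via get_numpy_codebook(), while the predefined CB4/NF4 tables move to a constants module. A sketch of the internal usage with the names from this patch (the codebook values below are hypothetical):

    import numpy as np

    from nncf.parameters import CompressWeightsMode
    from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
    from nncf.tensor import Tensor

    config = WeightCompressionConfig(
        mode=CompressWeightsMode.CODEBOOK,
        group_size=64,
        codebook_values=Tensor(np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)),
    )
    quantiles = config.get_numpy_codebook()  # plain numpy array consumed by weight_lowering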
--- .../weight_compression/algorithm.py | 5 +- .../algorithms/weight_compression/config.py | 3 + .../weight_compression/openvino_backend.py | 1 - .../weight_compression/weight_lowering.py | 75 ++----------------- 4 files changed, 13 insertions(+), 71 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 7070311f8e4..339c2612851 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -39,14 +39,15 @@ from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation -from nncf.quantization.algorithms.weight_compression.weight_lowering import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig from nncf.scopes import IgnoredScope from nncf.scopes import get_ignored_node_names_from_ignored_scope +from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType TModel = TypeVar("TModel") @@ -442,7 +443,7 @@ def _get_primary_config(self): group_size=self._group_size, codebook_values=CB4_QUANTILES if self._mode == CompressWeightsMode.CB4_F8E4M3 - else self._advanced_parameters.codebook_params.codebook, + else Tensor(self._advanced_parameters.codebook_params.codebook), ) def _set_weight_compression_config( diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 519be93dee4..d8d8c5e879e 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -64,6 +64,9 @@ def is_codebook(self): """ return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + def get_numpy_codebook(self): + return self.codebook_values.as_numpy_tensor().data + def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index fd2cb101e49..d464f894209 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -259,7 +259,6 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - name=const_node_name, name=const_node_name + "_nncf_codebook", ) else: diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index bd1031227ba..d328efb7c57 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -20,6 +20,8 @@ from nncf.parameters import CompressWeightsMode from 
nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns @@ -28,71 +30,6 @@ ReductionAxes = Union[int, tuple[int, ...]] -NF4_QUANTILES = np.array( - [ - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 0.7229568362236023, - 1.0, - ], - dtype=np.float32, -) - -CB4_QUANTILES = np.array( - [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.2812, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ], - dtype=np.float32, -) - -CENTER_OF_NF4_QUANTILES = np.array( - [ - -0.84809643, - -0.6106329, - -0.45999527, - -0.33967942, - -0.2346074, - -0.13791174, - -0.045525018, - 0.03979015, - 0.120255254, - 0.20352125, - 0.29201376, - 0.38931254, - 0.5016634, - 0.6427869, - 0.8614784, - ], - dtype=np.float32, -) - MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 @@ -221,7 +158,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: if config.is_codebook: - max_val = max(np.abs(np.array(config.codebook_values))) + max_val = max(np.abs(config.get_numpy_codebook())) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -231,7 +168,9 @@ def do_float_quantization( else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) elif config.is_codebook: - compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.codebook_values) + compressed_weight, indexes = _calculate_codebook_quantized_weight( + norm_weight, quantiles=config.get_numpy_codebook() + ) return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved @@ -388,7 +327,7 @@ def compress_weight( indexes, scale, None, - fns.from_numpy(np.array(config.codebook_values), backend=compressed_weight.backend), + config.codebook_values, ) else: return CompressedWeight(compressed_weight, scale) From db4399114ea8c696b5824dc9e6fc02990c134dd6 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 17:13:50 +0200 Subject: [PATCH 37/68] Removed circular imports. 
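reshape_weight_for_grouped_quantization() moves from weight_lowering to common.py so that optimized_functions and scale_estimation can import it without the circular dependency. For reference, a small sketch of the relocated helper (shapes follow its docstring; the tensor values are arbitrary):

    import numpy as np

    from nncf.quantization.algorithms.weight_compression.common import (
        reshape_weight_for_grouped_quantization,
    )
    from nncf.tensor import Tensor

    weight = Tensor(np.zeros((8, 256), dtype=np.float32))
    # [c_out, c_in] with group_size=128 -> [c_out, c_in // 128, 128];
    # the axis used for per-group statistics shifts from 1 to 2.
    reshaped, axis = reshape_weight_for_grouped_quantization(weight, reduction_axes=1, group_size=128)
    assert reshaped.shape == (8, 2, 128) and axis == 2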
--- .../openvino/smollm2_360m_codebook/main.py | 2 +- .../openvino/optimized_functions/functions.py | 2 +- .../algorithms/weight_compression/common.py | 38 ++++++++++++++++++- .../weight_compression/scale_estimation.py | 2 +- .../weight_compression/weight_lowering.py | 33 +--------------- .../quantization/test_weights_compression.py | 28 +++++++++----- .../test_compression_functions.py | 2 +- 7 files changed, 61 insertions(+), 46 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index ab6268a8b7f..eb2eaff0a25 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -111,7 +111,7 @@ def custom_codebook_example(model_id, output_dir): def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - output_dir = "smollm2_360m_compressed_codebook_" + output_dir = "smollm2_360m_compressed_codebook" res = default_codebook_example(model_id, output_dir) res += custom_codebook_example(model_id, output_dir + "_custom") diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 282a43f9d2b..bc34e6a023c 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -21,8 +21,8 @@ from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model from nncf.openvino.optimized_functions.models import get_integer_quantization_model from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorBackend from nncf.tensor import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py index 8c1d60fd400..94128b615de 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -10,10 +10,14 @@ # limitations under the License. from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Optional, Union +from nncf.errors import InvalidGroupSizeError +from nncf.errors import UnsupportedModelError from nncf.tensor import Tensor +ReductionAxes = Union[int, tuple[int, ...]] + @dataclass class Codebook: @@ -51,3 +55,35 @@ def is_codebook(self): :return: True if the compressed weight is a codebook, False otherwise. """ return self.codebook is not None and self.tensor is not None and self.scale is not None + + +def reshape_weight_for_grouped_quantization( + weight: Tensor, reduction_axes: ReductionAxes, group_size: int +) -> tuple[Tensor, int]: + """ + Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group + dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is + [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. + + :param weight: Weight array to compress. + :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). 
+ :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). + :return: reshaped weight and new reduction axis. + """ + assert group_size != -1 + if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: + reduction_axes = reduction_axes[0] + if not isinstance(reduction_axes, int): + msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." + raise UnsupportedModelError(msg) + channel_size = weight.shape[reduction_axes] + if channel_size % group_size != 0: + msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." + raise InvalidGroupSizeError(msg) + + num_groups_per_channel = channel_size // group_size + shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis + shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) + reshaped_weight = weight.reshape(shape) + reduction_axes += 1 + return reshaped_weight, reduction_axes diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 020a42c8f16..8b9b460df38 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -21,6 +21,7 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error @@ -28,7 +29,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d328efb7c57..9421adc8ff2 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -19,6 +19,7 @@ from nncf.common.utils.backend import is_openvino_available from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES @@ 
-34,38 +35,6 @@ MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 -def reshape_weight_for_grouped_quantization( - weight: Tensor, reduction_axes: ReductionAxes, group_size: int -) -> tuple[Tensor, int]: - """ - Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group - dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is - [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). - :return: reshaped weight and new reduction axis. - """ - assert group_size != -1 - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - raise nncf.UnsupportedModelError(msg) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." - raise nncf.InvalidGroupSizeError(msg) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - reshaped_weight = weight.reshape(shape) - reduction_axes += 1 - return reshaped_weight, reduction_axes - - def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 ) -> Tensor: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index ee6f1bab7e4..ecc9e92343f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,6 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import CodebookParameters +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -51,7 +52,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.scopes import IgnoredScope from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -1061,13 +1061,17 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): @pytest.mark.parametrize( ("codebook", "dst_type", "n_layers"), ( - ([i 
for i in range(-8, 8)], ov.Type.i4, 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 5), + (np.array([i for i in range(-8, 8)], np.int8), ov.Type.i8, 5), + (np.array([i for i in range(-(2**6), 2**6)], np.int8), ov.Type.i8, 5), + ( + Tensor(np.array([i for i in range(-(2**6), 2**6)])).as_openvino_tensor().astype(TensorDataType.f8e4m3), + ov.Type.f8e4m3, + 5, + ), ), ) @pytest.mark.parametrize("group_size", (1, -1)) -def test_codebook(codebook, dst_type, n_layers, group_size): +def test_codebook(codebook, n_layers, dst_type, group_size): model = SequentialMatmulModel().ov_model compressed_model = compress_weights( model, @@ -1075,10 +1079,16 @@ def test_codebook(codebook, dst_type, n_layers, group_size): ratio=1.0, group_size=group_size, all_layers=True, - advanced_parameters=AdvancedCompressionParameters( - codebook_params=CodebookParameters(codebook=codebook, dst_type=dst_type) - ), + advanced_parameters=AdvancedCompressionParameters(codebook_params=CodebookParameters(codebook=codebook)), ) + names_codebook = [ + op.get_friendly_name() + for op in compressed_model.get_ordered_ops() + if op.get_friendly_name().endswith("nncf_codebook") + ] + + assert len(names_codebook) == n_layers + names_codebook = [ op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == dst_type ] @@ -1121,7 +1131,7 @@ def test_codebook_weighs_range(data): codebook = data max_diff = 0.1 w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) - config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=data) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=Tensor(data)) _, scale, indexes = do_float_quantization(w, config, -1) uncompressed_data = codebook[indexes.data] * scale.data diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 67a9fcef14d..41148cbc2c2 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -28,6 +28,7 @@ from nncf.common.utils.caching import cache_results from nncf.openvino.cpu_info import is_arm_cpu from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization @@ -35,7 +36,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend From 7c9429ea9cb18f6ae913e416f3e9e964c834298f Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 17:28:19 +0200 Subject: [PATCH 38/68] Added file with constants. 
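The NF4 quantile grid, the default CB4 codebook (stored as an f8e4m3 OpenVINO-backed tensor), and the NF4 bucket centers are collected into a dedicated constants.py module under weight_compression, so other modules can use the tables without pulling in weight_lowering.py.

For orientation, a rough NumPy-only sketch of how such a quantile/codebook table is applied; this is not the NNCF implementation, and the grid below abbreviates the NF4 table:

    import numpy as np

    # norm_weight is assumed to be already scaled into the grid's range.
    # Each value is snapped to the nearest grid entry; depending on the mode,
    # either that value or its index in the table is what gets stored.
    quantiles = np.array(
        [-1.0, -0.6961928, -0.52507305, -0.39491749, 0.0, 0.44070983, 0.7229568, 1.0],
        dtype=np.float32,
    )
    norm_weight = np.array([[-0.8, 0.1], [0.6, -0.3]], dtype=np.float32)
    indexes = np.argmin(np.abs(norm_weight[..., None] - quantiles), axis=-1)
    quantized = quantiles[indexes]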
--- nncf/quantization/advanced_parameters.py | 8 +- .../weight_compression/constants.py | 86 +++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 nncf/quantization/algorithms/weight_compression/constants.py diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index d3a9446ea28..fcf04bf01a5 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -29,6 +29,8 @@ from nncf.quantization.range_estimator import RangeEstimatorParameters from nncf.quantization.range_estimator import StatisticsType +TTensor = Any + @api(canonical_alias="nncf.OverflowFix") class OverflowFix(StrEnum): @@ -366,12 +368,12 @@ class CodebookParameters: Contains parameters for codebook compression algorithm. :param codebook: The codebook (LUT) for the weight compression. - Applicable for vector quantization. - :type codebook: list[Any] + Applicable for vector quantization. Must be a numpy array, ov Tensor, or torch Tensor. + :type codebook: TTensor :param dts_type: The type of the codebook. """ - codebook: Optional[list[Any]] = None + codebook: Optional[TTensor] = None @api() diff --git a/nncf/quantization/algorithms/weight_compression/constants.py b/nncf/quantization/algorithms/weight_compression/constants.py new file mode 100644 index 00000000000..fcbe91bfb53 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/constants.py @@ -0,0 +1,86 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType + +NF4_QUANTILES = np.array( + [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ], + dtype=np.float32, +) + +CB4_QUANTILES = ( + Tensor( + np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, + ) + ) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3) +) + +CENTER_OF_NF4_QUANTILES = np.array( + [ + -0.84809643, + -0.6106329, + -0.45999527, + -0.33967942, + -0.2346074, + -0.13791174, + -0.045525018, + 0.03979015, + 0.120255254, + 0.20352125, + 0.29201376, + 0.38931254, + 0.5016634, + 0.6427869, + 0.8614784, + ], + dtype=np.float32, +) From b90ccf378119dca2d88127ee161667f70ad2c4ee Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 17:45:25 +0200 Subject: [PATCH 39/68] Moved default codebook initialization to function. 
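The default CB4 codebook is no longer built at module import time as an OpenVINO-backed f8e4m3 tensor; constants.py now exposes get_cb4_quantiles(), which constructs that tensor on demand, presumably so that importing the constants module does not immediately require the OpenVINO tensor backend. The call site in _get_primary_config switches from the CB4_QUANTILES constant to the function accordingly.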
--- .../weight_compression/algorithm.py | 4 +- .../weight_compression/constants.py | 53 ++++++++++--------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 339c2612851..376894fde2e 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -39,7 +39,7 @@ from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import get_cb4_quantiles from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -441,7 +441,7 @@ def _get_primary_config(self): return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - codebook_values=CB4_QUANTILES + codebook_values=get_cb4_quantiles() if self._mode == CompressWeightsMode.CB4_F8E4M3 else Tensor(self._advanced_parameters.codebook_params.codebook), ) diff --git a/nncf/quantization/algorithms/weight_compression/constants.py b/nncf/quantization/algorithms/weight_compression/constants.py index fcbe91bfb53..4465549a706 100644 --- a/nncf/quantization/algorithms/weight_compression/constants.py +++ b/nncf/quantization/algorithms/weight_compression/constants.py @@ -36,33 +36,34 @@ dtype=np.float32, ) -CB4_QUANTILES = ( - Tensor( - np.array( - [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.2812, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ], - dtype=np.float32, - ) + +def get_cb4_quantiles() -> Tensor: + """ + Returns the quantiles for the CB4 codebook. + """ + CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, ) - .as_openvino_tensor() - .astype(TensorDataType.f8e4m3) -) + return Tensor(CB4_QUANTILES).as_openvino_tensor().astype(TensorDataType.f8e4m3) + CENTER_OF_NF4_QUANTILES = np.array( [ From 8a06f88f002dd811c72cd9dcc3f6efe1f5fccd4e Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 25 Jun 2025 10:24:54 +0200 Subject: [PATCH 40/68] Added test for comparison of compressed weight values for CB4_F8E4M3 type. --- nncf/openvino/graph/node_utils.py | 4 +- .../weight_compression/openvino_backend.py | 2 +- ...erModel_compressed_weights_cb4_f8e4m3.json | 178 ++++++++++++++++++ .../quantization/test_weights_compression.py | 32 ++++ 4 files changed, 213 insertions(+), 3 deletions(-) create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index edccbcfff0e..7622eaf2c52 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,7 +699,7 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. 
""" - codebook_const = opset.constant(codebook.data) + codebook_const = opset.constant(codebook.data, name=name) if codebook.dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) @@ -707,5 +707,5 @@ def create_ov_codebook_subgraph( if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) - const = opset.gather(codebook_const, codebook_indexes, 0, name=name) + const = opset.gather(codebook_const, codebook_indexes, 0, name=name + "_nncf_codebook") return const diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index d464f894209..96e8ffe4f62 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -259,7 +259,7 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - name=const_node_name + "_nncf_codebook", + name=const_node_name, ) else: compressed_const = create_ov_const_from_tensor( diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json new file mode 100644 index 00000000000..b8712bf3839 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json @@ -0,0 +1,178 @@ +{ + "matmul_2_data": { + "scale": [ + [ + [ + 0.2275390625 + ] + ], + [ + [ + 0.269287109375 + ] + ], + [ + [ + 0.272705078125 + ] + ], + [ + [ + 0.284423828125 + ] + ], + [ + [ + 0.266357421875 + ] + ], + [ + [ + 0.2802734375 + ] + ] + ] + }, + "matmul_1_data": { + "compressed_weight": [ + [ + 119, + 168, + 11, + 49, + 255, + 255 + ], + [ + 255, + 159, + 255, + 255, + 255, + 255 + ], + [ + 255, + 169, + 59, + 255, + 228, + 135 + ], + [ + 202, + 255, + 255, + 149, + 238, + 134 + ], + [ + 229, + 130, + 151, + 255, + 87, + 240 + ], + [ + 26, + 255, + 245, + 75, + 255, + 18 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0025196075439453125 + ], + [ + 0.0024051666259765625 + ], + [ + 0.002300262451171875 + ], + [ + 0.0024013519287109375 + ], + [ + 0.0025997161865234375 + ], + [ + 0.003208160400390625 + ] + ] + }, + "gather_2_data": { + "compressed_weight": [ + [ + 181, + 77, + 12, + 5, + 231, + 255 + ], + [ + 166, + 200, + 149, + 255, + 223, + 1 + ], + [ + 255, + 10, + 224, + 54, + 255, + 166 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0035152435302734375 + ], + [ + 0.0036563873291015625 + ], + [ + 0.003253936767578125 + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index ecc9e92343f..5b9aaf6a525 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -231,6 +231,37 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): } +def check_cb4_f8e4m3_grouped(op: ov.Node, group_size: int = 7): + assert op.get_element_type() == ov.Type.f8e4m3 + + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + + gather_node = get_next_node(convert_node) + assert 
gather_node.get_type_name() == "Gather" + + weight_shape = gather_node.shape + # NOTE: get_const_value_as_numpy_tensor doesn't work for 4-bit types + assert list(weight_shape)[-1] == group_size + reduced_weight_shape = list(weight_shape) + reduced_weight_shape[-1] = 1 + + mul_node = get_next_node(gather_node) + assert mul_node.get_type_name() == "Multiply" + scale_node = mul_node.input_value(1).get_node() + assert list(scale_node.shape) == reduced_weight_shape + + reshape_node = get_next_node(mul_node) + assert reshape_node.get_type_name() == "Reshape" + + convert_node = get_next_node(reshape_node) + assert convert_node.get_type_name() == "Convert" + + return { + "scale": get_const_value_as_numpy_tensor(scale_node), + } + + def check_int4_sym_grouped(op: ov.Node): return check_int4_grouped(op, mode=CompressWeightsMode.INT4_SYM) @@ -258,6 +289,7 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): (CompressWeightsMode.INT4_SYM, 7, get_mixed_mapping(check_int4_sym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.INT4_ASYM, 7, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.NF4, 7, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_cb4_f8e4m3_grouped, TEST_MODELS[IntegerModel])), ), ) def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): From d6e4a76640940562ef8b23b9e71f04a358d78df9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 25 Jun 2025 11:24:11 +0200 Subject: [PATCH 41/68] Fixed test. --- .../openvino/native/quantization/test_weights_compression.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5b9aaf6a525..bc80ac2d3c6 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1082,9 +1082,7 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): sensitivity_metric=mode, ) names_codebook = { - op.get_friendly_name() - for op in compressed_model.get_ordered_ops() - if op.get_element_type() == ov.Type.f8e4m3 and op.get_friendly_name().startswith("Const") + op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.f8e4m3 } assert ref_ids == len(names_codebook) From b231848cc36902e49aa73ce1a1fa3789eb7ec003 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 25 Jun 2025 11:27:39 +0200 Subject: [PATCH 42/68] Fixed fp8 value. --- nncf/quantization/algorithms/weight_compression/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/constants.py b/nncf/quantization/algorithms/weight_compression/constants.py index 4465549a706..726ba841e00 100644 --- a/nncf/quantization/algorithms/weight_compression/constants.py +++ b/nncf/quantization/algorithms/weight_compression/constants.py @@ -51,7 +51,7 @@ def get_cb4_quantiles() -> Tensor: -0.625, -0.3125, 0.0, - 0.2812, + 0.28125, 0.5625, 0.875, 1.125, From de7b709d32173a8eb6ccbf5784dd150ba157a330 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 10:46:09 +0200 Subject: [PATCH 43/68] Test for codebook graph. 
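The new JSON files hold reference scales and codebook indexes for IntegerModel compressed with four codebook/index dtype pairs: f16 codebook with u4 indexes, f8e4m3 with u8, i8 with u8, and u8 with u4. In each file, "matmul_2_data_nncf_codebook_idxs" stores the gathered indexes and "matmul_2_data" the per-group scales; they serve as references for the dtype-parametrized codebook test introduced in the next commit.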
--- .../IntegerModel_codebook_f16_u4.json | 61 ++++++++++ .../IntegerModel_codebook_f8e4m3_u8.json | 106 ++++++++++++++++++ .../IntegerModel_codebook_i8_u8.json | 106 ++++++++++++++++++ .../IntegerModel_codebook_u8_u4.json | 61 ++++++++++ 4 files changed, 334 insertions(+) create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json new file mode 100644 index 00000000000..578b2cc53d3 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json @@ -0,0 +1,61 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + 171, + 253, + 154, + 172, + 217, + 235, + 250, + 155, + 253, + 252, + 188, + 253, + 207, + 206, + 253, + 236, + 254, + 233, + 255, + 248, + 255 + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.99560546875 + ] + ], + [ + [ + 1.177734375 + ] + ], + [ + [ + 1.193359375 + ] + ], + [ + [ + 1.244140625 + ] + ], + [ + [ + 1.1650390625 + ] + ], + [ + [ + 1.2265625 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json new file mode 100644 index 00000000000..abf99c05ca4 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json @@ -0,0 +1,106 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + [ + [ + 14, + 12, + 16, + 20, + 13, + 11, + 15 + ] + ], + [ + [ + 12, + 11, + 16, + 13, + 17, + 12, + 20 + ] + ], + [ + [ + 14, + 11, + 17, + 20, + 15, + 20, + 15 + ] + ], + [ + [ + 14, + 16, + 20, + 19, + 15, + 18, + 15 + ] + ], + [ + [ + 16, + 18, + 14, + 18, + 18, + 20, + 11 + ] + ], + [ + [ + 17, + 19, + 20, + 10, + 19, + 20, + 20 + ] + ] + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.2275390625 + ] + ], + [ + [ + 0.269287109375 + ] + ], + [ + [ + 0.272705078125 + ] + ], + [ + [ + 0.284423828125 + ] + ], + [ + [ + 0.266357421875 + ] + ], + [ + [ + 0.2802734375 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json new file mode 100644 index 00000000000..acf5ad93048 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json @@ -0,0 +1,106 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + [ + [ + 14, + 12, + 16, + 20, + 13, + 11, + 15 + ] + ], + [ + [ + 12, + 11, + 16, + 13, + 17, + 12, + 20 + ] + ], + [ + [ + 14, + 11, + 17, + 20, + 15, + 20, + 15 + ] + ], + [ + [ + 14, + 16, + 20, + 20, + 15, + 18, + 15 + ] + ], + [ + [ + 16, + 18, + 14, + 18, + 18, + 20, + 11 + ] + ], + [ + [ + 17, + 19, + 20, + 10, + 19, + 20, + 20 + ] + ] + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.07965087890625 + ] + ], + [ + [ + 0.09423828125 + ] + ], + [ + [ + 0.095458984375 + ] + ], + [ + [ + 0.0994873046875 + ] + ], + [ + [ + 0.09320068359375 + ] + 
], + [ + [ + 0.09814453125 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json new file mode 100644 index 00000000000..8642e52a868 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json @@ -0,0 +1,61 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + 54, + 248, + 20, + 56, + 145, + 181, + 243, + 38, + 250, + 247, + 104, + 249, + 126, + 123, + 217, + 199, + 251, + 178, + 254, + 208, + 255 + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.0531005859375 + ] + ], + [ + [ + 0.06280517578125 + ] + ], + [ + [ + 0.06365966796875 + ] + ], + [ + [ + 0.06634521484375 + ] + ], + [ + [ + 0.0621337890625 + ] + ], + [ + [ + 0.0654296875 + ] + ] + ] + } +} \ No newline at end of file From 4737adea5556823425fc9a008463532a21413e95 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 11:47:28 +0200 Subject: [PATCH 44/68] Changed name of file for more appropriate. --- nncf/openvino/graph/node_utils.py | 2 +- .../{common.py => group_quantization.py} | 41 +--------- .../weight_compression/parameters.py | 53 +++++++++++++ .../weight_compression/weight_lowering.py | 4 +- .../quantization/test_weights_compression.py | 74 ++++++++++++++++++- .../test_compression_functions.py | 2 +- 6 files changed, 129 insertions(+), 47 deletions(-) rename nncf/quantization/algorithms/weight_compression/{common.py => group_quantization.py} (64%) create mode 100644 nncf/quantization/algorithms/weight_compression/parameters.py diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 7622eaf2c52..3892876e51d 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -703,7 +703,7 @@ def create_ov_codebook_subgraph( if codebook.dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) - codebook_indexes = opset.constant(indexes.data, dtype=dtype) + codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=name + "_nncf_codebook_idxs") if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/group_quantization.py similarity index 64% rename from nncf/quantization/algorithms/weight_compression/common.py rename to nncf/quantization/algorithms/weight_compression/group_quantization.py index 94128b615de..215dd217ea3 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/group_quantization.py @@ -9,8 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Union from nncf.errors import InvalidGroupSizeError from nncf.errors import UnsupportedModelError @@ -19,44 +18,6 @@ ReductionAxes = Union[int, tuple[int, ...]] -@dataclass -class Codebook: - """ - Codebook parameters for weight compression. - :param codebook: The initial codebook for compression. - :param dst_type: The destination type for the codebook. - """ - - codebook: Optional[Tensor] = None - dst_type: Optional[Any] = None - - -@dataclass -class CompressedWeight: - """ - Compressed weight and decompression parameters. 
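This commit splits the former common.py: the group-wise reshaping helper moves to group_quantization.py, while the Codebook and CompressedWeight dataclasses move to parameters.py. It also names the codebook index constant "<weight name>_nncf_codebook_idxs" in the OpenVINO subgraph and adds a test that compresses IntegerModel with custom codebooks of several dtypes against the reference JSON files from the previous commit.

A hedged usage sketch of the public API path the new test exercises; the model path and variable names below are illustrative, and the codebook here is an arbitrary f16 grid rather than an NNCF default:

    import numpy as np
    import openvino as ov
    import nncf

    ov_model = ov.Core().read_model("model.xml")  # illustrative path, loaded elsewhere in practice
    codebook = np.array([0.1 * i for i in range(-8, 8)], dtype=np.float16)
    compressed = nncf.compress_weights(
        ov_model,
        mode=nncf.CompressWeightsMode.CODEBOOK,
        group_size=-1,  # per-channel scales; a positive value must divide the channel size
        advanced_parameters=nncf.AdvancedCompressionParameters(
            codebook_params=nncf.CodebookParameters(codebook)
        ),
    )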
- - :param tensor: The tensor with compressed weight. - :param scale: The decompression scale, in practice it is dequantization scale for the quantization. - :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 - in the non-compression realm. Applicable for INT quantization. - :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization - """ - - tensor: Optional[Tensor] = None - scale: Optional[Tensor] = None - zero_point: Optional[Tensor] = None - codebook: Optional[Codebook] = None - - def is_codebook(self): - """ - Check if the compressed weight is a codebook. - - :return: True if the compressed weight is a codebook, False otherwise. - """ - return self.codebook is not None and self.tensor is not None and self.scale is not None - - def reshape_weight_for_grouped_quantization( weight: Tensor, reduction_axes: ReductionAxes, group_size: int ) -> tuple[Tensor, int]: diff --git a/nncf/quantization/algorithms/weight_compression/parameters.py b/nncf/quantization/algorithms/weight_compression/parameters.py new file mode 100644 index 00000000000..8c1d60fd400 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/parameters.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Optional + +from nncf.tensor import Tensor + + +@dataclass +class Codebook: + """ + Codebook parameters for weight compression. + :param codebook: The initial codebook for compression. + :param dst_type: The destination type for the codebook. + """ + + codebook: Optional[Tensor] = None + dst_type: Optional[Any] = None + + +@dataclass +class CompressedWeight: + """ + Compressed weight and decompression parameters. + + :param tensor: The tensor with compressed weight. + :param scale: The decompression scale, in practice it is dequantization scale for the quantization. + :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 + in the non-compression realm. Applicable for INT quantization. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization + """ + + tensor: Optional[Tensor] = None + scale: Optional[Tensor] = None + zero_point: Optional[Tensor] = None + codebook: Optional[Codebook] = None + + def is_codebook(self): + """ + Check if the compressed weight is a codebook. + + :return: True if the compressed weight is a codebook, False otherwise. 
+ """ + return self.codebook is not None and self.tensor is not None and self.scale is not None diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9421adc8ff2..8a5370b7716 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -18,11 +18,11 @@ from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index bc80ac2d3c6..7b92e71964f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -231,8 +231,8 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): } -def check_cb4_f8e4m3_grouped(op: ov.Node, group_size: int = 7): - assert op.get_element_type() == ov.Type.f8e4m3 +def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m3): + assert op.get_element_type() == dtype convert_node = get_next_node(op) assert convert_node.get_type_name() == "Convert" @@ -262,6 +262,23 @@ def check_cb4_f8e4m3_grouped(op: ov.Node, group_size: int = 7): } +def check_codebook_indexes(op: ov.Node, dtype=ov.Type.u4): + assert op.get_element_type() == dtype + + if dtype == ov.Type.u4: + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + else: + convert_node = op + + gather_node = get_next_node(convert_node) + assert gather_node.get_type_name() == "Gather" + + return { + "indexes": get_const_value_as_numpy_tensor(op), + } + + def check_int4_sym_grouped(op: ov.Node): return check_int4_grouped(op, mode=CompressWeightsMode.INT4_SYM) @@ -289,7 +306,7 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): (CompressWeightsMode.INT4_SYM, 7, get_mixed_mapping(check_int4_sym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.INT4_ASYM, 7, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.NF4, 7, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])), - (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_cb4_f8e4m3_grouped, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_codebook_grouped, TEST_MODELS[IntegerModel])), ), ) def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): @@ -313,6 +330,57 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): 
compare_stats(ref_stats, actual_stats) +@pytest.mark.parametrize( + "codebook, codebook_dtype, index_dtype, name", + [ + (np.array([i for i in range(16)], np.uint8), ov.Type.u8, ov.Type.u4, "u8_u4"), + (np.array([0.1 * i for i in range(-8, 8)], np.float16), ov.Type.f16, ov.Type.u4, "f16_u4"), + ( + Tensor(np.array([0.35 * i for i in range(-10, 11)], np.float16)) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3), + ov.Type.f8e4m3, + ov.Type.u8, + "f8e4m3_u8", + ), + ( + Tensor(np.array([i for i in range(-10, 11)], np.int8)).as_openvino_tensor().astype(TensorDataType.int8), + ov.Type.i8, + ov.Type.u8, + "i8_u8", + ), + ], +) +def test_compression_with_сodebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): + model = IntegerModel().ov_model + codebook_params = nncf.CodebookParameters(codebook) + + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + group_size=7, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + ) + actual_stats = {} + for op in compressed_model.get_ops(): + op_name = op.get_friendly_name() + if op.get_type_name() == "Constant": + if op_name == "matmul_2_data": + actual_stats[op_name] = check_codebook_grouped(op, group_size=7, dtype=codebook_dtype) + elif op_name == "matmul_2_data_nncf_codebook_idxs": + actual_stats[op_name] = check_codebook_indexes(op, dtype=index_dtype) + + ref_stats_path = get_actual_reference_for_current_openvino( + REFERENCE_SCALES_DIR / f"IntegerModel_codebook_{name}.json" + ) + + if os.getenv("NNCF_TEST_REGEN_DOT") is not None: + dump_to_json(ref_stats_path, actual_stats) + + ref_stats = load_json(ref_stats_path) + compare_stats(ref_stats, actual_stats) + + @pytest.mark.parametrize("metric", DATA_BASED_SENSITIVITY_METRICS) def test_gather_in_4_bit_if_all_layers_with_data(metric): dim1 = 2 # sequence length dimension diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 41148cbc2c2..4df1befd37b 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -28,8 +28,8 @@ from nncf.common.utils.caching import cache_results from nncf.openvino.cpu_info import is_arm_cpu from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization From 072a62a3f03cd845594db59fe6effff342e63802 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 11:48:14 +0200 Subject: [PATCH 45/68] Changed name of file for more appropriate. 
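No functional change here: the commit only rewires imports after the split, so CompressedWeight (and Codebook) are taken from parameters.py and reshape_weight_for_grouped_quantization from group_quantization.py across the OpenVINO optimized functions, the backend implementations, GPTQ, scale estimation, and the tests.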
--- nncf/openvino/optimized_functions/functions.py | 2 +- nncf/quantization/algorithms/weight_compression/backend.py | 2 +- nncf/quantization/algorithms/weight_compression/gptq.py | 2 +- .../algorithms/weight_compression/onnx_backend.py | 2 +- .../algorithms/weight_compression/openvino_backend.py | 2 +- .../algorithms/weight_compression/scale_estimation.py | 4 ++-- .../algorithms/weight_compression/torch_backend.py | 2 +- .../algorithms/weight_compression/torch_fx_backend.py | 2 +- .../openvino/native/quantization/test_weights_compression.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index bc34e6a023c..217b4444c8c 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -21,7 +21,7 @@ from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model from nncf.openvino.optimized_functions.models import get_integer_quantization_model from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorBackend diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index cee8763995b..49b4ce487e9 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -26,7 +26,7 @@ from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic from nncf.parameters import CompressionFormat from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.tensor import Tensor diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 767fe2f5127..125e0c84a6f 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -23,7 +23,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py 
b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index c0a2ab73849..c3eeb7e49d6 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -47,7 +47,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 96e8ffe4f62..e5d5490b8f5 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -54,7 +54,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 8b9b460df38..85d358d23b2 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,8 +20,8 @@ from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index f4254bfb0c5..dd5c4aec80b 100644 --- 
a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -45,7 +45,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 2172a6a5e37..b76179d1dd6 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -40,7 +40,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 7b92e71964f..b48b0ce15cf 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,7 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import CodebookParameters -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA From ede93422aa2d0aee22e3489eb30fba8b1036c257 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 12:11:09 +0200 Subject: [PATCH 46/68] Return reshape_weight_for_grouped_quantization to weight_lowering. 
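This partially reverts the earlier split: group_quantization.py is deleted and reshape_weight_for_grouped_quantization lives in weight_lowering.py again, with the OpenVINO optimized functions importing it from there, while the CompressedWeight and Codebook dataclasses remain in parameters.py.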
--- .../openvino/optimized_functions/functions.py | 2 +- .../algorithms/weight_compression/backend.py | 2 +- .../algorithms/weight_compression/gptq.py | 2 +- .../weight_compression/group_quantization.py | 50 ------------------- .../weight_compression/onnx_backend.py | 2 +- .../weight_compression/openvino_backend.py | 2 +- .../weight_compression/scale_estimation.py | 4 +- .../weight_compression/torch_backend.py | 2 +- .../weight_compression/torch_fx_backend.py | 2 +- .../weight_compression/weight_lowering.py | 36 ++++++++++++- .../quantization/test_weights_compression.py | 2 +- .../test_compression_functions.py | 2 +- 12 files changed, 45 insertions(+), 63 deletions(-) delete mode 100644 nncf/quantization/algorithms/weight_compression/group_quantization.py diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 217b4444c8c..282a43f9d2b 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -21,8 +21,8 @@ from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model from nncf.openvino.optimized_functions.models import get_integer_quantization_model from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorBackend from nncf.tensor import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 49b4ce487e9..92c6cb80a5d 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -26,9 +26,9 @@ from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic from nncf.parameters import CompressionFormat from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.tensor import Tensor from nncf.tensor import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 125e0c84a6f..814ec4a2a6b 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -23,9 +23,9 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from 
nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params diff --git a/nncf/quantization/algorithms/weight_compression/group_quantization.py b/nncf/quantization/algorithms/weight_compression/group_quantization.py deleted file mode 100644 index 215dd217ea3..00000000000 --- a/nncf/quantization/algorithms/weight_compression/group_quantization.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union - -from nncf.errors import InvalidGroupSizeError -from nncf.errors import UnsupportedModelError -from nncf.tensor import Tensor - -ReductionAxes = Union[int, tuple[int, ...]] - - -def reshape_weight_for_grouped_quantization( - weight: Tensor, reduction_axes: ReductionAxes, group_size: int -) -> tuple[Tensor, int]: - """ - Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group - dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is - [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). - :return: reshaped weight and new reduction axis. - """ - assert group_size != -1 - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - raise UnsupportedModelError(msg) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." 
- raise InvalidGroupSizeError(msg) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - reshaped_weight = weight.reshape(shape) - reduction_axes += 1 - return reshaped_weight, reduction_axes diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index c3eeb7e49d6..07347cd3abe 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -47,9 +47,9 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index e5d5490b8f5..2d158daf5e3 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -54,11 +54,11 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 85d358d23b2..d7c63c3d1e8 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,15 +20,15 @@ from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from 
nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index dd5c4aec80b..e8efeb302e3 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -45,10 +45,10 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index b76179d1dd6..d00b0ae5b4c 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -40,10 +40,10 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from 
nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.torch_backend import PTAWQAlgoAlgoBackend from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8a5370b7716..8b00c98a755 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -17,11 +17,12 @@ from nncf.common.logging.logger import nncf_logger from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available +from nncf.errors import InvalidGroupSizeError +from nncf.errors import UnsupportedModelError from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor @@ -31,10 +32,41 @@ ReductionAxes = Union[int, tuple[int, ...]] - MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 +def reshape_weight_for_grouped_quantization( + weight: Tensor, reduction_axes: ReductionAxes, group_size: int +) -> tuple[Tensor, int]: + """ + Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group + dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is + [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. + + :param weight: Weight array to compress. + :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). + :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). + :return: reshaped weight and new reduction axis. + """ + assert group_size != -1 + if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: + reduction_axes = reduction_axes[0] + if not isinstance(reduction_axes, int): + msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." + raise UnsupportedModelError(msg) + channel_size = weight.shape[reduction_axes] + if channel_size % group_size != 0: + msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." 
+ raise InvalidGroupSizeError(msg) + + num_groups_per_channel = channel_size // group_size + shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis + shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) + reshaped_weight = weight.reshape(shape) + reduction_axes += 1 + return reshaped_weight, reduction_axes + + def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 ) -> Tensor: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b48b0ce15cf..182a267876b 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,7 +41,6 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import CodebookParameters -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -52,6 +51,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.scopes import IgnoredScope from nncf.tensor import Tensor from nncf.tensor import TensorDataType diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 4df1befd37b..67a9fcef14d 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -29,13 +29,13 @@ from nncf.openvino.cpu_info import is_arm_cpu from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor 
import TensorDataType from nncf.tensor.definitions import TensorBackend From 44712901f3a2a7cf475a6046c1b9598c7025515b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 15:05:56 +0200 Subject: [PATCH 47/68] Changed no ascii chracter. --- tests/openvino/native/quantization/test_weights_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 182a267876b..85d14d70a89 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -351,7 +351,7 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): ), ], ) -def test_compression_with_сodebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): +def test_compression_with_codebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): model = IntegerModel().ov_model codebook_params = nncf.CodebookParameters(codebook) From 3f9f833d7be464e6d52e61b074808ba9b4a800ec Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 15:56:15 +0200 Subject: [PATCH 48/68] Removed extra convert from fp16 to fp16. --- nncf/openvino/graph/node_utils.py | 3 ++- .../native/quantization/test_weights_compression.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 3892876e51d..5faec5e904e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -44,6 +44,7 @@ from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype from nncf.tensor import Tensor from nncf.tensor import TensorBackend +from nncf.tensor import TensorDataType InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node] @@ -700,7 +701,7 @@ def create_ov_codebook_subgraph( :return: OpenVINO subgraph. """ codebook_const = opset.constant(codebook.data, name=name) - if codebook.dtype != ov.Type.f16: + if codebook.dtype != TensorDataType.float16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=name + "_nncf_codebook_idxs") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 85d14d70a89..b3bb42f056a 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -234,8 +234,11 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m3): assert op.get_element_type() == dtype - convert_node = get_next_node(op) - assert convert_node.get_type_name() == "Convert" + if dtype == ov.Type.f16: + convert_node = op + else: + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" gather_node = get_next_node(convert_node) assert gather_node.get_type_name() == "Gather" From 67faaa7c77b878143b4abcac029321162f89eb74 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 30 Jun 2025 10:06:05 +0200 Subject: [PATCH 49/68] Added test and exception which checks what codebook is sorted, not empty 1d array. 
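In other words, a user-provided codebook is rejected unless it is a non-empty, strictly increasing 1D array with at least two unique elements. A small sketch of inputs that pass and fail the new check (the failing cases mirror the parametrized test added below, for which nncf.ValidationError is raised):

    import numpy as np

    valid = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)       # 1D, sorted, unique
    assert valid.ndim == 1 and valid.size >= 2 and (valid[:-1] < valid[1:]).all()

    invalid = [
        np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32),                 # duplicate values
        np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32),                 # unsorted
        np.array([[-1, 0, 1, 2, 3], [-1, 0, 1, 2, 3]], dtype=np.float32), # not a 1D array
        np.array([5], dtype=np.float32),                                  # fewer than two elements
    ]
    for codebook in invalid:
        assert not (codebook.ndim == 1 and codebook.size >= 2 and (codebook[:-1] < codebook[1:]).all())
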
--- .../weight_compression/algorithm.py | 16 +++++++++++++ .../quantization/test_weights_compression.py | 23 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 376894fde2e..2c85a3eba34 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -181,6 +181,22 @@ def check_user_compression_configuration( ] ) ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank] + + if advanced_parameters.codebook_params.codebook is not None: + codebook = Tensor(advanced_parameters.codebook_params.codebook).as_numpy_tensor().data + msg = None + if codebook.ndim != 1: + msg = "The codebook must be a 1D array, but a multi-dimensional array is given." + if codebook.size < 2: + msg = ( + "The codebook must contain at least two unique elements," + "but a single-element or empty array is given." + ) + if (codebook[:-1] >= codebook[1:]).any(): + msg = "The codebook must be a sorted 1D array with unique elements, but an unsorted array is given." + if msg: + raise nncf.ValidationError(msg) + for size in values_to_check: if size <= 0: msg = f"The subset_size value should be positive, but subset_size={size} is given." diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b3bb42f056a..5da9944a03e 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1725,6 +1725,29 @@ def test_nf4_quantization_mid_quant(weight, scale): np.testing.assert_allclose(nf4_quant.data, ref_nf4_quant.data, atol=0, rtol=0) +@pytest.mark.parametrize( + "codebook_values", + [ + np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32), + np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32), + np.array([[-1, 0, 1, 2, 3], [-1, 0, 1, 2, 3]], dtype=np.float32), + np.array([5], dtype=np.float32), + ], +) +def test_codebook_is_correct_array(codebook_values): + codebook_params = nncf.CodebookParameters(codebook_values) + model = SequentialMatmulModel().ov_model + + # The codebook should be a non empty 1D numpy array and sorted + with pytest.raises(nncf.ValidationError): + compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + group_size=-1, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + ) + + class TestOVTemplateWeightCompression(TemplateWeightCompression): @staticmethod def get_matmul_model() -> ov.Model: From e5322e378057b3a80b03bcbeb3a148b5f3d99453 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 30 Jun 2025 10:33:10 +0200 Subject: [PATCH 50/68] Fixed fp8 values in test. 
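The likely motivation (stated here as an assumption, not part of the original commit message): float8 e4m3 has only a 3-bit mantissa, so consecutive integers in [-64, 63] do not all survive the cast; neighboring values collapse onto the same representable number and the resulting codebook is no longer strictly sorted, which the validation introduced in the previous commit now rejects. Signed powers of two, in contrast, round-trip exactly. A small sketch using ml_dtypes purely to demonstrate the rounding effect:

    import numpy as np
    from ml_dtypes import float8_e4m3fn

    ints = np.arange(-(2**6), 2**6, dtype=np.float32)                     # the old test values
    roundtrip = ints.astype(float8_e4m3fn).astype(np.float32)
    assert not np.array_equal(roundtrip, ints)                            # e.g. 63 is not representable

    pow2 = np.array([np.sign(i) * 2.0 ** abs(i) for i in range(-6, 6)], dtype=np.float32)  # the new values
    assert np.array_equal(pow2.astype(float8_e4m3fn).astype(np.float32), pow2)             # exact round-trip
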
--- .../native/quantization/test_weights_compression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5da9944a03e..33ba2681b07 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -354,7 +354,7 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): ), ], ) -def test_compression_with_codebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): +def test_codebook_compression_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): model = IntegerModel().ov_model codebook_params = nncf.CodebookParameters(codebook) @@ -1165,7 +1165,9 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): (np.array([i for i in range(-8, 8)], np.int8), ov.Type.i8, 5), (np.array([i for i in range(-(2**6), 2**6)], np.int8), ov.Type.i8, 5), ( - Tensor(np.array([i for i in range(-(2**6), 2**6)])).as_openvino_tensor().astype(TensorDataType.f8e4m3), + Tensor(np.array([np.sign(i) * 2 ** np.abs(i) for i in range(-6, 6)])) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3), ov.Type.f8e4m3, 5, ), From 8f18fb8b4aa0d4927e817fc9378b5a7148c0ec7f Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 30 Jun 2025 20:42:10 +0200 Subject: [PATCH 51/68] Applied suggestions. --- nncf/openvino/optimized_functions/functions.py | 2 +- nncf/quantization/advanced_parameters.py | 1 - .../algorithms/weight_compression/config.py | 5 +++-- .../weight_compression/openvino_backend.py | 2 -- .../algorithms/weight_compression/weight_lowering.py | 12 ++++-------- nncf/quantization/quantize_model.py | 4 +--- 6 files changed, 9 insertions(+), 17 deletions(-) diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 282a43f9d2b..e22ea481abd 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -105,7 +105,7 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, -) -> tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index fcf04bf01a5..78a4cfae2d6 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -370,7 +370,6 @@ class CodebookParameters: :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization. Must be a numpy array, ov Tensor, or torch Tensor. :type codebook: TTensor - :param dts_type: The type of the codebook. """ codebook: Optional[TTensor] = None diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index d8d8c5e879e..b686c84c669 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -10,7 +10,7 @@ # limitations under the License. 
from dataclasses import dataclass from dataclasses import field -from typing import Any, Optional, TypeVar +from typing import Optional, TypeVar import numpy as np @@ -18,6 +18,7 @@ from nncf.parameters import CompressWeightsMode TWeightType = TypeVar("TWeightType") +TTensor = TypeVar("TTensor") @dataclass @@ -32,7 +33,7 @@ class WeightCompressionConfig: mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 - codebook_values: Optional[Any] = None + codebook_values: Optional[TTensor] = None @property def num_bits(self): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 2d158daf5e3..5d5656db350 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -220,7 +220,6 @@ def _create_compression_subgraph( const_dtype, should_add_convert_node: bool, compressed_weight: Optional[CompressedWeight] = None, - advanced_parameters: Optional[AdvancedCompressionParameters] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -333,7 +332,6 @@ def transform_model( const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, compressed_weight=compressed_weight, - advanced_parameters=advanced_parameters, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8b00c98a755..2a89c9acc9f 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -68,7 +68,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ Calculates the scale for nf4 or e2m1 quantization. @@ -76,7 +76,6 @@ def calculate_float_quantization_params( :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). :param config: Weight compression configuration. - :param max_val: Maximal value of e2m1 type. :return: Scale tensor of float32 type for float quantization. """ assert not config.is_integer @@ -86,6 +85,7 @@ def calculate_float_quantization_params( scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: + max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else max(np.abs(config.get_numpy_codebook())) scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero @@ -122,7 +122,6 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, - max_val: float = 6.0, ) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. @@ -134,7 +133,6 @@ def do_float_quantization( :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. 
- :param max_val: Maximal value of destination type. :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. """ @@ -158,9 +156,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: - if config.is_codebook: - max_val = max(np.abs(config.get_numpy_codebook())) - scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) + scale = calculate_float_quantization_params(weight, reduction_axes, config) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: if original_weight_backend == TensorBackend.ov: @@ -505,7 +501,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None -) -> Tensor: +) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to "round" or "quantize" to the closest quant. diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 5b69e52359d..c63d698c430 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -607,9 +607,7 @@ def compress_weights( msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) - if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ - CompressWeightsMode.E2M1, - ]: + if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." raise nncf.ParameterNotSupportedError(msg) From c838708c7a369472d8022384e7da19e022ed5300 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 14:38:38 +0200 Subject: [PATCH 52/68] Applied suggestions. --- nncf/quantization/algorithms/weight_compression/config.py | 2 ++ .../algorithms/weight_compression/weight_lowering.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index b686c84c669..78201f96ce5 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -29,6 +29,8 @@ class WeightCompressionConfig: :param mode: Defines a mode for weight compression. Defaults to INT8_ASYM mode. :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. + :param codebook_values: Optional codebook values for CODEBOOK compression mode. + Must be fns.Tensor which wraps numpy array, ov or torch tensor. 
""" mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 2a89c9acc9f..d0abf039f90 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -500,7 +500,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( - norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None + norm_weight: Tensor, quantiles: np.array = None, center_of_quantiles: np.array = None ) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to From 0949a92a29b857bb9a7ef397944751b0ac63a1ec Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 18:52:56 +0200 Subject: [PATCH 53/68] Applied suggestions. --- .../algorithms/weight_compression/algorithm.py | 8 ++++---- .../algorithms/weight_compression/openvino_backend.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 2c85a3eba34..dd3fac4d288 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -686,13 +686,13 @@ def apply( # del is used to prematurely mark non-necessary data as free for garbage collection del self.awq_algo - compressed_weights = None + precomputed_compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" if self._gptq: del statistics - model, compressed_weights = self._gptq_algo.apply( + model, precomputed_compressed_weights = self._gptq_algo.apply( model=model, graph=graph, dataset=dataset, @@ -701,7 +701,7 @@ def apply( ) else: if self._scale_estimation: - compressed_weights = self._scale_estimation_algo.apply( + precomputed_compressed_weights = self._scale_estimation_algo.apply( model=model, graph=graph, all_weight_params=all_weight_params, @@ -724,7 +724,7 @@ def apply( model, graph, track(all_weight_params, description=description, weights=all_weight_sizes), - compressed_weights, + precomputed_compressed_weights, lora_correction_algo, self._compression_format, self._advanced_parameters, diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5d5656db350..20ff7248c7c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -252,7 +252,7 @@ def _create_compression_subgraph( ) if compression_config.is_codebook: - n_quants = compressed_weight.tensor.max() + n_quants = compressed_weight.codebook.size - 1 compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook, From b491012310612d21d7b38d164bccaa55ed8c3e2d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 19:08:11 +0200 Subject: [PATCH 54/68] Fixed data type. 
--- .../algorithms/weight_compression/weight_lowering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d0abf039f90..c1a6406678d 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -500,7 +500,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( - norm_weight: Tensor, quantiles: np.array = None, center_of_quantiles: np.array = None + norm_weight: Tensor, quantiles: np.ndarray = None, center_of_quantiles: np.ndarray = None ) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to From 6bf05fcc65a16cd3bac12b7c292bc6e553e00988 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 19:47:37 +0200 Subject: [PATCH 55/68] Removed torch tensor from codebook docstring. --- nncf/quantization/advanced_parameters.py | 2 +- nncf/quantization/algorithms/weight_compression/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 78a4cfae2d6..ad62fef11fc 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -368,7 +368,7 @@ class CodebookParameters: Contains parameters for codebook compression algorithm. :param codebook: The codebook (LUT) for the weight compression. - Applicable for vector quantization. Must be a numpy array, ov Tensor, or torch Tensor. + Applicable for vector quantization. Must be a numpy array or ov Tensor. :type codebook: TTensor """ diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 78201f96ce5..1d4e7a57917 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -30,7 +30,7 @@ class WeightCompressionConfig: :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. :param codebook_values: Optional codebook values for CODEBOOK compression mode. - Must be fns.Tensor which wraps numpy array, ov or torch tensor. + Must be fns.Tensor which wraps numpy array or ov tensor. """ mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM From e44b3d866673a44894e03ab552fe65d5b394bfcb Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 09:58:56 +0200 Subject: [PATCH 56/68] Applied suggestion. 
--- .../algorithms/weight_compression/onnx_backend.py | 6 ++++-- .../weight_compression/openvino_backend.py | 14 +++++++++----- .../algorithms/weight_compression/torch_backend.py | 6 ++++-- .../weight_compression/torch_fx_backend.py | 6 ++++-- .../weight_compression/weight_lowering.py | 6 ++++-- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 07347cd3abe..761647184b4 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -201,7 +201,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -217,7 +217,9 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization dequantize_axis = ( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 20ff7248c7c..d763ad278de 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -219,7 +219,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - compressed_weight: Optional[CompressedWeight] = None, + precomputed_compressed_weights: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -248,7 +248,7 @@ def _create_compression_subgraph( weight, reduction_axes, compression_config, - compressed_weight, + precomputed_compressed_weights, ) if compression_config.is_codebook: @@ -296,7 +296,7 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -321,7 +321,11 @@ def transform_model( should_add_convert_node = True break - compressed_weight = None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) + precomputed_compressed_weights = ( + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name) + ) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -331,7 +335,7 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - 
compressed_weight=compressed_weight, + precomputed_compressed_weights=precomputed_compressed_weights, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index e8efeb302e3..79f6b315a09 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -456,7 +456,7 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -493,7 +493,9 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index d00b0ae5b4c..396f125ca7b 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -190,7 +190,7 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -218,7 +218,9 @@ def transform_model( weight, wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c1a6406678d..c4f697a01d9 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -301,7 +301,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - compressed_weight: CompressedWeight = None, + precomputed_compressed_weights: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight using compression configuration. 
@@ -314,7 +314,9 @@ def compress_weight( :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( - (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) + (precomputed_compressed_weights.scale, precomputed_compressed_weights.zero_point) + if precomputed_compressed_weights + else (None, None) ) if not config.is_integer: From f1c68d6f787ab5a1689b84ed0fa4643e37adee52 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 10:01:08 +0200 Subject: [PATCH 57/68] Applied suggestion. --- nncf/quantization/algorithms/weight_compression/backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 92c6cb80a5d..e2257168ad3 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -149,7 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -160,8 +160,7 @@ def transform_model( :param model: Model in which the weights will be compressed according to the weight compression description. :param graph: The graph associated with the model. :param weight_compression_parameters: An iterable of weight compression parameters. - :param precomputed_scales: Precomputed scales for weight compression. - :param precomputed_zero_points: Precomputed zero points for weight compression. + :param precomputed_compressed_weights: Precomputed scales, zero points, or codebook for weight compression. :param lora_correction_algo: An optional algorithm to reduce quantization noise after weight compression by using low-rank adapters. This algorithm not only overrides weights with their quantized counterparts but also expands the model's execution graph following the Low-Rank Adaptation (LoRA) concept. From 8159e56aa8548ee249a3c20bdbda14324f6ea951 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 12:40:23 +0200 Subject: [PATCH 58/68] Fixed bug. 
--- .../algorithms/weight_compression/onnx_backend.py | 4 ++-- .../algorithms/weight_compression/openvino_backend.py | 8 ++++---- .../algorithms/weight_compression/weight_lowering.py | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 761647184b4..faf78319b74 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -213,7 +213,7 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) - compressed_weight = compress_weight( + precomputed_compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, @@ -231,7 +231,7 @@ def transform_model( # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md if opset_version < 21 and dequantize_block_size > 0: compressed_weight, scale, zero_point = self._preprocess_compressed_weight( - compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True + precomputed_compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True ) self._replace_matmul_with_matmulnbits( model, diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index d763ad278de..80b6a2d64c4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -219,7 +219,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - precomputed_compressed_weights: Optional[CompressedWeight] = None, + precomputed_compressed_weight: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -248,7 +248,7 @@ def _create_compression_subgraph( weight, reduction_axes, compression_config, - precomputed_compressed_weights, + precomputed_compressed_weight, ) if compression_config.is_codebook: @@ -321,7 +321,7 @@ def transform_model( should_add_convert_node = True break - precomputed_compressed_weights = ( + precomputed_compressed_weight = ( None if precomputed_compressed_weights is None else precomputed_compressed_weights.get(wc_params.weight_name) @@ -335,7 +335,7 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - precomputed_compressed_weights=precomputed_compressed_weights, + precomputed_compressed_weight=precomputed_compressed_weight, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c4f697a01d9..c572be2fff7 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -301,7 +301,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - precomputed_compressed_weights: CompressedWeight = None, + precomputed_compressed_weight: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight 
using compression configuration. @@ -309,13 +309,12 @@ def compress_weight( :param weight: The weight to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Compression configuration. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. + :param precomputed_compressed_weight: precomputed scale and zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( - (precomputed_compressed_weights.scale, precomputed_compressed_weights.zero_point) - if precomputed_compressed_weights + (precomputed_compressed_weight.scale, precomputed_compressed_weight.zero_point) + if precomputed_compressed_weight else (None, None) ) From b24936b31d41f5ae33e883ebf6aec906b4a0cf89 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 13:48:55 +0200 Subject: [PATCH 59/68] Fixed bug for onnx. --- .../openvino/smollm2_360m_codebook/main.py | 25 +++++++------------ .../weight_compression/onnx_backend.py | 4 +-- .../weight_compression/openvino_backend.py | 9 +++---- .../weight_compression/weight_lowering.py | 2 +- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index eb2eaff0a25..640a9fd1313 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -49,17 +49,18 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): ] -def default_codebook_example(model_id, output_dir): +def load_model_and_tokenizer(model_id, export=True): tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( model_id, - export=True, + export=export, load_in_8bit=False, - compile=False, - stateful=False, - ov_config={"INFERENCE_PRECISION_HINT": "f32"}, ) + return model, tokenizer + +def default_codebook_example(model_id, output_dir): + model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") @@ -67,7 +68,7 @@ def default_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) + model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") @@ -75,15 +76,7 @@ def default_codebook_example(model_id, output_dir): def custom_codebook_example(model_id, output_dir): - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = OVModelForCausalLM.from_pretrained( - model_id, - export=True, - load_in_8bit=False, - compile=False, - stateful=False, - ov_config={"INFERENCE_PRECISION_HINT": "f32"}, - ) + model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") @@ -102,7 +95,7 @@ def custom_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) + 
model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index faf78319b74..761647184b4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -213,7 +213,7 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) - precomputed_compressed_weight = compress_weight( + compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, @@ -231,7 +231,7 @@ def transform_model( # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md if opset_version < 21 and dequantize_block_size > 0: compressed_weight, scale, zero_point = self._preprocess_compressed_weight( - precomputed_compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True + compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True ) self._replace_matmul_with_matmulnbits( model, diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 80b6a2d64c4..37564cc654b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -321,11 +321,6 @@ def transform_model( should_add_convert_node = True break - precomputed_compressed_weight = ( - None - if precomputed_compressed_weights is None - else precomputed_compressed_weights.get(wc_params.weight_name) - ) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -335,7 +330,9 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - precomputed_compressed_weight=precomputed_compressed_weight, + precomputed_compressed_weight=None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c572be2fff7..c961b45af33 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -309,7 +309,7 @@ def compress_weight( :param weight: The weight to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Compression configuration. - :param precomputed_compressed_weight: precomputed scale and zero point. + :param precomputed_compressed_weight: Contains precomputed scale and zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( From 6fdfd3304629e5ae926729e837d5b79023dd7e0d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 16:24:01 +0200 Subject: [PATCH 60/68] Applied suggestion. 
--- src/nncf/quantization/algorithms/weight_compression/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index 1d4e7a57917..fd72a5cdd8d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -30,7 +30,8 @@ class WeightCompressionConfig: :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. :param codebook_values: Optional codebook values for CODEBOOK compression mode. - Must be fns.Tensor which wraps numpy array or ov tensor. + Must be fns.Tensor which wraps numpy array or ov tensor. Storing ov tensor is useful for having + destination data type information available. """ mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM From 17d6d2d61b44560fa3bed9868c24722e3d5a61be Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 10:15:08 +0200 Subject: [PATCH 61/68] Applied suggestions. --- .../weight_compression/algorithm.py | 4 +- .../weight_compression/constants.py | 50 ++++++++----------- .../weight_compression/openvino_backend.py | 4 +- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index dd3fac4d288..9077cf7a81c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -39,7 +39,7 @@ from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.constants import get_cb4_quantiles +from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -457,7 +457,7 @@ def _get_primary_config(self): return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - codebook_values=get_cb4_quantiles() + codebook_values=Tensor(CB4_QUANTILES) if self._mode == CompressWeightsMode.CB4_F8E4M3 else Tensor(self._advanced_parameters.codebook_params.codebook), ) diff --git a/src/nncf/quantization/algorithms/weight_compression/constants.py b/src/nncf/quantization/algorithms/weight_compression/constants.py index 726ba841e00..6119fd8f83c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/constants.py +++ b/src/nncf/quantization/algorithms/weight_compression/constants.py @@ -11,9 +11,6 @@ import numpy as np -from nncf.tensor import Tensor -from nncf.tensor import TensorDataType - NF4_QUANTILES = np.array( [ -1.0, @@ -37,32 +34,27 @@ ) -def get_cb4_quantiles() -> Tensor: - """ - Returns the quantiles for the CB4 codebook. 
- """ - CB4_QUANTILES = np.array( - [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.28125, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ], - dtype=np.float32, - ) - return Tensor(CB4_QUANTILES).as_openvino_tensor().astype(TensorDataType.f8e4m3) +CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.28125, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, +) CENTER_OF_NF4_QUANTILES = np.array( diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 37564cc654b..6215fb4b1ee 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -255,7 +255,9 @@ def _create_compression_subgraph( n_quants = compressed_weight.codebook.size - 1 compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook, + codebook=compressed_weight.codebook + if compression_config.mode == CompressWeightsMode.CODEBOOK + else compressed_weight.codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3), indexes=compressed_weight.tensor, dtype=compression_dtype, name=const_node_name, From 61abc6a7a3944414844cd889f4d701b19b5bd5a2 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 12:26:03 +0200 Subject: [PATCH 62/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 640a9fd1313..7b091a25157 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -12,18 +12,31 @@ import numpy as np from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer +from transformers import logging import nncf +logging.set_verbosity_error() + def generate_answers(questions, model, tokenizer, max_new_tokens=50): + """Generate answers for a list of questions using the provided model and tokenizer. + + Args: + questions : List of questions to be answered. + model : The model to use for generating answers. + tokenizer : The tokenizer to use for processing the input and output. + max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + + Returns: + dict: A dictionary mapping each question to its corresponding answer. 
+ """ messages = [ {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, {"role": "user", "content": "What is the capital of Spain?"}, {"role": "assistant", "content": "Madrid."}, ] answers_by_questions = {} - model.request = None for question in questions: messages.append({"role": "user", "content": question}) @@ -37,10 +50,15 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): answers_by_questions[question] = answer messages.append({"role": "assistant", "content": answer}) - model.request = None return answers_by_questions +def print_answers(header, answers_by_questions): + print(header) + for question, answer in answers_by_questions.items(): + print(f"Q: {question}\nA: {answer}\n") + + QUESTIONS = [ "What is the capital of France?", "What is the highest peak in the Alps?", @@ -50,7 +68,7 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): def load_model_and_tokenizer(model_id, export=True): - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) model = OVModelForCausalLM.from_pretrained( model_id, export=export, @@ -62,7 +80,7 @@ def load_model_and_tokenizer(model_id, export=True): def default_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + print_answers("Non-optimized model outputs:\n", answers_by_questions) model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64) model.save_pretrained(output_dir) @@ -70,7 +88,7 @@ def default_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Optimized model outputs:\n{answers_by_questions}\n") + print_answers("Optimized model outputs:\n", answers_by_questions) return list(answers_by_questions.values()) @@ -79,7 +97,7 @@ def custom_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + print_answers("Non-optimized model outputs:\n", answers_by_questions) codebook_params = nncf.CodebookParameters( np.array([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], dtype=np.int8) @@ -97,7 +115,7 @@ def custom_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Optimized model outputs:\n{answers_by_questions}\n") + print_answers("Optimized model outputs:\n", answers_by_questions) return list(answers_by_questions.values()) From d1d82329bafc6ed8762bf426aed88f8e0e4f2f98 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 14:01:17 +0200 Subject: [PATCH 63/68] 1) Added docstrings for codebook example. 2) Changed custom codebook to smaller in codebook example. 
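For reference, the number of codebook entries decides the index width via the
selection added to the OpenVINO backend earlier in this series; a standalone
sketch (the helper name is illustrative, the thresholds mirror the backend code):

    import openvino as ov

    def codebook_index_type(codebook_size: int) -> ov.Type:
        # Same rule as in _create_compression_subgraph: u16 above 256 entries,
        # u8 above 16 entries, otherwise u4.
        n_quants = codebook_size - 1
        return ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4)

    assert codebook_index_type(9) == ov.Type.u4    # new custom codebook in this example
    assert codebook_index_type(15) == ov.Type.u4   # previous custom codebook also used u4
    assert codebook_index_type(16) == ov.Type.u4   # CB4 codebook (16 fixed values)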
--- .../openvino/smollm2_360m_codebook/main.py | 60 +++++++++++++------ tests/cross_fw/examples/example_scope.json | 2 +- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 7b091a25157..37feef9d207 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -19,7 +19,9 @@ logging.set_verbosity_error() -def generate_answers(questions, model, tokenizer, max_new_tokens=50): +def generate_answers( + questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50 +) -> dict[str, str]: """Generate answers for a list of questions using the provided model and tokenizer. Args: @@ -53,7 +55,12 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions -def print_answers(header, answers_by_questions): +def print_answers(header: str, answers_by_questions: list[str]) -> None: + """Print the answers to the console. + Args: + header (str): Header to print before the answers. + answers_by_questions (dict): Dictionary mapping questions to their answers. + """ print(header) for question, answer in answers_by_questions.items(): print(f"Q: {question}\nA: {answer}\n") @@ -67,7 +74,14 @@ def print_answers(header, answers_by_questions): ] -def load_model_and_tokenizer(model_id, export=True): +def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCausalLM, AutoTokenizer]: + """Load the model and tokenizer from the specified model ID. + Args: + model_id (str): The identifier of the model to load. + export (bool): Whether to export the model for OpenVINO. Defaults to True. + Returns: + tuple: A tuple containing the loaded model and tokenizer. + """ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) model = OVModelForCausalLM.from_pretrained( model_id, @@ -77,31 +91,43 @@ def load_model_and_tokenizer(model_id, export=True): return model, tokenizer -def default_codebook_example(model_id, output_dir): +def default_codebook_example(model_id: str, compressed_model_id: str) -> None: + """Example of using the default codebook compression. + Args: + model_id (str): The identifier of the model to load. + compressed_model_id (str): The identifier for the compressed model to save. + Returns: + list: A list of answers generated by the model after compression. + """ model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Non-optimized model outputs:\n", answers_by_questions) model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64) - model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) + model.save_pretrained(compressed_model_id) + tokenizer.save_pretrained(compressed_model_id) - model, tokenizer = load_model_and_tokenizer(output_dir, False) + model, tokenizer = load_model_and_tokenizer(compressed_model_id, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Optimized model outputs:\n", answers_by_questions) return list(answers_by_questions.values()) -def custom_codebook_example(model_id, output_dir): +def custom_codebook_example(model_id: str, compressed_model_id: str) -> None: + """Example of using the custom codebook compression. 
+ Args: + model_id (str): The identifier of the model to load. + compressed_model_id (str): The identifier for the compressed model to save. + Returns: + list: A list of answers generated by the model after compression. + """ model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Non-optimized model outputs:\n", answers_by_questions) - codebook_params = nncf.CodebookParameters( - np.array([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], dtype=np.int8) - ) + codebook_params = nncf.CodebookParameters(np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8)) model.model = nncf.compress_weights( model.model, @@ -110,10 +136,10 @@ def custom_codebook_example(model_id, output_dir): group_size=-1, advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), ) - model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) + model.save_pretrained(compressed_model_id) + tokenizer.save_pretrained(compressed_model_id) - model, tokenizer = load_model_and_tokenizer(output_dir, False) + model, tokenizer = load_model_and_tokenizer(compressed_model_id, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Optimized model outputs:\n", answers_by_questions) @@ -122,10 +148,10 @@ def custom_codebook_example(model_id, output_dir): def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - output_dir = "smollm2_360m_compressed_codebook" + compressed_model_id = "smollm2_360m_compressed_codebook" - res = default_codebook_example(model_id, output_dir) - res += custom_codebook_example(model_id, output_dir + "_custom") + res = default_codebook_example(model_id, compressed_model_id) + res += custom_codebook_example(model_id, compressed_model_id + "_custom") return res diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index c45dfc898b9..e027db051c2 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -296,7 +296,7 @@ "Paris.", "Mont Blanc.", "Toronto.", - "Tokyo." + "Fukuoka." ] } }, From b8f25269ec560b1f41c351d09db9c6d296641355 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 14:53:48 +0200 Subject: [PATCH 64/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 4 ++++ .../algorithms/weight_compression/algorithm.py | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 37feef9d207..a8d3b539f23 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -9,14 +9,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings + import numpy as np from optimum.intel.openvino import OVModelForCausalLM +from torch.jit import TracerWarning from transformers import AutoTokenizer from transformers import logging import nncf logging.set_verbosity_error() +warnings.filterwarnings("ignore", category=TracerWarning) def generate_answers( diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 9077cf7a81c..a276f281345 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -454,12 +454,15 @@ def _get_ratio_defining_params( return ratio_defining_params def _get_primary_config(self): + codebook_values = ( + Tensor(CB4_QUANTILES) + if self._mode == CompressWeightsMode.CB4_F8E4M3 + else Tensor(self._advanced_parameters.codebook_params.codebook) + ) return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - codebook_values=Tensor(CB4_QUANTILES) - if self._mode == CompressWeightsMode.CB4_F8E4M3 - else Tensor(self._advanced_parameters.codebook_params.codebook), + codebook_values=codebook_values, ) def _set_weight_compression_config( From ca342ab9e40496d9f4702dacffbca72806dc3dd9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 16:12:22 +0200 Subject: [PATCH 65/68] Applied suggestion. --- .../openvino/smollm2_360m_codebook/main.py | 11 ++++++----- .../algorithms/weight_compression/config.py | 2 +- .../algorithms/weight_compression/weight_lowering.py | 7 ++----- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index a8d3b539f23..6262639bd1e 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -23,6 +23,10 @@ warnings.filterwarnings("ignore", category=TracerWarning) +MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" +COMPRESSED_MODEL_ID = "smollm2_360m_compressed_codebook" + + def generate_answers( questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50 ) -> dict[str, str]: @@ -151,11 +155,8 @@ def custom_codebook_example(model_id: str, compressed_model_id: str) -> None: def main(): - model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - compressed_model_id = "smollm2_360m_compressed_codebook" - - res = default_codebook_example(model_id, compressed_model_id) - res += custom_codebook_example(model_id, compressed_model_id + "_custom") + res = default_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID) + res += custom_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID + "_custom") return res diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index fd72a5cdd8d..1d5376b3454 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -69,7 +69,7 @@ def is_codebook(self): return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] def get_numpy_codebook(self): - return self.codebook_values.as_numpy_tensor().data + return self.codebook_values.as_numpy_tensor() def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py 
b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c961b45af33..8a1ee8f9b40 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -11,8 +11,6 @@ import os from typing import Optional, Union -import numpy as np - import nncf from nncf.common.logging.logger import nncf_logger from nncf.common.utils.backend import is_openvino_at_least @@ -85,7 +83,7 @@ def calculate_float_quantization_params( scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: - max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else max(np.abs(config.get_numpy_codebook())) + max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else fns.max(fns.abs(config.get_numpy_codebook())) scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero @@ -501,7 +499,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( - norm_weight: Tensor, quantiles: np.ndarray = None, center_of_quantiles: np.ndarray = None + norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None ) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to @@ -518,7 +516,6 @@ def _calculate_codebook_quantized_weight( ) if center_of_quantiles is None: - quantiles = np.array(quantiles) center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) center_of_quantiles = fns.from_numpy(center_of_quantiles, backend=norm_weight.backend) indexes = fns.searchsorted(center_of_quantiles, norm_weight) From 635ef2327d563c349f17be743795902a21617ab6 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 16:19:43 +0200 Subject: [PATCH 66/68] Changed docstring formatting. --- .../openvino/smollm2_360m_codebook/main.py | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 6262639bd1e..c80fceca0e6 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -30,16 +30,14 @@ def generate_answers( questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50 ) -> dict[str, str]: - """Generate answers for a list of questions using the provided model and tokenizer. - - Args: - questions : List of questions to be answered. - model : The model to use for generating answers. - tokenizer : The tokenizer to use for processing the input and output. - max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + """ + Generate answers for a list of questions using the provided model and tokenizer. - Returns: - dict: A dictionary mapping each question to its corresponding answer. + :param questions : List of questions to be answered. + :param model : The model to use for generating answers. + :param tokenizer : The tokenizer to use for processing the input and output. + :param max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + :return: A dictionary mapping each question to its corresponding answer. 
""" messages = [ {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, @@ -64,10 +62,11 @@ def generate_answers( def print_answers(header: str, answers_by_questions: list[str]) -> None: - """Print the answers to the console. - Args: - header (str): Header to print before the answers. - answers_by_questions (dict): Dictionary mapping questions to their answers. + """ + Print the answers to the console. + + :param header (str): Header to print before the answers. + :param answers_by_questions (dict): Dictionary mapping questions to their answers. """ print(header) for question, answer in answers_by_questions.items(): @@ -83,12 +82,12 @@ def print_answers(header: str, answers_by_questions: list[str]) -> None: def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCausalLM, AutoTokenizer]: - """Load the model and tokenizer from the specified model ID. - Args: - model_id (str): The identifier of the model to load. - export (bool): Whether to export the model for OpenVINO. Defaults to True. - Returns: - tuple: A tuple containing the loaded model and tokenizer. + """ + Load the model and tokenizer from the specified model ID. + + :param model_id (str): The identifier of the model to load. + :param export (bool): Whether to export the model for OpenVINO. Defaults to True. + :return: A tuple containing the loaded model and tokenizer. """ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) model = OVModelForCausalLM.from_pretrained( @@ -99,13 +98,13 @@ def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCaus return model, tokenizer -def default_codebook_example(model_id: str, compressed_model_id: str) -> None: - """Example of using the default codebook compression. - Args: - model_id (str): The identifier of the model to load. - compressed_model_id (str): The identifier for the compressed model to save. - Returns: - list: A list of answers generated by the model after compression. +def default_codebook_example(model_id: str, compressed_model_id: str) -> list[str]: + """ + Example of using the default codebook compression. + + :param model_id (str): The identifier of the model to load. + :param compressed_model_id (str): The identifier for the compressed model to save. + :return: A list of answers generated by the model after compression. """ model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) @@ -122,13 +121,13 @@ def default_codebook_example(model_id: str, compressed_model_id: str) -> None: return list(answers_by_questions.values()) -def custom_codebook_example(model_id: str, compressed_model_id: str) -> None: - """Example of using the custom codebook compression. - Args: - model_id (str): The identifier of the model to load. - compressed_model_id (str): The identifier for the compressed model to save. - Returns: - list: A list of answers generated by the model after compression. +def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str]: + """ + Example of using the custom codebook compression. + + :param model_id (str): The identifier of the model to load. + :param compressed_model_id (str): The identifier for the compressed model to save. + :return: A list of answers generated by the model after compression. 
""" model, tokenizer = load_model_and_tokenizer(model_id) From 50a94aa1fc5cb220047009a80171c6b323423080 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 9 Jul 2025 14:04:14 +0200 Subject: [PATCH 67/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 24 +++++++++---------- .../graph/metatypes/openvino_metatypes.py | 1 + .../weight_compression/algorithm.py | 13 ++++++---- .../weight_compression/onnx_backend.py | 5 ++-- .../weight_compression/torch_backend.py | 5 ++-- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index c80fceca0e6..7df2572c148 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -33,10 +33,10 @@ def generate_answers( """ Generate answers for a list of questions using the provided model and tokenizer. - :param questions : List of questions to be answered. - :param model : The model to use for generating answers. - :param tokenizer : The tokenizer to use for processing the input and output. - :param max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + :param questions: List of questions to be answered. + :param model: The model to use for generating answers. + :param tokenizer: The tokenizer to use for processing the input and output. + :param max_new_tokens: Maximum number of new tokens to generate for each answer. Defaults to 50. :return: A dictionary mapping each question to its corresponding answer. """ messages = [ @@ -65,8 +65,8 @@ def print_answers(header: str, answers_by_questions: list[str]) -> None: """ Print the answers to the console. - :param header (str): Header to print before the answers. - :param answers_by_questions (dict): Dictionary mapping questions to their answers. + :param header: Header to print before the answers. + :param answers_by_questions: Dictionary mapping questions to their answers. """ print(header) for question, answer in answers_by_questions.items(): @@ -85,8 +85,8 @@ def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCaus """ Load the model and tokenizer from the specified model ID. - :param model_id (str): The identifier of the model to load. - :param export (bool): Whether to export the model for OpenVINO. Defaults to True. + :param model_id: The identifier of the model to load. + :param export: Whether to export the model for OpenVINO. Defaults to True. :return: A tuple containing the loaded model and tokenizer. """ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) @@ -102,8 +102,8 @@ def default_codebook_example(model_id: str, compressed_model_id: str) -> list[st """ Example of using the default codebook compression. - :param model_id (str): The identifier of the model to load. - :param compressed_model_id (str): The identifier for the compressed model to save. + :param model_id: The identifier of the model to load. + :param compressed_model_id: The identifier for the compressed model to save. :return: A list of answers generated by the model after compression. """ model, tokenizer = load_model_and_tokenizer(model_id) @@ -125,8 +125,8 @@ def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str """ Example of using the custom codebook compression. - :param model_id (str): The identifier of the model to load. 
- :param compressed_model_id (str): The identifier for the compressed model to save. + :param model_id: The identifier of the model to load. + :param compressed_model_id: The identifier for the compressed model to save. :return: A list of answers generated by the model after compression. """ model, tokenizer = load_model_and_tokenizer(model_id) diff --git a/src/nncf/openvino/graph/metatypes/openvino_metatypes.py b/src/nncf/openvino/graph/metatypes/openvino_metatypes.py index e2b95afc241..214bce563f1 100644 --- a/src/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/src/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -819,6 +819,7 @@ def _is_embedding(node: ov.Node) -> bool: input_tensor = node.input_value(const_port_id) input_type = input_tensor.get_element_type().get_type_name() + # TODO(aanuf): Implement a pattern based check for embedding. if node.friendly_name.endswith("nncf_codebook"): return False diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index a276f281345..9d7ca909ea3 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -48,6 +48,7 @@ from nncf.scopes import IgnoredScope from nncf.scopes import get_ignored_node_names_from_ignored_scope from nncf.tensor import Tensor +from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorDataType TModel = TypeVar("TModel") @@ -182,17 +183,19 @@ def check_user_compression_configuration( ) ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank] - if advanced_parameters.codebook_params.codebook is not None: - codebook = Tensor(advanced_parameters.codebook_params.codebook).as_numpy_tensor().data + codebook = advanced_parameters.codebook_params.codebook + if codebook is not None: + # OpenVINO Tensor is not support functions to validate codebook + np_codebook = Tensor(codebook).as_numpy_tensor() msg = None - if codebook.ndim != 1: + if np_codebook.ndim != 1: msg = "The codebook must be a 1D array, but a multi-dimensional array is given." - if codebook.size < 2: + elif np_codebook.size < 2: msg = ( "The codebook must contain at least two unique elements," "but a single-element or empty array is given." ) - if (codebook[:-1] >= codebook[1:]).any(): + elif fns.any(np_codebook[:-1] >= np_codebook[1:]): msg = "The codebook must be a sorted 1D array with unique elements, but an unsorted array is given." 
if msg: raise nncf.ValidationError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 761647184b4..0e7e1897813 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -213,13 +213,12 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) + precomputed_compressed_weights = precomputed_compressed_weights or {} compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, - None - if precomputed_compressed_weights is None - else precomputed_compressed_weights.get(wc_params.weight_name), + precomputed_compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization dequantize_axis = ( diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index 79f6b315a09..7e5c348f3a9 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -488,14 +488,13 @@ def transform_model( raise nncf.InternalError(msg) try: + precomputed_compressed_weights = precomputed_compressed_weights or {} # calculates compressed weights and decompression parameters compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, - None - if precomputed_compressed_weights is None - else precomputed_compressed_weights.get(wc_params.weight_name), + precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error From 82d9e5cacd92c7c18557d096416fc0546e73e205 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 9 Jul 2025 17:27:24 +0200 Subject: [PATCH 68/68] Applied suggestions. 
--- .../openvino/smollm2_360m_codebook/main.py | 4 ++-- src/nncf/__init__.py | 1 - src/nncf/parameters.py | 2 +- src/nncf/quantization/advanced_parameters.py | 21 ++++--------------- .../weight_compression/algorithm.py | 18 ++++++++-------- .../weight_compression/parameters.py | 16 ++------------ src/nncf/quantization/quantize_model.py | 2 +- .../quantization/test_weights_compression.py | 13 +++++------- 8 files changed, 24 insertions(+), 53 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 7df2572c148..a5b27104218 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -134,14 +134,14 @@ def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Non-optimized model outputs:\n", answers_by_questions) - codebook_params = nncf.CodebookParameters(np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8)) + codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8) model.model = nncf.compress_weights( model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=-1, - advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), ) model.save_pretrained(compressed_model_id) tokenizer.save_pretrained(compressed_model_id) diff --git a/src/nncf/__init__.py b/src/nncf/__init__.py index 14e1c38740f..77cd6fbb09a 100644 --- a/src/nncf/__init__.py +++ b/src/nncf/__init__.py @@ -58,7 +58,6 @@ from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters as AdvancedSmoothQuantParameters -from nncf.quantization.advanced_parameters import CodebookParameters as CodebookParameters from nncf.quantization.advanced_parameters import OverflowFix as OverflowFix from nncf.scopes import IgnoredScope as IgnoredScope from nncf.scopes import Subgraph as Subgraph diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 55ef80046de..e1269ea78e1 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -86,7 +86,7 @@ class CompressWeightsMode(StrEnum): :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. :param CODEBOOK: Codebook (LUT) quantization format. - :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values. + :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. """ INT8_SYM = "int8_sym" diff --git a/src/nncf/quantization/advanced_parameters.py b/src/nncf/quantization/advanced_parameters.py index ad62fef11fc..4de0152188f 100644 --- a/src/nncf/quantization/advanced_parameters.py +++ b/src/nncf/quantization/advanced_parameters.py @@ -361,20 +361,6 @@ class AdvancedLoraCorrectionParameters: use_int8_adapters: bool = True -@api() -@dataclass -class CodebookParameters: - """ - Contains parameters for codebook compression algorithm. - - :param codebook: The codebook (LUT) for the weight compression. - Applicable for vector quantization. 
Must be a numpy array or ov Tensor. - :type codebook: TTensor - """ - - codebook: Optional[TTensor] = None - - @api() @dataclass class AdvancedCompressionParameters: @@ -395,8 +381,9 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] - :param codebook_params: Parameters for codebook compression. - :type codebook_params: CodebookParameters + :param codebook: The codebook (LUT) for the weight compression. + Applicable for vector quantization. Must be a numpy array or ov Tensor. + :type codebook: TTensor """ statistics_path: Optional[str] = None @@ -408,7 +395,7 @@ class AdvancedCompressionParameters: lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) lora_adapter_rank: int = 256 backend_params: dict[str, Any] = field(default_factory=dict) - codebook_params: CodebookParameters = field(default_factory=CodebookParameters) + codebook: Optional[TTensor] = None @api() diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 9d7ca909ea3..7ab4d2d1813 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -183,7 +183,7 @@ def check_user_compression_configuration( ) ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank] - codebook = advanced_parameters.codebook_params.codebook + codebook = advanced_parameters.codebook if codebook is not None: # OpenVINO Tensor is not support functions to validate codebook np_codebook = Tensor(codebook).as_numpy_tensor() @@ -228,9 +228,7 @@ def check_user_compression_configuration( msg = "LoRA Correction algorithm is not compatible with FQ, FQ_LORA and FQ_LORA_NLS compression formats." raise nncf.ValidationError(msg) - if mode == CompressWeightsMode.CODEBOOK and ( - advanced_parameters is None or advanced_parameters.codebook_params.codebook is None - ): + if mode == CompressWeightsMode.CODEBOOK and (advanced_parameters is None or advanced_parameters.codebook is None): msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters." raise nncf.ValidationError(msg) @@ -457,11 +455,13 @@ def _get_ratio_defining_params( return ratio_defining_params def _get_primary_config(self): - codebook_values = ( - Tensor(CB4_QUANTILES) - if self._mode == CompressWeightsMode.CB4_F8E4M3 - else Tensor(self._advanced_parameters.codebook_params.codebook) - ) + codebook_values = None + + if self._mode == CompressWeightsMode.CB4_F8E4M3: + codebook_values = Tensor(CB4_QUANTILES) + elif self._mode == CompressWeightsMode.CODEBOOK: + codebook_values = Tensor(self._advanced_parameters.codebook) + return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, diff --git a/src/nncf/quantization/algorithms/weight_compression/parameters.py b/src/nncf/quantization/algorithms/weight_compression/parameters.py index 8c1d60fd400..fb27775997d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/parameters.py +++ b/src/nncf/quantization/algorithms/weight_compression/parameters.py @@ -10,23 +10,11 @@ # limitations under the License. from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional from nncf.tensor import Tensor -@dataclass -class Codebook: - """ - Codebook parameters for weight compression. 
- :param codebook: The initial codebook for compression. - :param dst_type: The destination type for the codebook. - """ - - codebook: Optional[Tensor] = None - dst_type: Optional[Any] = None - - @dataclass class CompressedWeight: """ @@ -42,7 +30,7 @@ class CompressedWeight: tensor: Optional[Tensor] = None scale: Optional[Tensor] = None zero_point: Optional[Tensor] = None - codebook: Optional[Codebook] = None + codebook: Optional[Tensor] = None def is_codebook(self): """ diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index c63d698c430..340f5983f2b 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -608,7 +608,7 @@ def compress_weights( raise nncf.ParameterNotSupportedError(msg) if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: - msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." + msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is E2M1." raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 33ba2681b07..936d5d53329 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -40,7 +40,6 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams -from nncf.quantization.advanced_parameters import CodebookParameters from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -356,13 +355,12 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): ) def test_codebook_compression_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): model = IntegerModel().ov_model - codebook_params = nncf.CodebookParameters(codebook) compressed_model = compress_weights( model, mode=CompressWeightsMode.CODEBOOK, group_size=7, - advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), ) actual_stats = {} for op in compressed_model.get_ops(): @@ -1182,7 +1180,7 @@ def test_codebook(codebook, n_layers, dst_type, group_size): ratio=1.0, group_size=group_size, all_layers=True, - advanced_parameters=AdvancedCompressionParameters(codebook_params=CodebookParameters(codebook=codebook)), + advanced_parameters=AdvancedCompressionParameters(codebook=codebook), ) names_codebook = [ op.get_friendly_name() @@ -1728,7 +1726,7 @@ def test_nf4_quantization_mid_quant(weight, scale): @pytest.mark.parametrize( - "codebook_values", + "codebook", [ np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32), np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32), @@ -1736,8 +1734,7 @@ def test_nf4_quantization_mid_quant(weight, scale): np.array([5], dtype=np.float32), ], ) -def test_codebook_is_correct_array(codebook_values): - codebook_params = nncf.CodebookParameters(codebook_values) 
+def test_codebook_is_correct_array(codebook): model = SequentialMatmulModel().ov_model # The codebook should be a non empty 1D numpy array and sorted @@ -1746,7 +1743,7 @@ def test_codebook_is_correct_array(codebook_values): model, mode=CompressWeightsMode.CODEBOOK, group_size=-1, - advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), )