From 488cacc2be70b7ae7e417c555d2aeea29163f5b6 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Mon, 10 Jun 2024 19:17:08 +0400 Subject: [PATCH 01/25] Support scale estimation inside GPTQ --- .../algorithms/layerwise/scheduler.py | 34 +- .../weight_compression/activation_stats.py | 7 +- .../weight_compression/algorithm.py | 59 ++-- .../algorithms/weight_compression/gptq.py | 41 ++- .../weight_compression/scale_estimation.py | 316 ++++++++++-------- nncf/quantization/quantize_model.py | 5 - .../openvino/native/quantization/test_gptq.py | 5 +- .../quantization/test_weights_compression.py | 5 +- 8 files changed, 271 insertions(+), 201 deletions(-) diff --git a/nncf/quantization/algorithms/layerwise/scheduler.py b/nncf/quantization/algorithms/layerwise/scheduler.py index 8eee99fad28..8abc03400c0 100644 --- a/nncf/quantization/algorithms/layerwise/scheduler.py +++ b/nncf/quantization/algorithms/layerwise/scheduler.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass from dataclasses import field @@ -177,26 +178,31 @@ def schedule( old_input_nodes = set() new_input_nodes = set() for p in paths: - target_output_nodes = set() + target_outputs = [] additional_output_nodes = set() for output_node in p.output_nodes: - if output_node in target_nodes: - target_output_nodes.add(output_node) - elif output_node in p.input_nodes: - reuse_input_nodes.add(output_node) - else: - # filter additional output nodes - for prev_node in inference_graph.get_previous_nodes(output_node): - if prev_node not in p.output_nodes: - additional_output_nodes.add(output_node) - break - if not target_output_nodes: + try: + target_node_index = target_nodes.index(output_node) + target_outputs.append((target_node_index, output_node)) + except ValueError: + if output_node in p.input_nodes: + reuse_input_nodes.add(output_node) + else: + # filter additional output nodes + for prev_node in inference_graph.get_previous_nodes(output_node): + if prev_node not in p.output_nodes: + additional_output_nodes.add(output_node) + break + if not target_outputs: continue + target_outputs.sort(key=lambda target_output: target_output[0]) + target_output_nodes = [output[1] for output in target_outputs] + old_input_nodes |= p.input_nodes - new_input_nodes |= target_output_nodes | additional_output_nodes + new_input_nodes |= set(target_output_nodes) | additional_output_nodes subgraph_inputs = list(p.inputs) - step_target_nodes = {} + step_target_nodes = OrderedDict() subgraph_outputs = [] for node in target_output_nodes: target_edge = {} diff --git a/nncf/quantization/algorithms/weight_compression/activation_stats.py b/nncf/quantization/algorithms/weight_compression/activation_stats.py index eb8286e6383..359887e7769 100644 --- a/nncf/quantization/algorithms/weight_compression/activation_stats.py +++ b/nncf/quantization/algorithms/weight_compression/activation_stats.py @@ -9,14 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Tuple, TypeVar +from typing import List, Tuple +from nncf.tensor import Tensor from nncf.tensor import functions as fns -TTensor = TypeVar("TTensor") - -def process_stats(stats: List[TTensor], subset_size: int) -> Tuple[TTensor, TTensor]: +def process_stats(stats: List[Tensor], subset_size: int) -> Tuple[Tensor, Tensor]: """ It's a processing of activations shared between AWQ, Scale Estimation and LoRA Correction algorithms. diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 3499521bce3..1b2af0fd9a3 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -124,7 +124,12 @@ def __init__( if self._gptq: gptq_params = self._advanced_parameters.gptq_params - self._gptq_algo = GPTQ(gptq_params.damp_percent, gptq_params.block_size, gptq_params.subset_size) + self._gptq_algo = GPTQ( + damp_percent=gptq_params.damp_percent, + block_size=gptq_params.block_size, + subset_size=gptq_params.subset_size, + scale_estimation=self._scale_estimation, + ) self._gptq_statistics = None @property @@ -379,25 +384,8 @@ def apply( scales = {} zero_points = {} - if ( - self._scale_estimation - and activations is not None - and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] - ): - scale_estimation_params = self._advanced_parameters.scale_estimation_params - scale_algo = ScaleEstimation( - model, - self._backend_entity.name_to_node_mapping, - all_weight_params, - nodes_to_compress, - activations, - scale_estimation_params.subset_size, - scale_estimation_params.initial_steps, - scale_estimation_params.scale_steps, - scale_estimation_params.weight_penalty, - ) - scales = scale_algo.apply(model, graph) - + lora_correction_algo = None + description = "Applying Weight Compression" if self._gptq: model, scales, zero_points = self._gptq_algo.apply( model=model, @@ -407,13 +395,30 @@ def apply( statistic_points=self._gptq_statistics, backend_entity=self._backend_entity, ) + else: + if ( + self._scale_estimation + and activations is not None + and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + ): + scale_estimation_params = self._advanced_parameters.scale_estimation_params + scale_algo = ScaleEstimation( + model, + self._backend_entity.name_to_node_mapping, + all_weight_params, + nodes_to_compress, + activations, + scale_estimation_params.subset_size, + scale_estimation_params.initial_steps, + scale_estimation_params.scale_steps, + scale_estimation_params.weight_penalty, + ) + scales = scale_algo.apply(model, graph) - lora_correction_algo = None - description = "Applying Weight Compression" - if self._lora_correction: - lora_correction_params = self._advanced_parameters.lora_correction_params - lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) - description += " with correction of low-rank adapters" + if self._lora_correction: + lora_correction_params = self._advanced_parameters.lora_correction_params + lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) + description += " with correction of low-rank adapters" # Sort weight params to start compression with the bigger constants. This lowers peak memory footprint. 
all_weight_params = sorted(all_weight_params, key=lambda wp: wp.num_weights, reverse=True) @@ -542,7 +547,7 @@ def _get_activations( statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset) statistics_aggregator.register_statistic_points(statistic_container) - if self._gptq: + if self._gptq and not self._awq: self._gptq_statistics = self._gptq_algo.get_statistic_points( model, graph, nodes_to_compress, self._backend_entity ) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b595e080533..b1101916da3 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -25,6 +25,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight @@ -44,10 +45,7 @@ class GPTQ: """ def __init__( - self, - damp_percent: float = 0.1, - block_size: int = 128, - subset_size: int = 128, + self, damp_percent: float = 0.1, block_size: int = 128, subset_size: int = 128, scale_estimation: bool = False ): """ :param damp_percent: The percent of the average Hessian diagonal to use for dampening, @@ -58,6 +56,7 @@ def __init__( self._damp_percent = damp_percent self._block_size = block_size self._subset_size = subset_size + self._scale_estimation = scale_estimation self._backend = None self._backend_entity = None @@ -124,10 +123,9 @@ def apply( CompressWeightsMode.INT8_SYM, ]: continue - assert len(inputs) == 1 _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) - scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian) + scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) scales[wc_params.weight_name] = scale zero_points[wc_params.weight_name] = zero_point @@ -193,7 +191,12 @@ def _calculate_hessian(self, node: NNCFNode, inputs: List[Tensor]) -> Tensor: return hessian def _quantize_weights( - self, model: TModel, graph: NNCFGraph, wc_params: WeightCompressionParameters, hessian: Tensor + self, + model: TModel, + graph: NNCFGraph, + wc_params: WeightCompressionParameters, + hessian: Tensor, + inputs: List[Tensor], ): """ Quantizes the weights of the model based on the calculated Hessian matrix. 
@@ -260,11 +263,25 @@ def _quantize_weights( scale = calculate_nf4_scale(weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes) scales.append(scale) else: - scale, zero_point = calculate_integer_quantization_params( - weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config - ) - scales.append(scale) - zero_points.append(zero_point) + if self._scale_estimation and block_compression_config.num_bits == 4: + activations = [inp.squeeze()[:, (i1 + i) : (i1 + i + group_size)] for inp in inputs] + scale, zero_point = ScaleEstimation.calculate_quantization_params( + self._backend_entity, + activations, + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + wc_params.compression_config, + ) + scales.append(scale.squeeze(axis=1)) + zero_points.append(zero_point) + else: + scale, zero_point = calculate_integer_quantization_params( + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + block_compression_config, + ) + scales.append(scale) + zero_points.append(zero_point) if block_compression_config.mode == CompressWeightsMode.NF4: compressed_weights = do_nf4_quantization( fns.unsqueeze(weight_col, 1), scales[-1], is_normalized_weight=False diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6d1110c108f..712c5fd955d 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,16 +20,17 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns TModel = TypeVar("TModel") -TTensor = TypeVar("TTensor") -TWeightType = TypeVar("TWeightType") class ScaleEstimation: @@ -37,13 +38,15 @@ class ScaleEstimation: Scale estimation algorithm implementation. """ + compress_decompress_cache = {} + def __init__( self, model: TModel, name_to_node_mapping: Dict[str, Any], all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode], - activations: Optional[Dict[str, TTensor]] = None, + activations: Optional[Dict[str, List[Tensor]]] = None, subset_size: int = 32, initial_steps: int = 5, scale_steps: int = 10, @@ -103,7 +106,7 @@ def apply( graph: NNCFGraph, statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, - ) -> Dict[str, TTensor]: + ) -> Dict[str, Tensor]: """ Estimates better scale for the int4 nodes in the model. Minimizes per-group difference between floating point MatMul and @@ -118,8 +121,7 @@ def apply( :return: Dict with pairs (weight name, estimated scale). 
""" - compress_decompress_cache = {} - res = dict() + scales = dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -127,11 +129,10 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in self._activations: - res[weight_name] = None + scales[weight_name] = None continue - s, X = process_stats(self._activations[node_name], self._subset_size) - reduction_axis = wp.reduction_axes[0] + stats = self._activations[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm @@ -139,162 +140,211 @@ def apply( _, weight_port_id = weight_data[0] weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - weight = weight.astype(TensorDataType.float32) - eps = fns.finfo(weight).eps - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 + scales[weight_name], _ = self.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size + return scales - original_weight = fns.zeros_like(weight) + weight + @staticmethod + def calculate_quantization_params( + backend_entity: WeightCompressionAlgoBackend, + activations: List[Tensor], + weight: Tensor, + reduction_axes: Tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + """ + Calculates the quantization parameters for a given set of weights and activations. + This function estimates the optimal quantization scale for weight compression by + minimizing the difference between floating-point operations and operations with + quantized weights. + + The function uses an iterative process: + 1. Initial scale rectification based on activation statistics. + 2. A grid search to further refine the scale parameters. + + :param backend_entity: The backend-specific implementation of the weight compression algorithm. + :param activations: List of activation tensors corresponding to the layers being quantized. + :param weight: The weight tensor that is being quantized. + :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. + :param config: Configuration parameters for the weight compression, including quantization settings. + :param subset_size: The number of samples to use for scale estimation. Defaults to 32. + :param initial_steps: The number of steps for initial scale rectification using activation statistics. + Defaults to 5. + :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. + :param weight_penalty: Penalty coefficient applied to the difference between floating-point + and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. + :return: A tensor containing the calculated quantization scales and zero points if applicable. 
+ """ + reduction_axis = reduction_axes[0] - compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) - if zp is not None: - zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) + s, X = process_stats(activations, subset_size) - s = fns.unsqueeze(s, 0) - s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) + weight = weight.astype(TensorDataType.float32) + eps = fns.finfo(weight).eps - original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 - # all weight in group has importance based on corresponding input activations - importance = fns.ones_like(original_weight) - importance = importance * s + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - importance = fns.where(zero_mask, 0.0, importance) - - # normalize importances for every group of weights to make sum of them equal to 1.0 - denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) - - X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) - best_diffs = None - result_scale = None - - fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) - q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - - # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE - min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - min_max_scale_diffs += self._weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - - zp_shape = zp.shape if zp is not None else None - key = [(wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape] - if zp is not None: - key += zp_shape - key = tuple(key) - if key in compress_decompress_cache: - compress_decompress_model = compress_decompress_cache[key]["compress_decompress_model"] - compress_model = compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = self._backend_entity.get_compress_decompress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = self._backend_entity.get_compress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } - - scale_sign = scale / fns.abs(scale) - zero_scale = 0.001 - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + original_weight = fns.zeros_like(weight) + weight - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) - # iterative rectification of initial scale - for i in range(self._initial_steps): - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data + compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) + if zp is not None: + zp = 
zp.astype(scale.dtype) + q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + s = fns.unsqueeze(s, 0) + s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) - if best_diffs is None: - best_diffs = min_max_scale_diffs + # all weight in group has importance based on corresponding input activations + importance = fns.ones_like(original_weight) + importance = importance * s - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + importance = fns.where(zero_mask, 0.0, importance) - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + # normalize importances for every group of weights to make sum of them equal to 1.0 + denum = fns.sum(importance, axis=2, keepdims=True) + importance = importance / (denum + eps) - mask = fns.unsqueeze(mask, axis=2) + X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) + best_diffs = None + result_scale = None - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data + fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) + q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - if i < self._initial_steps - 1: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE + min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - # iterative rectification of scale based on grid search - for scale_steps in range(self._scale_steps): - factor = 1.0 - 0.05 * scale_steps - scaled_scale = factor * scale + zp_shape = zp.shape if zp is not None else None + key = (config.mode, config.num_bits) + q_weights.shape + scale.shape + if zp is not None: + key += zp_shape + if key in ScaleEstimation.compress_decompress_cache: + compress_decompress_model = ScaleEstimation.compress_decompress_cache[key]["compress_decompress_model"] + compress_model = ScaleEstimation.compress_decompress_cache[key]["compress_model"] + else: + compress_decompress_model = backend_entity.get_compress_decompress_pipeline( + config, q_weights.shape, scale.shape, zp_shape + ) + compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) + ScaleEstimation.compress_decompress_cache[key] = { + 
"compress_decompress_model": compress_decompress_model, + "compress_model": compress_model, + } + scale_sign = scale / fns.abs(scale) + zero_scale = 0.001 + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) + # iterative rectification of initial scale + for i in range(initial_steps): + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + input_tensors[1] = near_to_ideal_scale.data + + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + if best_diffs is None: + best_diffs = min_max_scale_diffs + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data - input_tensors[1] = scaled_scale.data + if i < initial_steps - 1: out = compress_model(input_tensors) compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out + # iterative rectification of scale based on grid search + for scale_steps in range(scale_steps): + factor = 1.0 - 0.05 * scale_steps + scaled_scale = factor * scale + + input_tensors[1] = scaled_scale.data + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + + input_tensors[1] = near_to_ideal_scale.data + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) 
- best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - mask = fns.unsqueeze(mask, axis=2) + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale - if config.group_size == -1: - result_scale = fns.squeeze(result_scale, axis=1) - res[weight_name] = result_scale + if config.group_size == -1: + result_scale = fns.squeeze(result_scale, axis=1) - return res + return result_scale, zp -def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = None) -> Tuple[TTensor, TTensor]: +def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: """ Computes the target values and a mask indicating zero values in the target. @@ -310,7 +360,7 @@ def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = No return target, zero_mask -def estimate_scales(weight: TTensor, target: TTensor, zero_mask: TTensor, importance: TTensor) -> TTensor: +def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: """ Estimates scales for the given weight, target, zero mask, and importance. diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index e96c4526c51..60baeacc48e 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -482,11 +482,6 @@ def compress_weights( if any((gptq, lora_correction)) and (dataset is None or mode == CompressWeightsMode.E2M1): raise AttributeError("GPTQ or Lora Correction algorithm is defined, but dataset is None or mode is E2M1.") - if gptq and scale_estimation: - raise AttributeError( - "Simultaneous use of Scale estimation and GPTQ algorithms is not supported. Select one of them." - ) - if gptq and lora_correction: raise AttributeError( "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them." 
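With the scale-estimation/GPTQ restriction removed from compress_weights above, both options can now be requested together; GPTQ then calls ScaleEstimation.calculate_quantization_params for each 4-bit weight group instead of running scale estimation as a separate pass. A minimal usage sketch, assuming the compress_weights keyword arguments already used in this series (`model` and `calibration_items` are placeholders, not names from the patch):

    import nncf

    compressed_model = nncf.compress_weights(
        model,                                    # an OpenVINO model (placeholder)
        mode=nncf.CompressWeightsMode.INT4_SYM,
        group_size=16,
        dataset=nncf.Dataset(calibration_items),  # iterable of model inputs (placeholder)
        gptq=True,
        scale_estimation=True,                    # no longer mutually exclusive with gptq
    )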
diff --git a/tests/openvino/native/quantization/test_gptq.py b/tests/openvino/native/quantization/test_gptq.py index 1202b216ec7..ad19990eac0 100644 --- a/tests/openvino/native/quantization/test_gptq.py +++ b/tests/openvino/native/quantization/test_gptq.py @@ -341,7 +341,8 @@ def test_calculate_scale_linear(): gptq._set_backend_entity(ov_model) nodes = graph.get_all_nodes() - H = gptq._calculate_hessian(nodes[1], [Tensor(inp) for inp in inputs]) + wrapped_inputs = [Tensor(inp) for inp in inputs] + H = gptq._calculate_hessian(nodes[1], wrapped_inputs) ref_H = ref_gptq.H.numpy() assert np.all(np.isclose(ref_H, H.data)) @@ -351,7 +352,7 @@ def test_calculate_scale_linear(): ) wc_params.compression_config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_SYM, group_size=16) - scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H) + scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H, wrapped_inputs) ref_scale = ref_scale.numpy() scale = scale.reshape(ref_scale.shape) assert np.all(np.isclose(ref_scale, scale.data)) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index bb9b5c373c7..c51cf667ca2 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -713,10 +713,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize("mode", INT4_MODES) @pytest.mark.parametrize( "params", - ( - {"dataset": "anything", "scale_estimation": True, "gptq": True}, - {"dataset": "anything", "lora_correction": True, "gptq": True}, - ), + ({"dataset": "anything", "lora_correction": True, "gptq": True},), ) def test_raise_error_with_unsupported_params_for_int4(mode, params): with pytest.raises(AttributeError): From ee648777dcb951f4c7bdadd3997680a5083645a7 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Wed, 4 Sep 2024 13:25:22 +0400 Subject: [PATCH 02/25] fix for INT4_ASYM --- nncf/quantization/algorithms/weight_compression/gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b1101916da3..bd6518c86ad 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -273,7 +273,7 @@ def _quantize_weights( wc_params.compression_config, ) scales.append(scale.squeeze(axis=1)) - zero_points.append(zero_point) + zero_points.append(zero_point if zero_point is None else zero_point.squeeze(axis=1)) else: scale, zero_point = calculate_integer_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], From 3bcd47bf5322af7311a47b03ddd850fa773d5fb7 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 11 Jul 2025 10:59:02 +0200 Subject: [PATCH 03/25] Initial codebook estimation algorithm. 
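The new CodebookEstimation algorithm drafts a per-layer 16-entry codebook: each weight is normalized by its per-group float-quantization scale, the normalized values are clustered with a histogram-based k-means (KMeansHist) whose extreme and zero centroids stay pinned, and candidate f8e4m3 scale factors are prepared for the codebook search that later patches flesh out. For orientation only, a plain NumPy 1-D k-means over the normalized weights (illustrative; not the histogram-accelerated implementation added below) would look roughly like this:

    import numpy as np

    def kmeans_codebook(norm_weight, n_centroids=16, iters=10):
        # Cluster the flattened, normalized weights into n_centroids values (the codebook)
        # and return per-element indexes into it. This needs O(N * n_centroids) memory;
        # the patch avoids that cost by clustering histogram bin centers weighted by counts.
        w = norm_weight.flatten()
        centroids = np.linspace(w.min(), w.max(), n_centroids)
        for _ in range(iters):
            idx = np.argmin(np.abs(w[:, None] - centroids[None, :]), axis=1)
            for i in range(n_centroids):
                members = w[idx == i]
                if members.size:
                    centroids[i] = members.mean()
        idx = np.argmin(np.abs(w[:, None] - centroids[None, :]), axis=1)
        return centroids, idx.reshape(norm_weight.shape)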
--- .../weight_compression/codebook_estimation.py | 377 ++++++++++++++++++ 1 file changed, 377 insertions(+) create mode 100644 src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py new file mode 100644 index 00000000000..afe721c7ebe --- /dev/null +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -0,0 +1,377 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import Optional, TypeVar +import numpy as np + +import nncf +from nncf.common.graph.graph import NNCFGraph +from nncf.common.logging.track_progress import track +from nncf.common.utils.backend import BackendType +from nncf.common.utils.backend import get_backend +from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic +from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params +from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor import functions as fns + +TModel = TypeVar("TModel") + + +f8e4m3_data = np.array( + [-4.4800000e+02, -4.1600000e+02, -3.8400000e+02, -3.5200000e+02 + , -3.2000000e+02, -2.8800000e+02, -2.5600000e+02, -2.4000000e+02 + , -2.2400000e+02, -2.0800000e+02, -1.9200000e+02, -1.7600000e+02 + , -1.6000000e+02, -1.4400000e+02, -1.2800000e+02, -1.2000000e+02 + , -1.1200000e+02, -1.0400000e+02, -9.6000000e+01, -8.8000000e+01 + , -8.0000000e+01, -7.2000000e+01, -6.4000000e+01, -6.0000000e+01 + , -5.6000000e+01, -5.2000000e+01, -4.8000000e+01, -4.4000000e+01 + , -4.0000000e+01, -3.6000000e+01, -3.2000000e+01, -3.0000000e+01 + , -2.8000000e+01, -2.6000000e+01, -2.4000000e+01, -2.2000000e+01 + , -2.0000000e+01, -1.8000000e+01, -1.6000000e+01, -1.5000000e+01 + , -1.4000000e+01, -1.3000000e+01, -1.2000000e+01, -1.1000000e+01 + , -1.0000000e+01, -9.0000000e+00, -8.0000000e+00, -7.5000000e+00 + , -7.0000000e+00, -6.5000000e+00, -6.0000000e+00, 
-5.5000000e+00 + , -5.0000000e+00, -4.5000000e+00, -4.0000000e+00, -3.7500000e+00 + , -3.5000000e+00, -3.2500000e+00, -3.0000000e+00, -2.7500000e+00 + , -2.5000000e+00, -2.2500000e+00, -2.0000000e+00, -1.8750000e+00 + , -1.7500000e+00, -1.6250000e+00, -1.5000000e+00, -1.3750000e+00 + , -1.2500000e+00, -1.1250000e+00, -1.0000000e+00, -9.3750000e-01 + , -8.7500000e-01, -8.1250000e-01, -7.5000000e-01, -6.8750000e-01 + , -6.2500000e-01, -5.6250000e-01, -5.0000000e-01, -4.6875000e-01 + , -4.3750000e-01, -4.0625000e-01, -3.7500000e-01, -3.4375000e-01 + , -3.1250000e-01, -2.8125000e-01, -2.5000000e-01, -2.3437500e-01 + , -2.1875000e-01, -2.0312500e-01, -1.8750000e-01, -1.7187500e-01 + , -1.5625000e-01, -1.4062500e-01, -1.2500000e-01, -1.1718750e-01 + , -1.0937500e-01, -1.0156250e-01, -9.3750000e-02, -7.8125000e-02 + , -7.0312500e-02, -5.8593750e-02, -5.0781250e-02, -3.9062500e-02 + , -2.9296875e-02, -1.9531250e-02, -9.7656250e-03, 0.0000000e+00 + , 9.7656250e-03, 1.9531250e-02, 2.9296875e-02, 3.9062500e-02 + , 5.0781250e-02, 5.8593750e-02, 7.0312500e-02, 7.8125000e-02 + , 9.3750000e-02, 1.0156250e-01, 1.0937500e-01, 1.1718750e-01 + , 1.2500000e-01, 1.4062500e-01, 1.5625000e-01, 1.7187500e-01 + , 1.8750000e-01, 2.0312500e-01, 2.1875000e-01, 2.3437500e-01 + , 2.5000000e-01, 2.8125000e-01, 3.1250000e-01, 3.4375000e-01 + , 3.7500000e-01, 4.0625000e-01, 4.3750000e-01, 4.6875000e-01 + , 5.0000000e-01, 5.6250000e-01, 6.2500000e-01, 6.8750000e-01 + , 7.5000000e-01, 8.1250000e-01, 8.7500000e-01, 9.3750000e-01 + , 1.0000000e+00, 1.1250000e+00, 1.2500000e+00, 1.3750000e+00 + , 1.5000000e+00, 1.6250000e+00, 1.7500000e+00, 1.8750000e+00 + , 2.0000000e+00, 2.2500000e+00, 2.5000000e+00, 2.7500000e+00 + , 3.0000000e+00, 3.2500000e+00, 3.5000000e+00, 3.7500000e+00 + , 4.0000000e+00, 4.5000000e+00, 5.0000000e+00, 5.5000000e+00 + , 6.0000000e+00, 6.5000000e+00, 7.0000000e+00, 7.5000000e+00 + , 8.0000000e+00, 9.0000000e+00, 1.0000000e+01, 1.1000000e+01 + , 1.2000000e+01, 1.3000000e+01, 1.4000000e+01, 1.5000000e+01 + , 1.6000000e+01, 1.8000000e+01, 2.0000000e+01, 2.2000000e+01 + , 2.4000000e+01, 2.6000000e+01, 2.8000000e+01, 3.0000000e+01 + , 3.2000000e+01, 3.6000000e+01, 4.0000000e+01, 4.4000000e+01 + , 4.8000000e+01, 5.2000000e+01, 5.6000000e+01, 6.0000000e+01 + , 6.4000000e+01, 7.2000000e+01, 8.0000000e+01, 8.8000000e+01 + , 9.6000000e+01, 1.0400000e+02, 1.1200000e+02, 1.2000000e+02 + , 1.2800000e+02, 1.4400000e+02, 1.6000000e+02, 1.7600000e+02 + , 1.9200000e+02, 2.0800000e+02, 2.2400000e+02, 2.4000000e+02 + , 2.5600000e+02, 2.8800000e+02, 3.2000000e+02, 3.5200000e+02 + , 3.8400000e+02, 4.1600000e+02, 4.4800000e+02] +) + + +class CodebookEstimation: + """ + Scale estimation algorithm implementation. + """ + + def __init__( + self, + ): + """ + :param subset_size: The number of samples for scale estimation. + :param initial_steps: The number of the steps for absmax scale rectification. + :param scale_steps: The number of the steps for grid search scale rectification + from 1.0 to 1.0 - 0.05 * scale_step. + :param weight_penalty: coefficient for penalty between fp and compressed weights. If -1 then doesn't apply. + """ + super().__init__() + + @property + def available_backends(self) -> list[BackendType]: + return [BackendType.OPENVINO] + + def _set_backend_entity(self, model: TModel) -> None: + """ + Creates a helper class with a backed-specific logic of the algorithm. + + :param model: Backend-specific input model. 
+ """ + model_backend = get_backend(model) + if model_backend == BackendType.OPENVINO: + from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend + + self._backend_entity = OVWeightCompressionAlgoBackend(model) + else: + msg = ( + "Cannot return backend-specific Scale Estimation entity because" + f" {model_backend.value} is not supported!" + ) + raise nncf.UnsupportedBackendError(msg) + + def apply( + self, + model: TModel, + graph: NNCFGraph, + all_weight_params: list[WeightCompressionParameters], + statistics: dict[str, WCTensorStatistic], + backend_entity: Optional[WeightCompressionAlgoBackend] = None, + ) -> dict[str, CompressedWeight]: + """ + Estimates better scale for the int4 nodes in the model. + Minimizes per-group difference between floating point MatMul and + MatMul with compressed weights. + The algorithm computes weighted scale for the group of weights in MatMul, which + shared the same scale. + + :param model: Model for applying algorithm. + :param graph: Model graph. + :param all_weight_params: List of all weight parameters. + :param statistics: Input activation statistics for each node. + :param statistic_points: Statistic points with collected statistics values. + :param dataset: A representative dataset for the calibration process. + :param backend_entity: Weight compression algorithm backend. + :return: Two dictionaries for estimated scales and zero points for each weight name. + """ + self._backend_entity = backend_entity + if self._backend_entity is None: + self._set_backend_entity(model) + res = dict() + + for wp in track(all_weight_params, description="Applying Codebook Estimation"): + weight_name = wp.weight_name + node_name = wp.node_with_weight.node_name + config = wp.compression_config + + if config.num_bits != 4 or node_name not in statistics: + res[weight_name] = CompressedWeight() + continue + + stats = statistics[node_name] + + weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) + if len(weight_data) != 1: # not supported by the algorithm + continue + _, weight_port_id = weight_data[0] + + weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) + + scale, zero_point = self.calculate_quantization_params( + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) + res[weight_name] = CompressedWeight(None, scale, zero_point, None) + + return res + + @staticmethod + def calculate_codebook( + statistics: WCTensorStatistic, + weight: Tensor, + reduction_axes: tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + + reduction_axis = reduction_axes[0] + weight = deepcopy(weight.astype(TensorDataType.float32)) + + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 + + weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) + + scale = calculate_float_quantization_params(weight, reduction_axes, config) + norm_weight = _calculate_normalized_weight(weight, scale) + + codebook, indexes = weights_clusterization_k_means(norm_weight) + + + + + fp8_scales = np.unique(np.abs(f8e4m3_data)) + fp8_scales = fp8_scales[scale >= 1.0] + + + + + return None + + +def most_common(lst): + """ + Return the most frequently occuring element in a list. 
+ """ + return max(set(lst), key=lst.count) + + +def euclidean_(point, data): + """ + Return euclidean distances between a point & a dataset + """ + return np.sqrt(np.sum((point - data) ** 2, axis=1)) + + +def round(quantiles, values): + center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) + + return np.searchsorted(center_of_quantiles, values, side='left', sorter=None) + + +class KMeansHist: + def __init__(self, n_clusters=8, max_iter=300): + self.n_clusters = n_clusters + self.max_iter = max_iter + + @staticmethod + def get_init(values, frequencies, n_clusters): + step = 1.0 / n_clusters + denum = np.sum(frequencies) + quants = [i * step for i in range(n_clusters)] + n_frequencies = frequencies / denum + n_frequencies = np.cumsum(n_frequencies) + + res = [] + for i in range(len(quants)): + if i == 0: + res.append(values[0]) + elif i == len(quants) - 1: + res.append(values[-1]) + else: + prev = values[np.where(n_frequencies <= quants[i])[0][-1]] + next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]] + res.append((prev + next_) / 2) + + res = np.array(res).reshape(1, -1) + return res + + @staticmethod + def create_histogramm(data, data_range=(-1.0, 1.0), granularity=0.01): + centers = [] + step = granularity + prev = data_range[0] + + while prev < data_range[1]: + centers.append(prev + step / 2) + prev += step + + centers = np.array(centers) + centroid_idxs = round(centers, data) + + res = [[], [], []] + for i in range(centers.shape[1]): + idxs = np.where(centroid_idxs == i) + if len(idxs[0]) == 0: + continue + res[0].append(centers[i]) + res[1].append(np.sum(data[idxs, :])) + res[2].append(len(idxs[0])) + + res[0] = np.array(res[0]).reshape(-1, 1) + res[1] = np.array(res[1]) + res[2] = np.array(res[2]) + + return res + + def fit(self, X_train, init, fixed=[]): + if self.max_iter == 1: + self.centroids = deepcopy(init) + return + + self.hist = self.create_histogramm(X_train) + + init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) + init_by_hist[0, 0] = init[0] + init_by_hist[0, -1] = init[-1] + zero_idx = np.argmin(np.abs(init_by_hist[0, :])) + init_by_hist[0, zero_idx] = init[0, zero_idx] + fixed[1] = zero_idx + init = init_by_hist + + self.centroids = deepcopy(init) + + iteration = 0 + prev_centroids = self.centroids + while iteration < self.max_iter: + prev_centroids = deepcopy(self.centroids) + + centroid_idxs = round(self.centroids, self.hist[0]) + for i in range(self.n_clusters): + idxs = np.where(centroid_idxs == i) + self.centroids[:, i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) + + for i, centroid in enumerate(self.centroids): + if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points + self.centroids[i] = prev_centroids[i] + for idx in fixed: + self.centroids[:, idx] = init[:, idx] + iteration += 1 + if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): + break + print(self.centroids) + + def evaluate(self, X): + centroid_idxs = round(self.centroids, X) + return deepcopy(self.centroids).flatten(), centroid_idxs + + +def weights_clusterization_k_means(weight, n_centroids=2**4): + weight = weight.as_numpy_tensor().data + + ow = deepcopy(weight) + orig_shape = weight.shape + weight = weight.flatten() + + n_init = [0, 0] + n_init[0] = weight.min() + n_init[-1] = weight.max() + + kmeans = KMeansHist(n_centroids, max_iter=1) + + n_init = kmeans.get_init(weight, n_init, n_centroids) + + #kmeans.fit(weight.reshape(-1, 1), n_init.reshape(1, -1), fixed=[0, 7, 15]) + kmeans.fit(weight, 
n_init, fixed=[0, 7, 15]) + codebook, indexes = kmeans.evaluate(weight.reshape(-1, 1)) + # codebook = kmeans.cluster_centers_.flatten() + # indexes = kmeans.labels_ + + indexes = np.reshape(indexes, orig_shape) + + print(orig_shape, np.mean(np.abs(ow - codebook[indexes]))) + + return codebook, indexes From eb93fdbdf1bb7897b57afb553ee2f01ba7d1df2d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 14 Jul 2025 14:54:05 +0200 Subject: [PATCH 04/25] First working example for layer wise codebook. --- .../weight_compression/algorithm.py | 12 ++ .../weight_compression/codebook_estimation.py | 115 ++++++++++++------ 2 files changed, 93 insertions(+), 34 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 7211064a25c..c4479680c4c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -40,6 +40,7 @@ from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES +from nncf.quantization.algorithms.weight_compression.codebook_estimation import CodebookEstimation from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -352,6 +353,8 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) + + self._codebook_estimation_algo = CodebookEstimation() self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -785,6 +788,15 @@ def apply( precomputed_compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" + + if self._mode == CompressWeightsMode.CODEBOOK: + precomputed_compressed_weights = self._codebook_estimation_algo.apply( + model=model, + graph=graph, + all_weight_params=all_weight_params, + statistics=statistics, + backend_entity=self._backend_entity, + ) if self._gptq: del statistics diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index afe721c7ebe..67908c33db9 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -13,6 +13,9 @@ from typing import Optional, TypeVar import numpy as np +import openvino as ov +from openvino.runtime import opset13 as opset + import nncf from nncf.common.graph.graph import NNCFGraph from nncf.common.logging.track_progress import track @@ -93,6 +96,20 @@ , 3.8400000e+02, 4.1600000e+02, 4.4800000e+02] ) +def fp8_convert(in_shape): + input = opset.parameter( + in_shape, dtype=ov.Type.f32 + ) + scale_convert = opset.convert(input, ov.Type.f8e4m3) + scale_convert = opset.convert(scale_convert, ov.Type.f32) + result = opset.result(scale_convert, name="Result") + result.get_output_tensor(0).set_names(set(["Result"])) + model = ov.Model([result], [input]) + + compiled_model = ov.compile_model(model) + + return compiled_model + class CodebookEstimation: """ @@ -167,11 +184,11 @@ def apply( 
node_name = wp.node_with_weight.node_name config = wp.compression_config - if config.num_bits != 4 or node_name not in statistics: + if config.num_bits != 4:# or node_name not in statistics: res[weight_name] = CompressedWeight() continue - stats = statistics[node_name] + stats = None #statistics[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm @@ -180,17 +197,17 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - scale, zero_point = self.calculate_quantization_params( + codebook, scale, indexes = self.calculate_codebook( stats, weight, wp.reduction_axes, config, - self._subset_size, - self._initial_steps, - self._scale_steps, - self._weight_penalty, + # self._subset_size, + # self._initial_steps, + # self._scale_steps, + # self._weight_penalty, ) - res[weight_name] = CompressedWeight(None, scale, zero_point, None) + res[weight_name] = CompressedWeight(indexes, scale, None, codebook) return res @@ -200,10 +217,10 @@ def calculate_codebook( weight: Tensor, reduction_axes: tuple[int, ...], config: WeightCompressionConfig, - subset_size: int = 32, - initial_steps: int = 5, - scale_steps: int = 10, - weight_penalty: float = -1.0, + # subset_size: int = 32, + # initial_steps: int = 5, + # scale_steps: int = 10, + # weight_penalty: float = -1.0, ) -> Tensor: reduction_axis = reduction_axes[0] @@ -213,23 +230,49 @@ def calculate_codebook( weight = fns.transpose(weight) reduction_axis = 1 - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) + if config.group_size != -1: + weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) + + orig_shape = weight.shape scale = calculate_float_quantization_params(weight, reduction_axes, config) norm_weight = _calculate_normalized_weight(weight, scale) - codebook, indexes = weights_clusterization_k_means(norm_weight) - + codebook, indexes = weights_clusterization_k_means(norm_weight)#.as_numpy_tensor().data) - + converter = fp8_convert(codebook.shape) + indexes = indexes.reshape(orig_shape) fp8_scales = np.unique(np.abs(f8e4m3_data)) - fp8_scales = fp8_scales[scale >= 1.0] - + fp8_scales = fp8_scales[fp8_scales >= 1.0] + best_codebook = None + min_diff = float("inf") + best_scale = None + weight = weight.as_numpy_tensor().data + scale = scale.as_numpy_tensor().data + + min_diffs = [] + for fp8_scale in fp8_scales: + scaled_codebook = codebook * fp8_scale + scaled_codebook = converter(scaled_codebook)[0] + + + dequantized_weight = scaled_codebook[indexes] + dequantized_weight = dequantized_weight * scale + dequantized_weight = dequantized_weight / fp8_scale + + diff = np.mean(np.abs(weight - dequantized_weight)) + + if diff < min_diff: + min_diff = diff + best_codebook = deepcopy(scaled_codebook) + best_scale = fp8_scale + min_diffs.append(min_diff) - return None + print("\t", min_diffs) + return Tensor(best_codebook), Tensor(scale / best_scale), indexes def most_common(lst): @@ -272,17 +315,21 @@ def get_init(values, frequencies, n_clusters): elif i == len(quants) - 1: res.append(values[-1]) else: - prev = values[np.where(n_frequencies <= quants[i])[0][-1]] - next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]] + prev = values[np.where(n_frequencies <= quants[i])[0][-1]].item() + next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]].item() res.append((prev + next_) / 
2) - res = np.array(res).reshape(1, -1) + res = np.array(res)#.reshape(1, -1) return res @staticmethod - def create_histogramm(data, data_range=(-1.0, 1.0), granularity=0.01): + def create_histogramm(data, granularity=0.01): centers = [] step = granularity + + granularity = granularity * (data.max() - data.min()) + + data_range=(data.min().item(), data.max().item()) prev = data_range[0] while prev < data_range[1]: @@ -293,15 +340,15 @@ def create_histogramm(data, data_range=(-1.0, 1.0), granularity=0.01): centroid_idxs = round(centers, data) res = [[], [], []] - for i in range(centers.shape[1]): + for i in range(centers.size): idxs = np.where(centroid_idxs == i) if len(idxs[0]) == 0: continue res[0].append(centers[i]) - res[1].append(np.sum(data[idxs, :])) + res[1].append(np.sum(data[idxs])) res[2].append(len(idxs[0])) - res[0] = np.array(res[0]).reshape(-1, 1) + res[0] = np.array(res[0])#.reshape(-1, 1) res[1] = np.array(res[1]) res[2] = np.array(res[2]) @@ -315,10 +362,10 @@ def fit(self, X_train, init, fixed=[]): self.hist = self.create_histogramm(X_train) init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) - init_by_hist[0, 0] = init[0] - init_by_hist[0, -1] = init[-1] - zero_idx = np.argmin(np.abs(init_by_hist[0, :])) - init_by_hist[0, zero_idx] = init[0, zero_idx] + init_by_hist[0] = init[0] + init_by_hist[0] = init[-1] + zero_idx = np.argmin(np.abs(init_by_hist[:])) + init_by_hist[zero_idx] = 0.0 #init[0, zero_idx] fixed[1] = zero_idx init = init_by_hist @@ -332,13 +379,13 @@ def fit(self, X_train, init, fixed=[]): centroid_idxs = round(self.centroids, self.hist[0]) for i in range(self.n_clusters): idxs = np.where(centroid_idxs == i) - self.centroids[:, i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) + self.centroids[i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) for i, centroid in enumerate(self.centroids): if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points self.centroids[i] = prev_centroids[i] for idx in fixed: - self.centroids[:, idx] = init[:, idx] + self.centroids[idx] = init[idx] iteration += 1 if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): break @@ -360,9 +407,9 @@ def weights_clusterization_k_means(weight, n_centroids=2**4): n_init[0] = weight.min() n_init[-1] = weight.max() - kmeans = KMeansHist(n_centroids, max_iter=1) + kmeans = KMeansHist(n_centroids, max_iter=15) - n_init = kmeans.get_init(weight, n_init, n_centroids) + #n_init = kmeans.get_init(weight, n_init, n_centroids) #kmeans.fit(weight.reshape(-1, 1), n_init.reshape(1, -1), fixed=[0, 7, 15]) kmeans.fit(weight, n_init, fixed=[0, 7, 15]) From 5bfccee6803e603c67fb80c31aa59443dd433ce5 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 16 Jul 2025 17:45:32 +0200 Subject: [PATCH 05/25] Experiment. 
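This experiment narrows codebook estimation to the 'return_proj' MatMuls, seeds the histogram k-means with the CB4_QUANTILES grid (its endpoints replaced by the weight min/max and the zero entry pinned), skips the f8e4m3 scale search so the codebook scale factor stays at 1.0, and lets compress_weight return a precomputed codebook/index result directly instead of re-quantizing the weight.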
--- .../weight_compression/codebook_estimation.py | 47 +++++++++++-------- .../weight_compression/weight_lowering.py | 3 ++ 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 67908c33db9..869d5d16124 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -36,6 +36,8 @@ from nncf.tensor import TensorDataType from nncf.tensor import functions as fns +from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES + TModel = TypeVar("TModel") @@ -183,6 +185,9 @@ def apply( weight_name = wp.weight_name node_name = wp.node_with_weight.node_name config = wp.compression_config + + if not 'return_proj' in node_name: + continue if config.num_bits != 4:# or node_name not in statistics: res[weight_name] = CompressedWeight() @@ -246,33 +251,35 @@ def calculate_codebook( fp8_scales = np.unique(np.abs(f8e4m3_data)) fp8_scales = fp8_scales[fp8_scales >= 1.0] - best_codebook = None + best_codebook = codebook #converter(codebook)[0] + print("Best codebook:", best_codebook) + min_diff = float("inf") - best_scale = None + best_scale = 1.0 weight = weight.as_numpy_tensor().data scale = scale.as_numpy_tensor().data - min_diffs = [] - for fp8_scale in fp8_scales: - scaled_codebook = codebook * fp8_scale - scaled_codebook = converter(scaled_codebook)[0] + # min_diffs = [] + # for fp8_scale in fp8_scales: + # scaled_codebook = codebook * fp8_scale + # scaled_codebook = converter(scaled_codebook)[0] - dequantized_weight = scaled_codebook[indexes] - dequantized_weight = dequantized_weight * scale - dequantized_weight = dequantized_weight / fp8_scale + # dequantized_weight = scaled_codebook[indexes] + # dequantized_weight = dequantized_weight * scale + # dequantized_weight = dequantized_weight / fp8_scale - diff = np.mean(np.abs(weight - dequantized_weight)) + # diff = np.mean(np.abs(weight - dequantized_weight)) - if diff < min_diff: - min_diff = diff - best_codebook = deepcopy(scaled_codebook) - best_scale = fp8_scale - min_diffs.append(min_diff) + # if diff < min_diff: + # min_diff = diff + # best_codebook = deepcopy(scaled_codebook) + # best_scale = fp8_scale + # min_diffs.append(min_diff) - print("\t", min_diffs) - return Tensor(best_codebook), Tensor(scale / best_scale), indexes + #print("\t", min_diffs) + return Tensor(best_codebook), Tensor(scale / best_scale), Tensor(indexes) def most_common(lst): @@ -361,9 +368,9 @@ def fit(self, X_train, init, fixed=[]): self.hist = self.create_histogramm(X_train) - init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) + init_by_hist = CB4_QUANTILES #self.get_init(self.hist[0], self.hist[2], self.n_clusters) init_by_hist[0] = init[0] - init_by_hist[0] = init[-1] + init_by_hist[-1] = init[-1] zero_idx = np.argmin(np.abs(init_by_hist[:])) init_by_hist[zero_idx] = 0.0 #init[0, zero_idx] fixed[1] = zero_idx @@ -407,7 +414,7 @@ def weights_clusterization_k_means(weight, n_centroids=2**4): n_init[0] = weight.min() n_init[-1] = weight.max() - kmeans = KMeansHist(n_centroids, max_iter=15) + kmeans = KMeansHist(n_centroids, max_iter=25) #n_init = kmeans.get_init(weight, n_init, n_centroids) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 
a08dd1340a8..4ace7cfe6ec 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -329,6 +329,9 @@ def compress_weight( ) if not config.is_integer: + if precomputed_compressed_weight is not None and precomputed_compressed_weight.tensor is not None and precomputed_compressed_weight.codebook is not None: + return precomputed_compressed_weight + compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) if indexes is not None: return CompressedWeight( From 509b6ef9c2f7b74d55701a307e81ee2942bfe7c0 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 17 Jul 2025 11:06:02 +0200 Subject: [PATCH 06/25] Experiment with accuracy improvement. --- .../weight_compression/codebook_estimation.py | 13 +++++++------ .../weight_compression/weight_lowering.py | 9 +++++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 869d5d16124..5ef85a66e6c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -186,8 +186,8 @@ def apply( node_name = wp.node_with_weight.node_name config = wp.compression_config - if not 'return_proj' in node_name: - continue + # if not 'return_proj' in node_name: + # continue if config.num_bits != 4:# or node_name not in statistics: res[weight_name] = CompressedWeight() @@ -240,7 +240,7 @@ def calculate_codebook( orig_shape = weight.shape - scale = calculate_float_quantization_params(weight, reduction_axes, config) + scale = calculate_float_quantization_params(weight, reduction_axes, config, signed=True) norm_weight = _calculate_normalized_weight(weight, scale) codebook, indexes = weights_clusterization_k_means(norm_weight)#.as_numpy_tensor().data) @@ -251,7 +251,7 @@ def calculate_codebook( fp8_scales = np.unique(np.abs(f8e4m3_data)) fp8_scales = fp8_scales[fp8_scales >= 1.0] - best_codebook = codebook #converter(codebook)[0] + best_codebook = converter(codebook)[0] print("Best codebook:", best_codebook) min_diff = float("inf") @@ -334,7 +334,7 @@ def create_histogramm(data, granularity=0.01): centers = [] step = granularity - granularity = granularity * (data.max() - data.min()) + #granularity = granularity * (data.max() - data.min()) data_range=(data.min().item(), data.max().item()) prev = data_range[0] @@ -413,8 +413,9 @@ def weights_clusterization_k_means(weight, n_centroids=2**4): n_init = [0, 0] n_init[0] = weight.min() n_init[-1] = weight.max() + print("n_init:", n_init) - kmeans = KMeansHist(n_centroids, max_iter=25) + kmeans = KMeansHist(n_centroids, max_iter=10) #n_init = kmeans.get_init(weight, n_init, n_centroids) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 4ace7cfe6ec..8219eaf0697 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -78,7 +78,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, signed: bool = 
False ) -> Tensor: """ Calculates the scale for nf4 or e2m1 quantization. @@ -93,7 +93,12 @@ def calculate_float_quantization_params( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) + if signed: + scale_neg = fns.min(weight, axis=reduction_axes, keepdims=True) + scale_pos = fns.max(weight, axis=reduction_axes, keepdims=True) + scale = fns.where(fns.abs(scale_neg) >= fns.abs(scale_pos), scale_neg, scale_pos) + else: + scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else fns.max(fns.abs(config.get_numpy_codebook())) scale = scale / max_val From 872b025d3cf6d63d4aa151982ee6fd5d87792323 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 18 Jul 2025 10:25:34 +0200 Subject: [PATCH 07/25] Fix in histogram computation. --- .../algorithms/weight_compression/codebook_estimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 5ef85a66e6c..1c8b0776c96 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -309,7 +309,7 @@ def __init__(self, n_clusters=8, max_iter=300): @staticmethod def get_init(values, frequencies, n_clusters): - step = 1.0 / n_clusters + step = 1.0 / (n_clusters - 1) denum = np.sum(frequencies) quants = [i * step for i in range(n_clusters)] n_frequencies = frequencies / denum From 9a8d08b3001c5b296388fcdc451b0a5be056c288 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 28 Jul 2025 20:11:58 +0200 Subject: [PATCH 08/25] Experiment.
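This experiment drives the codebook search with activation statistics: per-input-channel importances derived from process_stats() weight the clustering of the normalized weights (KMeansWeighted over an importance-weighted histogram), intermediate centroid sets are kept as candidate codebooks alongside CB4_QUANTILES, and the candidate whose dequantized weights give the smallest mean absolute difference of the MatMul outputs against the floating-point ones is selected.

The core of the weighted clustering is an importance-weighted centroid update: each centroid moves to the importance-weighted mean of the values assigned to it. A rough NumPy sketch of a single update step over raw values (the code below does the equivalent over a pre-binned weighted histogram for speed); names are illustrative only:

    import numpy as np

    def weighted_centroid_update(values, importance, centroids):
        # Assign each value to its nearest centroid via midpoints between sorted
        # centroids, then recompute centroids as importance-weighted means.
        midpoints = 0.5 * (centroids[1:] + centroids[:-1])
        assignment = np.searchsorted(midpoints, values, side="left")
        updated = centroids.copy()
        for k in range(centroids.size):
            mask = assignment == k
            total = importance[mask].sum()
            if total > 0:  # empty clusters keep their previous position
                updated[k] = np.sum(values[mask] * importance[mask]) / total
        return updated

    rng = np.random.default_rng(0)
    values = rng.normal(size=1024).astype(np.float32)
    importance = np.abs(rng.normal(size=1024)).astype(np.float32)
    centroids = np.linspace(values.min(), values.max(), 16)
    print(weighted_centroid_update(values, importance, centroids))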
--- .../weight_compression/algorithm.py | 1 + .../weight_compression/codebook_estimation.py | 310 +++++++++++++++++- .../weight_compression/openvino_backend.py | 4 +- .../weight_compression/scale_estimation.py | 1 + 4 files changed, 296 insertions(+), 20 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index c4479680c4c..790676909d6 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -364,6 +364,7 @@ def __init__( or self._scale_estimation or self._lora_correction or self._gptq + or self._codebook_estimation_algo ) @property diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 1c8b0776c96..67bb89de73e 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -12,6 +12,7 @@ from copy import deepcopy from typing import Optional, TypeVar import numpy as np +import time import openvino as ov from openvino.runtime import opset13 as opset @@ -193,7 +194,7 @@ def apply( res[weight_name] = CompressedWeight() continue - stats = None #statistics[node_name] + stats = statistics[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm @@ -201,18 +202,25 @@ def apply( _, weight_port_id = weight_data[0] weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) + + qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) + print("Initial diff:", np.mean(np.abs(weight.data - qw.data))) codebook, scale, indexes = self.calculate_codebook( stats, weight, wp.reduction_axes, config, + wp # self._subset_size, # self._initial_steps, # self._scale_steps, # self._weight_penalty, ) res[weight_name] = CompressedWeight(indexes, scale, None, codebook) + config.codebook_values = codebook + qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) + print("kmeans diff:", np.mean(np.abs(weight.data - qw.data))) return res @@ -222,6 +230,7 @@ def calculate_codebook( weight: Tensor, reduction_axes: tuple[int, ...], config: WeightCompressionConfig, + wp: WeightCompressionParameters # subset_size: int = 32, # initial_steps: int = 5, # scale_steps: int = 10, @@ -230,6 +239,8 @@ def calculate_codebook( reduction_axis = reduction_axes[0] weight = deepcopy(weight.astype(TensorDataType.float32)) + + s, X = process_stats(statistics, 128) if reduction_axis == 0: weight = fns.transpose(weight) @@ -240,11 +251,14 @@ def calculate_codebook( orig_shape = weight.shape + importance = fns.ones_like(weight) + importance = importance * s + scale = calculate_float_quantization_params(weight, reduction_axes, config, signed=True) norm_weight = _calculate_normalized_weight(weight, scale) - - codebook, indexes = weights_clusterization_k_means(norm_weight)#.as_numpy_tensor().data) - + + codebook, indexes, variants = weights_clusterization_k_means(norm_weight, importance) + converter = fp8_convert(codebook.shape) indexes = indexes.reshape(orig_shape) @@ -252,13 +266,34 @@ def calculate_codebook( fp8_scales = fp8_scales[fp8_scales >= 1.0] best_codebook = converter(codebook)[0] - print("Best codebook:", best_codebook) + #print("Best codebook:", 
best_codebook) min_diff = float("inf") best_scale = 1.0 - weight = weight.as_numpy_tensor().data - scale = scale.as_numpy_tensor().data + #weight = weight.as_numpy_tensor().data + #scale = scale.as_numpy_tensor().data + + fp_outs = fns.matmul(weight, X) + diff = float('inf') + + variants[0] = CB4_QUANTILES + + for var in variants: + var = converter(var)[0] + config.codebook_values = Tensor(var) + qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) + q_outs = fns.matmul(qw, X) + + cur_diff = np.mean(np.abs(fp_outs.data - q_outs.data)) + if cur_diff < diff: + diff = cur_diff + best_codebook = var + else: + print("Was skip: ", diff, cur_diff) + #print("Best codebook:", best_codebook) + + # min_diffs = [] # for fp8_scale in fp8_scales: @@ -279,7 +314,7 @@ def calculate_codebook( # min_diffs.append(min_diff) #print("\t", min_diffs) - return Tensor(best_codebook), Tensor(scale / best_scale), Tensor(indexes) + return Tensor(best_codebook), None, None #Tensor(scale / best_scale), Tensor(indexes) def most_common(lst): @@ -361,14 +396,72 @@ def create_histogramm(data, granularity=0.01): return res + + @staticmethod + def create_histogramm_sorted(data_, granularity=0.01): + centers = [] + ranges = [] + step = granularity + + #granularity = granularity * (data.max() - data.min()) + + data = np.sort(data_) + data_range=(data.min().item(), data.max().item()) + prev = data_range[0] + + + while prev < data_range[1]: + centers.append(prev + step / 2) + prev += step + + if len(centers) > 1: + ranges.append(0.5 * (centers[-2] + centers[-1])) + ranges.append(centers[-1]) + + + centers = np.array(centers) + ranges = np.array(ranges) + + ranges_idxs = round(data, ranges) + + res = [[], [], []] + for i in range(centers.size): + res[0].append(centers[i]) + if i == 0: + res[1].append(np.sum(data[:ranges_idxs[1]])) + res[2].append(ranges_idxs[1]) + elif i == centers.size - 1: + res[1].append(np.sum(data[ranges_idxs[-2]:])) + res[2].append(len(data) - ranges_idxs[-2]) + else: + idx = 2 * i + res[1].append(np.sum(data[ranges_idxs[idx - 1]:ranges_idxs[idx + 1]])) + res[2].append(ranges_idxs[idx + 1] - ranges_idxs[idx - 1] - 1) + + res[0] = np.array(res[0])#.reshape(-1, 1) + res[1] = np.array(res[1]) + res[2] = np.array(res[2]) + + return res + def fit(self, X_train, init, fixed=[]): if self.max_iter == 1: self.centroids = deepcopy(init) return - self.hist = self.create_histogramm(X_train) + # start = time.time() + # self.hist = self.create_histogramm(X_train) + # end = time.time() + # print("create_histogramm", end - start) + + start = time.time() + self.hist = self.create_histogramm_sorted(X_train) + end = time.time() + #print("create_histogramm_sorted", end - start) + + start = time.time() - init_by_hist = CB4_QUANTILES #self.get_init(self.hist[0], self.hist[2], self.n_clusters) + init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) init_by_hist[0] = init[0] init_by_hist[-1] = init[-1] zero_idx = np.argmin(np.abs(init_by_hist[:])) @@ -396,31 +489,214 @@ def fit(self, X_train, init, fixed=[]): iteration += 1 if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): break - print(self.centroids) + end = time.time() + #print("rest", end - start) + #print(self.centroids) def evaluate(self, X): centroid_idxs = round(self.centroids, X) return deepcopy(self.centroids).flatten(), centroid_idxs -def weights_clusterization_k_means(weight, n_centroids=2**4): +class KMeansWeighted: + def __init__(self, n_clusters=8, max_iter=300): + self.n_clusters = n_clusters + self.max_iter 
= max_iter + self.variants = [] + + @staticmethod + def get_init(values, frequencies, n_clusters): + step = 1.0 / (n_clusters - 1) + denum = np.sum(frequencies) + quants = [i * step for i in range(n_clusters)] + n_frequencies = frequencies / denum + n_frequencies = np.cumsum(n_frequencies) + + res = [] + for i in range(len(quants)): + if i == 0: + res.append(values[0]) + elif i == len(quants) - 1: + res.append(values[-1]) + else: + prev = values[np.where(n_frequencies <= quants[i])[0][-1]].item() + next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]].item() + res.append((prev + next_) / 2) + + res = np.array(res)#.reshape(1, -1) + return res + + @staticmethod + def create_histogramm(data, granularity=0.01): + centers = [] + step = granularity + + #granularity = granularity * (data.max() - data.min()) + + data_range=(data.min().item(), data.max().item()) + prev = data_range[0] + + while prev < data_range[1]: + centers.append(prev + step / 2) + prev += step + + centers = np.array(centers) + centroid_idxs = round(centers, data) + + res = [[], [], []] + for i in range(centers.size): + idxs = np.where(centroid_idxs == i) + if len(idxs[0]) == 0: + continue + res[0].append(centers[i]) + res[1].append(np.sum(data[idxs])) + res[2].append(len(idxs[0])) + + res[0] = np.array(res[0])#.reshape(-1, 1) + res[1] = np.array(res[1]) + res[2] = np.array(res[2]) + + return res + + @staticmethod + def add_weighted_data_and_weights(res, data): + res[1].append(np.multiply(data[0, :], data[1, :]).sum()) + res[2].append(np.sum(data[1, :])) + + @staticmethod + def create_histogramm_sorted(data_, importance, granularity=0.01): + centers = [] + ranges = [] + step = data_.max().item() * granularity / 3.5 + + #granularity = granularity * (data.max() - data.min()) + + data = np.array([data_, importance]) + + #data = np.sort(data, axis=1) + + data = data[:, data[0, :].argsort()] + + data_range=(data.min().item(), data.max().item()) + prev = data_range[0] + + + while prev < data_range[1]: + centers.append(prev + step / 2) + prev += step + + if len(centers) > 1: + ranges.append(0.5 * (centers[-2] + centers[-1])) + ranges.append(centers[-1]) + + + centers = np.array(centers) + ranges = np.array(ranges) + + ranges_idxs = round(data[0], ranges) + + res = [[], [], []] + for i in range(centers.size): + res[0].append(centers[i]) + if i == 0: + # res[1].append(np.sum(data[0, :ranges_idxs[1]])) + # res[2].append(ranges_idxs[1]) + KMeansWeighted.add_weighted_data_and_weights(res, data[:, :ranges_idxs[1]]) + elif i == centers.size - 1: + # res[1].append(np.sum(data[ranges_idxs[-2]:])) + # res[2].append(len(data) - ranges_idxs[-2]) + KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[-2]:]) + else: + idx = 2 * i + # res[1].append(np.sum(data[ranges_idxs[idx - 1]:ranges_idxs[idx + 1]])) + # res[2].append(ranges_idxs[idx + 1] - ranges_idxs[idx - 1] - 1) + KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[idx - 1]:ranges_idxs[idx + 1]]) + + res[0] = np.array(res[0])#.reshape(-1, 1) + res[1] = np.array(res[1]) + res[2] = np.array(res[2]) + + return res + + def fit(self, X_train, importance, init, fixed=[]): + if self.max_iter == 1: + self.centroids = deepcopy(init) + return + + # start = time.time() + # self.hist = self.create_histogramm(X_train) + # end = time.time() + # print("create_histogramm", end - start) + + start = time.time() + self.hist = KMeansWeighted.create_histogramm_sorted(X_train, importance) + end = time.time() + #print("create_histogramm_sorted", end - start) + + start = 
time.time() + + init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) + init_by_hist[0] = init[0] + init_by_hist[-1] = init[-1] + zero_idx = np.argmin(np.abs(init_by_hist[:])) + init_by_hist[zero_idx] = 0.0 #init[0, zero_idx] + fixed[1] = zero_idx + init = init_by_hist + + self.centroids = deepcopy(init) + + iteration = 0 + prev_centroids = self.centroids + while iteration < self.max_iter: + prev_centroids = deepcopy(self.centroids) + + if iteration % 5 == 0: + self.variants.append(deepcopy(self.centroids)) + + centroid_idxs = round(self.centroids, self.hist[0]) + for i in range(self.n_clusters): + idxs = np.where(centroid_idxs == i) + self.centroids[i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) + + for i, centroid in enumerate(self.centroids): + if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points + self.centroids[i] = prev_centroids[i] + for idx in fixed: + self.centroids[idx] = init[idx] + iteration += 1 + if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): + break + + self.variants.append(deepcopy(self.centroids)) + end = time.time() + #print("rest", end - start) + #print(self.centroids) + + def evaluate(self, X): + centroid_idxs = round(self.centroids, X) + return deepcopy(self.centroids).flatten(), centroid_idxs + + +def weights_clusterization_k_means(weight, importance, n_centroids=2**4): weight = weight.as_numpy_tensor().data + importance = importance.as_numpy_tensor().data ow = deepcopy(weight) orig_shape = weight.shape weight = weight.flatten() - + importance = importance.flatten() + n_init = [0, 0] n_init[0] = weight.min() n_init[-1] = weight.max() - print("n_init:", n_init) + #print("n_init:", n_init) - kmeans = KMeansHist(n_centroids, max_iter=10) + kmeans = KMeansWeighted(n_centroids, max_iter=70) #n_init = kmeans.get_init(weight, n_init, n_centroids) #kmeans.fit(weight.reshape(-1, 1), n_init.reshape(1, -1), fixed=[0, 7, 15]) - kmeans.fit(weight, n_init, fixed=[0, 7, 15]) + kmeans.fit(weight, importance, n_init, fixed=[0, 7, 15]) codebook, indexes = kmeans.evaluate(weight.reshape(-1, 1)) # codebook = kmeans.cluster_centers_.flatten() # indexes = kmeans.labels_ @@ -429,4 +705,4 @@ def weights_clusterization_k_means(weight, n_centroids=2**4): print(orig_shape, np.mean(np.abs(ow - codebook[indexes]))) - return codebook, indexes + return codebook, indexes, kmeans.variants diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 4e8766620f9..d3f542e340e 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -254,9 +254,7 @@ def _create_compression_subgraph( n_quants = compressed_weight.codebook.size - 1 compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook - if compression_config.mode == CompressWeightsMode.CODEBOOK - else compressed_weight.codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3), + compressed_weight.codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3), indexes=compressed_weight.tensor, dtype=compression_dtype, name=const_node_name, diff --git a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 83cc1449755..a9b8d8dd9c6 
100644 --- a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -221,6 +221,7 @@ def calculate_quantization_params( # all weight in group has importance based on corresponding input activations importance = fns.ones_like(original_weight) + #s = s**2 importance = importance * s target, zero_mask = get_target_zero_mask(compressed_weights, zp) From ad518c2c6955a19673045d531acb980955c32e56 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 30 Jul 2025 09:29:11 +0200 Subject: [PATCH 09/25] Search best codebook by minimizing MatMul diff. --- .../weight_compression/codebook_estimation.py | 11 +++++++---- .../algorithms/weight_compression/weight_lowering.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 67bb89de73e..14b49a9769d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -278,8 +278,10 @@ def calculate_codebook( diff = float('inf') variants[0] = CB4_QUANTILES + variants[1] = np.array([i for i in range(-8, 8)]) + best_i = -1 - for var in variants: + for i_var, var in enumerate(variants): var = converter(var)[0] config.codebook_values = Tensor(var) qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) @@ -289,11 +291,12 @@ def calculate_codebook( if cur_diff < diff: diff = cur_diff best_codebook = var - else: - print("Was skip: ", diff, cur_diff) + best_i = i_var + # else: + # print("Was skip: ", diff, cur_diff) #print("Best codebook:", best_codebook) - + print("Best codebook:", best_codebook, "diff:", diff, "best_i:", best_i) # min_diffs = [] # for fp8_scale in fp8_scales: diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8219eaf0697..d64d03769fd 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -78,7 +78,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, signed: bool = False + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, signed: bool = True ) -> Tensor: """ Calculates the scale for nf4 or e2m1 quantization. From 2fb21b2df3ef7cb40e442dcee31a4cf0f7621380 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 25 Sep 2025 14:11:30 +0200 Subject: [PATCH 10/25] Removed unused code. --- .../weight_compression/codebook_estimation.py | 205 +----------------- 1 file changed, 7 insertions(+), 198 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 14b49a9769d..521b19a857f 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -116,18 +116,14 @@ def fp8_convert(in_shape): class CodebookEstimation: """ - Scale estimation algorithm implementation. + Codebook estimation algorithm implementation. 
""" def __init__( self, ): """ - :param subset_size: The number of samples for scale estimation. - :param initial_steps: The number of the steps for absmax scale rectification. - :param scale_steps: The number of the steps for grid search scale rectification - from 1.0 to 1.0 - 0.05 * scale_step. - :param weight_penalty: coefficient for penalty between fp and compressed weights. If -1 then doesn't apply. + Initializes the CodebookEstimation algorithm. """ super().__init__() @@ -148,7 +144,7 @@ def _set_backend_entity(self, model: TModel) -> None: self._backend_entity = OVWeightCompressionAlgoBackend(model) else: msg = ( - "Cannot return backend-specific Scale Estimation entity because" + "Cannot return backend-specific Codebook Estimation entity because" f" {model_backend.value} is not supported!" ) raise nncf.UnsupportedBackendError(msg) @@ -162,11 +158,10 @@ def apply( backend_entity: Optional[WeightCompressionAlgoBackend] = None, ) -> dict[str, CompressedWeight]: """ - Estimates better scale for the int4 nodes in the model. - Minimizes per-group difference between floating point MatMul and + Estimates better codebook. + Minimizes difference between floating point MatMul and MatMul with compressed weights. - The algorithm computes weighted scale for the group of weights in MatMul, which - shared the same scale. + The algorithm computes codebook and indexes for MatMul compression. :param model: Model for applying algorithm. :param graph: Model graph. @@ -175,7 +170,7 @@ def apply( :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. :param backend_entity: Weight compression algorithm backend. - :return: Two dictionaries for estimated scales and zero points for each weight name. + :return: A dictionary that maps weight names to CompressedWeight with codebook, codebook indexes and scale. """ self._backend_entity = backend_entity if self._backend_entity is None: @@ -186,9 +181,6 @@ def apply( weight_name = wp.weight_name node_name = wp.node_with_weight.node_name config = wp.compression_config - - # if not 'return_proj' in node_name: - # continue if config.num_bits != 4:# or node_name not in statistics: res[weight_name] = CompressedWeight() @@ -212,10 +204,6 @@ def apply( wp.reduction_axes, config, wp - # self._subset_size, - # self._initial_steps, - # self._scale_steps, - # self._weight_penalty, ) res[weight_name] = CompressedWeight(indexes, scale, None, codebook) config.codebook_values = codebook @@ -231,10 +219,6 @@ def calculate_codebook( reduction_axes: tuple[int, ...], config: WeightCompressionConfig, wp: WeightCompressionParameters - # subset_size: int = 32, - # initial_steps: int = 5, - # scale_steps: int = 10, - # weight_penalty: float = -1.0, ) -> Tensor: reduction_axis = reduction_axes[0] @@ -320,187 +304,12 @@ def calculate_codebook( return Tensor(best_codebook), None, None #Tensor(scale / best_scale), Tensor(indexes) -def most_common(lst): - """ - Return the most frequently occuring element in a list. 
- """ - return max(set(lst), key=lst.count) - - -def euclidean_(point, data): - """ - Return euclidean distances between a point & a dataset - """ - return np.sqrt(np.sum((point - data) ** 2, axis=1)) - - def round(quantiles, values): center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) return np.searchsorted(center_of_quantiles, values, side='left', sorter=None) -class KMeansHist: - def __init__(self, n_clusters=8, max_iter=300): - self.n_clusters = n_clusters - self.max_iter = max_iter - - @staticmethod - def get_init(values, frequencies, n_clusters): - step = 1.0 / (n_clusters - 1) - denum = np.sum(frequencies) - quants = [i * step for i in range(n_clusters)] - n_frequencies = frequencies / denum - n_frequencies = np.cumsum(n_frequencies) - - res = [] - for i in range(len(quants)): - if i == 0: - res.append(values[0]) - elif i == len(quants) - 1: - res.append(values[-1]) - else: - prev = values[np.where(n_frequencies <= quants[i])[0][-1]].item() - next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]].item() - res.append((prev + next_) / 2) - - res = np.array(res)#.reshape(1, -1) - return res - - @staticmethod - def create_histogramm(data, granularity=0.01): - centers = [] - step = granularity - - #granularity = granularity * (data.max() - data.min()) - - data_range=(data.min().item(), data.max().item()) - prev = data_range[0] - - while prev < data_range[1]: - centers.append(prev + step / 2) - prev += step - - centers = np.array(centers) - centroid_idxs = round(centers, data) - - res = [[], [], []] - for i in range(centers.size): - idxs = np.where(centroid_idxs == i) - if len(idxs[0]) == 0: - continue - res[0].append(centers[i]) - res[1].append(np.sum(data[idxs])) - res[2].append(len(idxs[0])) - - res[0] = np.array(res[0])#.reshape(-1, 1) - res[1] = np.array(res[1]) - res[2] = np.array(res[2]) - - return res - - - @staticmethod - def create_histogramm_sorted(data_, granularity=0.01): - centers = [] - ranges = [] - step = granularity - - #granularity = granularity * (data.max() - data.min()) - - data = np.sort(data_) - data_range=(data.min().item(), data.max().item()) - prev = data_range[0] - - - while prev < data_range[1]: - centers.append(prev + step / 2) - prev += step - - if len(centers) > 1: - ranges.append(0.5 * (centers[-2] + centers[-1])) - ranges.append(centers[-1]) - - - centers = np.array(centers) - ranges = np.array(ranges) - - ranges_idxs = round(data, ranges) - - res = [[], [], []] - for i in range(centers.size): - res[0].append(centers[i]) - if i == 0: - res[1].append(np.sum(data[:ranges_idxs[1]])) - res[2].append(ranges_idxs[1]) - elif i == centers.size - 1: - res[1].append(np.sum(data[ranges_idxs[-2]:])) - res[2].append(len(data) - ranges_idxs[-2]) - else: - idx = 2 * i - res[1].append(np.sum(data[ranges_idxs[idx - 1]:ranges_idxs[idx + 1]])) - res[2].append(ranges_idxs[idx + 1] - ranges_idxs[idx - 1] - 1) - - res[0] = np.array(res[0])#.reshape(-1, 1) - res[1] = np.array(res[1]) - res[2] = np.array(res[2]) - - return res - - def fit(self, X_train, init, fixed=[]): - if self.max_iter == 1: - self.centroids = deepcopy(init) - return - - # start = time.time() - # self.hist = self.create_histogramm(X_train) - # end = time.time() - # print("create_histogramm", end - start) - - start = time.time() - self.hist = self.create_histogramm_sorted(X_train) - end = time.time() - #print("create_histogramm_sorted", end - start) - - start = time.time() - - init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) - init_by_hist[0] = init[0] - 
init_by_hist[-1] = init[-1] - zero_idx = np.argmin(np.abs(init_by_hist[:])) - init_by_hist[zero_idx] = 0.0 #init[0, zero_idx] - fixed[1] = zero_idx - init = init_by_hist - - self.centroids = deepcopy(init) - - iteration = 0 - prev_centroids = self.centroids - while iteration < self.max_iter: - prev_centroids = deepcopy(self.centroids) - - centroid_idxs = round(self.centroids, self.hist[0]) - for i in range(self.n_clusters): - idxs = np.where(centroid_idxs == i) - self.centroids[i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) - - for i, centroid in enumerate(self.centroids): - if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points - self.centroids[i] = prev_centroids[i] - for idx in fixed: - self.centroids[idx] = init[idx] - iteration += 1 - if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): - break - end = time.time() - #print("rest", end - start) - #print(self.centroids) - - def evaluate(self, X): - centroid_idxs = round(self.centroids, X) - return deepcopy(self.centroids).flatten(), centroid_idxs - - class KMeansWeighted: def __init__(self, n_clusters=8, max_iter=300): self.n_clusters = n_clusters From 812cbed354039e27ee322dd03020ff06253225eb Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 25 Sep 2025 18:21:01 +0200 Subject: [PATCH 11/25] Remove unused code. --- .../weight_compression/codebook_estimation.py | 146 ++---------------- 1 file changed, 16 insertions(+), 130 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 521b19a857f..b3cabc79ad8 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -42,62 +42,6 @@ TModel = TypeVar("TModel") -f8e4m3_data = np.array( - [-4.4800000e+02, -4.1600000e+02, -3.8400000e+02, -3.5200000e+02 - , -3.2000000e+02, -2.8800000e+02, -2.5600000e+02, -2.4000000e+02 - , -2.2400000e+02, -2.0800000e+02, -1.9200000e+02, -1.7600000e+02 - , -1.6000000e+02, -1.4400000e+02, -1.2800000e+02, -1.2000000e+02 - , -1.1200000e+02, -1.0400000e+02, -9.6000000e+01, -8.8000000e+01 - , -8.0000000e+01, -7.2000000e+01, -6.4000000e+01, -6.0000000e+01 - , -5.6000000e+01, -5.2000000e+01, -4.8000000e+01, -4.4000000e+01 - , -4.0000000e+01, -3.6000000e+01, -3.2000000e+01, -3.0000000e+01 - , -2.8000000e+01, -2.6000000e+01, -2.4000000e+01, -2.2000000e+01 - , -2.0000000e+01, -1.8000000e+01, -1.6000000e+01, -1.5000000e+01 - , -1.4000000e+01, -1.3000000e+01, -1.2000000e+01, -1.1000000e+01 - , -1.0000000e+01, -9.0000000e+00, -8.0000000e+00, -7.5000000e+00 - , -7.0000000e+00, -6.5000000e+00, -6.0000000e+00, -5.5000000e+00 - , -5.0000000e+00, -4.5000000e+00, -4.0000000e+00, -3.7500000e+00 - , -3.5000000e+00, -3.2500000e+00, -3.0000000e+00, -2.7500000e+00 - , -2.5000000e+00, -2.2500000e+00, -2.0000000e+00, -1.8750000e+00 - , -1.7500000e+00, -1.6250000e+00, -1.5000000e+00, -1.3750000e+00 - , -1.2500000e+00, -1.1250000e+00, -1.0000000e+00, -9.3750000e-01 - , -8.7500000e-01, -8.1250000e-01, -7.5000000e-01, -6.8750000e-01 - , -6.2500000e-01, -5.6250000e-01, -5.0000000e-01, -4.6875000e-01 - , -4.3750000e-01, -4.0625000e-01, -3.7500000e-01, -3.4375000e-01 - , -3.1250000e-01, -2.8125000e-01, -2.5000000e-01, -2.3437500e-01 - , -2.1875000e-01, -2.0312500e-01, -1.8750000e-01, -1.7187500e-01 - , -1.5625000e-01, -1.4062500e-01, -1.2500000e-01, -1.1718750e-01 - , -1.0937500e-01, 
-1.0156250e-01, -9.3750000e-02, -7.8125000e-02 - , -7.0312500e-02, -5.8593750e-02, -5.0781250e-02, -3.9062500e-02 - , -2.9296875e-02, -1.9531250e-02, -9.7656250e-03, 0.0000000e+00 - , 9.7656250e-03, 1.9531250e-02, 2.9296875e-02, 3.9062500e-02 - , 5.0781250e-02, 5.8593750e-02, 7.0312500e-02, 7.8125000e-02 - , 9.3750000e-02, 1.0156250e-01, 1.0937500e-01, 1.1718750e-01 - , 1.2500000e-01, 1.4062500e-01, 1.5625000e-01, 1.7187500e-01 - , 1.8750000e-01, 2.0312500e-01, 2.1875000e-01, 2.3437500e-01 - , 2.5000000e-01, 2.8125000e-01, 3.1250000e-01, 3.4375000e-01 - , 3.7500000e-01, 4.0625000e-01, 4.3750000e-01, 4.6875000e-01 - , 5.0000000e-01, 5.6250000e-01, 6.2500000e-01, 6.8750000e-01 - , 7.5000000e-01, 8.1250000e-01, 8.7500000e-01, 9.3750000e-01 - , 1.0000000e+00, 1.1250000e+00, 1.2500000e+00, 1.3750000e+00 - , 1.5000000e+00, 1.6250000e+00, 1.7500000e+00, 1.8750000e+00 - , 2.0000000e+00, 2.2500000e+00, 2.5000000e+00, 2.7500000e+00 - , 3.0000000e+00, 3.2500000e+00, 3.5000000e+00, 3.7500000e+00 - , 4.0000000e+00, 4.5000000e+00, 5.0000000e+00, 5.5000000e+00 - , 6.0000000e+00, 6.5000000e+00, 7.0000000e+00, 7.5000000e+00 - , 8.0000000e+00, 9.0000000e+00, 1.0000000e+01, 1.1000000e+01 - , 1.2000000e+01, 1.3000000e+01, 1.4000000e+01, 1.5000000e+01 - , 1.6000000e+01, 1.8000000e+01, 2.0000000e+01, 2.2000000e+01 - , 2.4000000e+01, 2.6000000e+01, 2.8000000e+01, 3.0000000e+01 - , 3.2000000e+01, 3.6000000e+01, 4.0000000e+01, 4.4000000e+01 - , 4.8000000e+01, 5.2000000e+01, 5.6000000e+01, 6.0000000e+01 - , 6.4000000e+01, 7.2000000e+01, 8.0000000e+01, 8.8000000e+01 - , 9.6000000e+01, 1.0400000e+02, 1.1200000e+02, 1.2000000e+02 - , 1.2800000e+02, 1.4400000e+02, 1.6000000e+02, 1.7600000e+02 - , 1.9200000e+02, 2.0800000e+02, 2.2400000e+02, 2.4000000e+02 - , 2.5600000e+02, 2.8800000e+02, 3.2000000e+02, 3.5200000e+02 - , 3.8400000e+02, 4.1600000e+02, 4.4800000e+02] -) def fp8_convert(in_shape): input = opset.parameter( @@ -156,6 +100,7 @@ def apply( all_weight_params: list[WeightCompressionParameters], statistics: dict[str, WCTensorStatistic], backend_entity: Optional[WeightCompressionAlgoBackend] = None, + debug=False ) -> dict[str, CompressedWeight]: """ Estimates better codebook. 
@@ -194,9 +139,10 @@ def apply( _, weight_port_id = weight_data[0] weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - - qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) - print("Initial diff:", np.mean(np.abs(weight.data - qw.data))) + + if debug: + qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) + print("Initial diff:", np.mean(np.abs(weight.data - qw.data))) codebook, scale, indexes = self.calculate_codebook( stats, @@ -207,8 +153,10 @@ def apply( ) res[weight_name] = CompressedWeight(indexes, scale, None, codebook) config.codebook_values = codebook - qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) - print("kmeans diff:", np.mean(np.abs(weight.data - qw.data))) + + if debug: + qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) + print("kmeans diff:", np.mean(np.abs(weight.data - qw.data))) return res @@ -245,18 +193,9 @@ def calculate_codebook( converter = fp8_convert(codebook.shape) indexes = indexes.reshape(orig_shape) - - fp8_scales = np.unique(np.abs(f8e4m3_data)) - fp8_scales = fp8_scales[fp8_scales >= 1.0] - - best_codebook = converter(codebook)[0] - #print("Best codebook:", best_codebook) - min_diff = float("inf") - best_scale = 1.0 - #weight = weight.as_numpy_tensor().data - #scale = scale.as_numpy_tensor().data + best_codebook = converter(codebook)[0] fp_outs = fns.matmul(weight, X) diff = float('inf') @@ -271,42 +210,19 @@ def calculate_codebook( qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) q_outs = fns.matmul(qw, X) - cur_diff = np.mean(np.abs(fp_outs.data - q_outs.data)) + cur_diff = fns.mean(fns.abs(fp_outs - q_outs)).item() if cur_diff < diff: diff = cur_diff best_codebook = var best_i = i_var - # else: - # print("Was skip: ", diff, cur_diff) - #print("Best codebook:", best_codebook) print("Best codebook:", best_codebook, "diff:", diff, "best_i:", best_i) - # min_diffs = [] - # for fp8_scale in fp8_scales: - # scaled_codebook = codebook * fp8_scale - # scaled_codebook = converter(scaled_codebook)[0] - - - # dequantized_weight = scaled_codebook[indexes] - # dequantized_weight = dequantized_weight * scale - # dequantized_weight = dequantized_weight / fp8_scale - - # diff = np.mean(np.abs(weight - dequantized_weight)) - - # if diff < min_diff: - # min_diff = diff - # best_codebook = deepcopy(scaled_codebook) - # best_scale = fp8_scale - # min_diffs.append(min_diff) - - #print("\t", min_diffs) - return Tensor(best_codebook), None, None #Tensor(scale / best_scale), Tensor(indexes) + return Tensor(best_codebook), None, None def round(quantiles, values): center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) - return np.searchsorted(center_of_quantiles, values, side='left', sorter=None) @@ -335,15 +251,13 @@ def get_init(values, frequencies, n_clusters): next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]].item() res.append((prev + next_) / 2) - res = np.array(res)#.reshape(1, -1) + res = np.array(res) return res @staticmethod def create_histogramm(data, granularity=0.01): centers = [] step = granularity - - #granularity = granularity * (data.max() - data.min()) data_range=(data.min().item(), data.max().item()) prev = data_range[0] @@ -381,11 +295,7 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): ranges = [] step = data_.max().item() * granularity / 3.5 - #granularity = granularity * (data.max() - data.min()) - data = np.array([data_, importance]) - - #data = np.sort(data, 
axis=1) data = data[:, data[0, :].argsort()] @@ -411,20 +321,14 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): for i in range(centers.size): res[0].append(centers[i]) if i == 0: - # res[1].append(np.sum(data[0, :ranges_idxs[1]])) - # res[2].append(ranges_idxs[1]) KMeansWeighted.add_weighted_data_and_weights(res, data[:, :ranges_idxs[1]]) elif i == centers.size - 1: - # res[1].append(np.sum(data[ranges_idxs[-2]:])) - # res[2].append(len(data) - ranges_idxs[-2]) KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[-2]:]) else: idx = 2 * i - # res[1].append(np.sum(data[ranges_idxs[idx - 1]:ranges_idxs[idx + 1]])) - # res[2].append(ranges_idxs[idx + 1] - ranges_idxs[idx - 1] - 1) KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[idx - 1]:ranges_idxs[idx + 1]]) - res[0] = np.array(res[0])#.reshape(-1, 1) + res[0] = np.array(res[0]) res[1] = np.array(res[1]) res[2] = np.array(res[2]) @@ -434,18 +338,8 @@ def fit(self, X_train, importance, init, fixed=[]): if self.max_iter == 1: self.centroids = deepcopy(init) return - - # start = time.time() - # self.hist = self.create_histogramm(X_train) - # end = time.time() - # print("create_histogramm", end - start) - start = time.time() self.hist = KMeansWeighted.create_histogramm_sorted(X_train, importance) - end = time.time() - #print("create_histogramm_sorted", end - start) - - start = time.time() init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) init_by_hist[0] = init[0] @@ -480,9 +374,7 @@ def fit(self, X_train, importance, init, fixed=[]): break self.variants.append(deepcopy(self.centroids)) - end = time.time() - #print("rest", end - start) - #print(self.centroids) + def evaluate(self, X): centroid_idxs = round(self.centroids, X) @@ -501,17 +393,11 @@ def weights_clusterization_k_means(weight, importance, n_centroids=2**4): n_init = [0, 0] n_init[0] = weight.min() n_init[-1] = weight.max() - #print("n_init:", n_init) kmeans = KMeansWeighted(n_centroids, max_iter=70) - - #n_init = kmeans.get_init(weight, n_init, n_centroids) - - #kmeans.fit(weight.reshape(-1, 1), n_init.reshape(1, -1), fixed=[0, 7, 15]) + kmeans.fit(weight, importance, n_init, fixed=[0, 7, 15]) codebook, indexes = kmeans.evaluate(weight.reshape(-1, 1)) - # codebook = kmeans.cluster_centers_.flatten() - # indexes = kmeans.labels_ indexes = np.reshape(indexes, orig_shape) From 8c896c8d369935efb379e76c8a8774a6873a9c91 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 26 Sep 2025 11:56:51 +0200 Subject: [PATCH 12/25] Replace np by fns. --- .../weight_compression/codebook_estimation.py | 44 ++++++++++++------- src/nncf/tensor/functions/__init__.py | 2 + src/nncf/tensor/functions/numeric.py | 21 +++++++++ src/nncf/tensor/functions/numpy_numeric.py | 12 +++++ src/nncf/tensor/functions/torch_numeric.py | 12 +++++ 5 files changed, 74 insertions(+), 17 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index b3cabc79ad8..24c8e02d515 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -10,6 +10,7 @@ # limitations under the License. 
from copy import deepcopy +from dataclasses import dataclass from typing import Optional, TypeVar import numpy as np import time @@ -142,7 +143,7 @@ def apply( if debug: qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) - print("Initial diff:", np.mean(np.abs(weight.data - qw.data))) + print("Initial diff:", fns.mean(fns.abs(weight.data - qw.data))) codebook, scale, indexes = self.calculate_codebook( stats, @@ -156,7 +157,7 @@ def apply( if debug: qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) - print("kmeans diff:", np.mean(np.abs(weight.data - qw.data))) + print("kmeans diff:", fns.mean(fns.abs(weight.data - qw.data))) return res @@ -201,7 +202,7 @@ def calculate_codebook( diff = float('inf') variants[0] = CB4_QUANTILES - variants[1] = np.array([i for i in range(-8, 8)]) + variants[1] = fns.tensor([i for i in range(-8, 8)]) best_i = -1 for i_var, var in enumerate(variants): @@ -223,8 +224,16 @@ def calculate_codebook( def round(quantiles, values): center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) - return np.searchsorted(center_of_quantiles, values, side='left', sorter=None) + return fns.searchsorted(center_of_quantiles, values, side='left', sorter=None) +@dataclass +class KMeansAlgoData: + centroids: Tensor + hist: Tensor + weighted_hist: Tensor | None = None + + frequencies: Tensor | None = None + weights: Tensor | None = None class KMeansWeighted: def __init__(self, n_clusters=8, max_iter=300): @@ -235,10 +244,11 @@ def __init__(self, n_clusters=8, max_iter=300): @staticmethod def get_init(values, frequencies, n_clusters): step = 1.0 / (n_clusters - 1) - denum = np.sum(frequencies) + denum = fns.sum(frequencies) quants = [i * step for i in range(n_clusters)] n_frequencies = frequencies / denum - n_frequencies = np.cumsum(n_frequencies) + n_frequencies = fns.cumsum(n_frequencies) + res = [] for i in range(len(quants)): @@ -247,11 +257,11 @@ def get_init(values, frequencies, n_clusters): elif i == len(quants) - 1: res.append(values[-1]) else: - prev = values[np.where(n_frequencies <= quants[i])[0][-1]].item() - next_ = values[np.where(n_frequencies <= quants[i + 1])[0][-1]].item() + prev = values[fns.nonzero(n_frequencies <= quants[i])[0][-1]].item() + next_ = values[fns.nonzero(n_frequencies <= quants[i + 1])[0][-1]].item() res.append((prev + next_) / 2) - res = np.array(res) + res = fns.tensor(res) return res @staticmethod @@ -266,28 +276,28 @@ def create_histogramm(data, granularity=0.01): centers.append(prev + step / 2) prev += step - centers = np.array(centers) + centers = fns.tensor(centers) centroid_idxs = round(centers, data) res = [[], [], []] for i in range(centers.size): - idxs = np.where(centroid_idxs == i) + idxs = fns.nonzero(centroid_idxs == i) if len(idxs[0]) == 0: continue res[0].append(centers[i]) - res[1].append(np.sum(data[idxs])) + res[1].append(fns.sum(data[idxs])) res[2].append(len(idxs[0])) - res[0] = np.array(res[0])#.reshape(-1, 1) - res[1] = np.array(res[1]) - res[2] = np.array(res[2]) + res[0] = fns.tensor(res[0]) # centers of histogram bins + res[1] = fns.tensor(res[1]) # sum of values in each bin + res[2] = fns.tensor(res[2]) # count of values in each bin return res @staticmethod def add_weighted_data_and_weights(res, data): - res[1].append(np.multiply(data[0, :], data[1, :]).sum()) - res[2].append(np.sum(data[1, :])) + res[1].append(fns.multiply(data[0, :], data[1, :]).sum()) + res[2].append(fns.sum(data[1, :])) @staticmethod def create_histogramm_sorted(data_, importance, granularity=0.01): diff 
--git a/src/nncf/tensor/functions/__init__.py b/src/nncf/tensor/functions/__init__.py index ffb05b430d2..5982cc583b0 100644 --- a/src/nncf/tensor/functions/__init__.py +++ b/src/nncf/tensor/functions/__init__.py @@ -58,12 +58,14 @@ from nncf.tensor.functions.numeric import squeeze as squeeze from nncf.tensor.functions.numeric import stack as stack from nncf.tensor.functions.numeric import sum as sum +from nncf.tensor.functions.numeric import cumsum as cumsum from nncf.tensor.functions.numeric import tensor as tensor from nncf.tensor.functions.numeric import transpose as transpose from nncf.tensor.functions.numeric import unsqueeze as unsqueeze from nncf.tensor.functions.numeric import unstack as unstack from nncf.tensor.functions.numeric import var as var from nncf.tensor.functions.numeric import where as where +from nncf.tensor.functions.numeric import nonzero as nonzero from nncf.tensor.functions.numeric import zeros as zeros from nncf.tensor.functions.numeric import zeros_like as zeros_like diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index 886ea3e8ab2..35531262d3d 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -273,6 +273,16 @@ def where(condition: Tensor, x: Union[Tensor, float], y: Union[Tensor, float]) - """ +@tensor_dispatcher +def nonzero(condition: Tensor) -> Tensor: + """ + Return the indices of the elements that are non-zero. + + :param condition: The input tensor. + :return: A tensor containing the indices of the non-zero elements. + """ + + @tensor_dispatcher def zeros_like(a: Tensor) -> Tensor: """ @@ -503,6 +513,17 @@ def sum(a: Tensor, axis: Optional[Union[int, tuple[int, ...]]] = None, keepdims: :return: Returns the sum of all elements in the input tensor in the given axis. """ +@tensor_dispatcher +def cumsum(a: Tensor, axis: Optional[Union[int, tuple[int, ...]]] = None) -> Tensor: + """ + Cumulative sum of tensor elements over a given axis. + + :param a: The input tensor. + :param axis: Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements + of the input tensor. + :return: Returns the cumulative sum of all elements in the input tensor in the given axis. 
+ """ + @tensor_dispatcher def multiply(x1: Tensor, x2: Union[Tensor, float]) -> Tensor: diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index b6accc34c82..ddb7ab9f15c 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -173,6 +173,13 @@ def _( return np.where(condition, x, y) +@numeric.nonzero.register +def _( + condition: T_NUMPY, +) -> T_NUMPY_ARRAY: + return np.nonzero(condition) + + @numeric.zeros_like.register def _(a: T_NUMPY) -> T_NUMPY_ARRAY: return np.zeros_like(a) @@ -294,6 +301,11 @@ def _(a: T_NUMPY, axis: T_AXIS = None, keepdims: bool = False) -> T_NUMPY_ARRAY: return np.array(np.sum(a, axis=axis, keepdims=keepdims)) +@numeric.cumsum.register +def _(a: T_NUMPY, axis: T_AXIS = None) -> T_NUMPY_ARRAY: + return np.array(np.cumsum(a, axis=axis)) + + @numeric.multiply.register def _(x1: T_NUMPY, x2: Union[T_NUMPY, float]) -> T_NUMPY_ARRAY: return np.multiply(x1, x2) diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index 41ba6e89135..5188f85c54d 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -186,6 +186,13 @@ def _( return torch.where(condition, x, y) +@numeric.nonzero.register +def _( + condition: torch.Tensor +) -> torch.Tensor: + return torch.nonzero(condition, as_tuple=True) + + @numeric.zeros_like.register def _(a: torch.Tensor) -> torch.Tensor: return torch.zeros_like(a) @@ -319,6 +326,11 @@ def _(a: torch.Tensor, axis: T_AXIS = None, keepdims: bool = False) -> torch.Ten return torch.sum(a, dim=axis, keepdim=keepdims) +@numeric.cumsum.register +def _(a: torch.Tensor, axis: T_AXIS = None) -> torch.Tensor: + return torch.cumsum(a, dim=axis) + + @numeric.multiply.register def _(x1: torch.Tensor, x2: Union[torch.Tensor, float]) -> torch.Tensor: return torch.multiply(x1, x2) From a792c0b3b411c1b3d0564a3613bb0fe7ef8c23b3 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 26 Sep 2025 14:39:27 +0200 Subject: [PATCH 13/25] Replace np by fns. 
--- .../weight_compression/codebook_estimation.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 24c8e02d515..ac012ef1e74 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -295,9 +295,9 @@ def create_histogramm(data, granularity=0.01): return res @staticmethod - def add_weighted_data_and_weights(res, data): - res[1].append(fns.multiply(data[0, :], data[1, :]).sum()) - res[2].append(fns.sum(data[1, :])) + def add_weighted_data_and_weights(res, data, importance): + res[1].append(fns.sum(fns.multiply(data, importance))) + res[2].append(fns.sum(importance)) @staticmethod def create_histogramm_sorted(data_, importance, granularity=0.01): @@ -305,9 +305,12 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): ranges = [] step = data_.max().item() * granularity / 3.5 - data = np.array([data_, importance]) + sorted_idx = fns.argsort(data_) + data = data_[sorted_idx] + importance = importance[sorted_idx] - data = data[:, data[0, :].argsort()] + #data = np.array([data_, importance]) + #data = data[:, data[0, :].argsort()] data_range=(data.min().item(), data.max().item()) prev = data_range[0] @@ -325,18 +328,19 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): centers = np.array(centers) ranges = np.array(ranges) - ranges_idxs = round(data[0], ranges) + ranges_idxs = round(data, ranges) res = [[], [], []] for i in range(centers.size): res[0].append(centers[i]) if i == 0: - KMeansWeighted.add_weighted_data_and_weights(res, data[:, :ranges_idxs[1]]) + KMeansWeighted.add_weighted_data_and_weights(res, data[:ranges_idxs[1].item()], importance[:ranges_idxs[1].item()]) elif i == centers.size - 1: - KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[-2]:]) + KMeansWeighted.add_weighted_data_and_weights(res, data[ranges_idxs[-2].item():], importance[ranges_idxs[-2].item():]) else: idx = 2 * i - KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[idx - 1]:ranges_idxs[idx + 1]]) + KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()], + importance[:, ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()]) res[0] = np.array(res[0]) res[1] = np.array(res[1]) @@ -392,8 +396,8 @@ def evaluate(self, X): def weights_clusterization_k_means(weight, importance, n_centroids=2**4): - weight = weight.as_numpy_tensor().data - importance = importance.as_numpy_tensor().data + #weight = weight.as_numpy_tensor().data + #importance = importance.as_numpy_tensor().data ow = deepcopy(weight) orig_shape = weight.shape From d3c2ab83a2a538f69170d6bb6e4f1dc947f81622 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 26 Sep 2025 17:03:10 +0200 Subject: [PATCH 14/25] Replace np by fns. 
--- .../weight_compression/codebook_estimation.py | 18 +++++++++--------- src/nncf/tensor/functions/__init__.py | 1 + src/nncf/tensor/functions/numeric.py | 11 +++++++++++ src/nncf/tensor/functions/numpy_numeric.py | 5 +++++ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index ac012ef1e74..77f16a7833d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -257,11 +257,11 @@ def get_init(values, frequencies, n_clusters): elif i == len(quants) - 1: res.append(values[-1]) else: - prev = values[fns.nonzero(n_frequencies <= quants[i])[0][-1]].item() - next_ = values[fns.nonzero(n_frequencies <= quants[i + 1])[0][-1]].item() + prev = values[fns.nonzero(n_frequencies <= quants[i])[0][-1].item()].item() + next_ = values[fns.nonzero(n_frequencies <= quants[i + 1])[0][-1].item()].item() res.append((prev + next_) / 2) - res = fns.tensor(res) + res = fns.tensor(res, backend=values.backend) return res @staticmethod @@ -339,12 +339,12 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): KMeansWeighted.add_weighted_data_and_weights(res, data[ranges_idxs[-2].item():], importance[ranges_idxs[-2].item():]) else: idx = 2 * i - KMeansWeighted.add_weighted_data_and_weights(res, data[:, ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()], - importance[:, ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()]) + KMeansWeighted.add_weighted_data_and_weights(res, data[ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()], + importance[ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()]) - res[0] = np.array(res[0]) - res[1] = np.array(res[1]) - res[2] = np.array(res[2]) + res[0] = fns.tensor(res[0], backend=data_.backend) # centers of histogram bins + res[1] = fns.tensor(res[1], backend=data_.backend) + res[2] = fns.tensor(res[2], backend=data_.backend) return res @@ -358,7 +358,7 @@ def fit(self, X_train, importance, init, fixed=[]): init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) init_by_hist[0] = init[0] init_by_hist[-1] = init[-1] - zero_idx = np.argmin(np.abs(init_by_hist[:])) + zero_idx = fns.argmin(fns.abs(init_by_hist[:])) init_by_hist[zero_idx] = 0.0 #init[0, zero_idx] fixed[1] = zero_idx init = init_by_hist diff --git a/src/nncf/tensor/functions/__init__.py b/src/nncf/tensor/functions/__init__.py index 5982cc583b0..29372664573 100644 --- a/src/nncf/tensor/functions/__init__.py +++ b/src/nncf/tensor/functions/__init__.py @@ -17,6 +17,7 @@ from nncf.tensor.functions.numeric import any as any from nncf.tensor.functions.numeric import arange as arange from nncf.tensor.functions.numeric import argsort as argsort +from nncf.tensor.functions.numeric import argmin as argmin from nncf.tensor.functions.numeric import as_tensor_like as as_tensor_like from nncf.tensor.functions.numeric import astype as astype from nncf.tensor.functions.numeric import atleast_1d as atleast_1d diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index 35531262d3d..519e3f88d55 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -609,6 +609,17 @@ def argsort(a: Tensor, axis: int = -1, descending: bool = False, stable: bool = """ +@tensor_dispatcher +def argmin(a: Tensor, axis: T_AXIS = None) -> Tensor: + """ + Returns the 
indices of the minimum values along an axis. + + :param a: The tensor for which to find the minimum values. + :param axis: Axis or tuple of axes along which to find the minimum values. + :return: Indices of the minimum values along an axis. + """ + + @tensor_dispatcher def diag(a: Tensor, k: int = 0) -> Tensor: """ diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index ddb7ab9f15c..4cd7b27f2a0 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -350,6 +350,11 @@ def _(a: T_NUMPY, axis: int = -1, descending: bool = False, stable: bool = False return np.argsort(a, axis=axis, kind="stable" if stable else None) +@numeric.argmin.register +def _(a: T_NUMPY, axis: T_AXIS = None) -> T_NUMPY: + return np.argmin(a, axis=axis) + + @numeric.diag.register def _(a: T_NUMPY, k: int = 0) -> T_NUMPY_ARRAY: return np.diag(a, k=k) From 9eec3e33a3c283985c60bed32a2c7308993607c5 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 29 Sep 2025 12:38:19 +0200 Subject: [PATCH 15/25] Replace np by fns. --- .../weight_compression/codebook_estimation.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 77f16a7833d..0a7e43ece94 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -196,7 +196,7 @@ def calculate_codebook( indexes = indexes.reshape(orig_shape) - best_codebook = converter(codebook)[0] + best_codebook = converter(codebook.as_openvino_tensor())[0] fp_outs = fns.matmul(weight, X) diff = float('inf') @@ -206,7 +206,7 @@ def calculate_codebook( best_i = -1 for i_var, var in enumerate(variants): - var = converter(var)[0] + var = converter(var.as_openvino_tensor())[0] config.codebook_values = Tensor(var) qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) q_outs = fns.matmul(qw, X) @@ -378,14 +378,16 @@ def fit(self, X_train, importance, init, fixed=[]): idxs = np.where(centroid_idxs == i) self.centroids[i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) - for i, centroid in enumerate(self.centroids): - if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points - self.centroids[i] = prev_centroids[i] + # for i, centroid in enumerate(self.centroids): + # if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points + # self.centroids[i] = prev_centroids[i] for idx in fixed: self.centroids[idx] = init[idx] iteration += 1 - if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): + if fns.any(fns.all(fns.abs(self.centroids - prev_centroids) < 0.00001)): break + # if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): + # break self.variants.append(deepcopy(self.centroids)) @@ -411,10 +413,12 @@ def weights_clusterization_k_means(weight, importance, n_centroids=2**4): kmeans = KMeansWeighted(n_centroids, max_iter=70) kmeans.fit(weight, importance, n_init, fixed=[0, 7, 15]) - codebook, indexes = kmeans.evaluate(weight.reshape(-1, 1)) + codebook, indexes = kmeans.evaluate(weight)#.reshape(-1, 1)) - indexes = np.reshape(indexes, orig_shape) + indexes = fns.reshape(indexes, orig_shape) - print(orig_shape, np.mean(np.abs(ow - codebook[indexes]))) + #print(orig_shape, np.mean(np.abs(ow 
- codebook[indexes]))) + + print(orig_shape, fns.mean(fns.abs(ow - codebook[indexes]))) return codebook, indexes, kmeans.variants From 5a66fda6a660fde2866e79f982791dbb0933323c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 30 Sep 2025 15:25:01 +0200 Subject: [PATCH 16/25] Fixed problems with fp64 data types. --- .../weight_compression/codebook_estimation.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 0a7e43ece94..f102ba9810c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -196,17 +196,17 @@ def calculate_codebook( indexes = indexes.reshape(orig_shape) - best_codebook = converter(codebook.as_openvino_tensor())[0] + best_codebook = converter(codebook.as_openvino_tensor().data)[0] fp_outs = fns.matmul(weight, X) diff = float('inf') - variants[0] = CB4_QUANTILES - variants[1] = fns.tensor([i for i in range(-8, 8)]) + variants[0] = fns.tensor(CB4_QUANTILES, backend=weight.backend, dtype=weight.dtype) + variants[1] = fns.tensor([i for i in range(-8, 8)], backend=weight.backend, dtype=weight.dtype) best_i = -1 for i_var, var in enumerate(variants): - var = converter(var.as_openvino_tensor())[0] + var = converter(var.as_openvino_tensor().data)[0] config.codebook_values = Tensor(var) qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) q_outs = fns.matmul(qw, X) @@ -250,18 +250,17 @@ def get_init(values, frequencies, n_clusters): n_frequencies = fns.cumsum(n_frequencies) - res = [] + res = fns.zeros((n_clusters,), backend=values.backend, dtype=values.dtype) for i in range(len(quants)): if i == 0: - res.append(values[0]) + res[i] = values[0] elif i == len(quants) - 1: - res.append(values[-1]) + res[i] = values[-1] else: prev = values[fns.nonzero(n_frequencies <= quants[i])[0][-1].item()].item() next_ = values[fns.nonzero(n_frequencies <= quants[i + 1])[0][-1].item()].item() - res.append((prev + next_) / 2) + res[i] = (prev + next_) / 2 - res = fns.tensor(res, backend=values.backend) return res @staticmethod @@ -296,8 +295,8 @@ def create_histogramm(data, granularity=0.01): @staticmethod def add_weighted_data_and_weights(res, data, importance): - res[1].append(fns.sum(fns.multiply(data, importance))) - res[2].append(fns.sum(importance)) + res[1].append(fns.sum(fns.multiply(data, importance)).item()) + res[2].append(fns.sum(importance).item()) @staticmethod def create_histogramm_sorted(data_, importance, granularity=0.01): @@ -325,8 +324,8 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): ranges.append(centers[-1]) - centers = np.array(centers) - ranges = np.array(ranges) + centers = fns.tensor(centers, backend=data_.backend, dtype=data_.dtype) + ranges = fns.tensor(ranges, backend=data_.backend, dtype=data_.dtype) ranges_idxs = round(data, ranges) @@ -342,9 +341,9 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): KMeansWeighted.add_weighted_data_and_weights(res, data[ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()], importance[ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()]) - res[0] = fns.tensor(res[0], backend=data_.backend) # centers of histogram bins - res[1] = fns.tensor(res[1], backend=data_.backend) - res[2] = fns.tensor(res[2], backend=data_.backend) + res[0] = centers 
#fns.tensor(res[0], backend=data_.backend, dtype=data_.dtype) # centers of histogram bins + res[1] = fns.tensor(res[1], backend=data_.backend, dtype=data_.dtype) + res[2] = fns.tensor(res[2], backend=data_.backend, dtype=data_.dtype) return res @@ -375,8 +374,8 @@ def fit(self, X_train, importance, init, fixed=[]): centroid_idxs = round(self.centroids, self.hist[0]) for i in range(self.n_clusters): - idxs = np.where(centroid_idxs == i) - self.centroids[i] = np.sum(self.hist[1][idxs]) / np.sum(self.hist[2][idxs]) + idxs = fns.nonzero(centroid_idxs == i) + self.centroids[i] = fns.sum(self.hist[1][idxs]).item() / fns.sum(self.hist[2][idxs]).item() # for i, centroid in enumerate(self.centroids): # if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points From 735c80911ace39760db5edcdbf627c013f7185f4 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 8 Oct 2025 19:31:36 +0200 Subject: [PATCH 17/25] Fixed. --- .../weight_compression/algorithm.py | 6 +- .../weight_compression/codebook_estimation.py | 120 ++++++++---------- .../weight_compression/scale_estimation.py | 2 +- .../weight_compression/weight_lowering.py | 6 +- src/nncf/tensor/functions/__init__.py | 5 +- src/nncf/tensor/functions/numeric.py | 22 +--- src/nncf/tensor/functions/numpy_numeric.py | 5 - src/nncf/tensor/functions/torch_numeric.py | 9 +- 8 files changed, 73 insertions(+), 102 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index f7489d0290c..8a84e88402a 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -41,9 +41,9 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ +from nncf.quantization.algorithms.weight_compression.codebook_estimation import CodebookEstimation from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES -from nncf.quantization.algorithms.weight_compression.codebook_estimation import CodebookEstimation from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -379,7 +379,7 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) - + self._codebook_estimation_algo = CodebookEstimation() self._data_aware_mixed_precision = ( @@ -941,7 +941,7 @@ def apply( precomputed_compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" - + if self._mode == CompressWeightsMode.CODEBOOK: precomputed_compressed_weights = self._codebook_estimation_algo.apply( model=model, diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index f102ba9810c..e9c3e035ea5 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -12,8 +12,6 @@ from copy import deepcopy from dataclasses import dataclass from typing 
import Optional, TypeVar -import numpy as np -import time import openvino as ov from openvino.runtime import opset13 as opset @@ -28,26 +26,21 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params +from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns -from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES - TModel = TypeVar("TModel") - def fp8_convert(in_shape): - input = opset.parameter( - in_shape, dtype=ov.Type.f32 - ) + input = opset.parameter(in_shape, dtype=ov.Type.f32) scale_convert = opset.convert(input, ov.Type.f8e4m3) scale_convert = opset.convert(scale_convert, ov.Type.f32) result = opset.result(scale_convert, name="Result") @@ -101,7 +94,7 @@ def apply( all_weight_params: list[WeightCompressionParameters], statistics: dict[str, WCTensorStatistic], backend_entity: Optional[WeightCompressionAlgoBackend] = None, - debug=False + debug=False, ) -> dict[str, CompressedWeight]: """ Estimates better codebook. 
@@ -128,7 +121,7 @@ def apply( node_name = wp.node_with_weight.node_name config = wp.compression_config - if config.num_bits != 4:# or node_name not in statistics: + if config.num_bits != 4: # or node_name not in statistics: res[weight_name] = CompressedWeight() continue @@ -145,16 +138,10 @@ def apply( qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) print("Initial diff:", fns.mean(fns.abs(weight.data - qw.data))) - codebook, scale, indexes = self.calculate_codebook( - stats, - weight, - wp.reduction_axes, - config, - wp - ) + codebook, scale, indexes = self.calculate_codebook(stats, weight, wp.reduction_axes, config, wp) res[weight_name] = CompressedWeight(indexes, scale, None, codebook) config.codebook_values = codebook - + if debug: qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) print("kmeans diff:", fns.mean(fns.abs(weight.data - qw.data))) @@ -167,12 +154,11 @@ def calculate_codebook( weight: Tensor, reduction_axes: tuple[int, ...], config: WeightCompressionConfig, - wp: WeightCompressionParameters + wp: WeightCompressionParameters, ) -> Tensor: - reduction_axis = reduction_axes[0] weight = deepcopy(weight.astype(TensorDataType.float32)) - + s, X = process_stats(statistics, 128) if reduction_axis == 0: @@ -181,12 +167,12 @@ def calculate_codebook( if config.group_size != -1: weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) - + orig_shape = weight.shape - + importance = fns.ones_like(weight) importance = importance * s - + scale = calculate_float_quantization_params(weight, reduction_axes, config, signed=True) norm_weight = _calculate_normalized_weight(weight, scale) @@ -195,28 +181,27 @@ def calculate_codebook( converter = fp8_convert(codebook.shape) indexes = indexes.reshape(orig_shape) - best_codebook = converter(codebook.as_openvino_tensor().data)[0] - + fp_outs = fns.matmul(weight, X) - diff = float('inf') - + diff = float("inf") + variants[0] = fns.tensor(CB4_QUANTILES, backend=weight.backend, dtype=weight.dtype) variants[1] = fns.tensor([i for i in range(-8, 8)], backend=weight.backend, dtype=weight.dtype) best_i = -1 - + for i_var, var in enumerate(variants): var = converter(var.as_openvino_tensor().data)[0] config.codebook_values = Tensor(var) qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) q_outs = fns.matmul(qw, X) - + cur_diff = fns.mean(fns.abs(fp_outs - q_outs)).item() if cur_diff < diff: diff = cur_diff best_codebook = var best_i = i_var - + print("Best codebook:", best_codebook, "diff:", diff, "best_i:", best_i) return Tensor(best_codebook), None, None @@ -224,7 +209,8 @@ def calculate_codebook( def round(quantiles, values): center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) - return fns.searchsorted(center_of_quantiles, values, side='left', sorter=None) + return fns.searchsorted(center_of_quantiles, values, side="left", sorter=None) + @dataclass class KMeansAlgoData: @@ -235,6 +221,7 @@ class KMeansAlgoData: frequencies: Tensor | None = None weights: Tensor | None = None + class KMeansWeighted: def __init__(self, n_clusters=8, max_iter=300): self.n_clusters = n_clusters @@ -249,7 +236,6 @@ def get_init(values, frequencies, n_clusters): n_frequencies = frequencies / denum n_frequencies = fns.cumsum(n_frequencies) - res = fns.zeros((n_clusters,), backend=values.backend, dtype=values.dtype) for i in range(len(quants)): if i == 0: @@ -268,7 +254,7 @@ def create_histogramm(data, granularity=0.01): centers = [] step = granularity - 
data_range=(data.min().item(), data.max().item()) + data_range = (data.min().item(), data.max().item()) prev = data_range[0] while prev < data_range[1]: @@ -287,9 +273,9 @@ def create_histogramm(data, granularity=0.01): res[1].append(fns.sum(data[idxs])) res[2].append(len(idxs[0])) - res[0] = fns.tensor(res[0]) # centers of histogram bins - res[1] = fns.tensor(res[1]) # sum of values in each bin - res[2] = fns.tensor(res[2]) # count of values in each bin + res[0] = fns.tensor(res[0]) # centers of histogram bins + res[1] = fns.tensor(res[1]) # sum of values in each bin + res[2] = fns.tensor(res[2]) # count of values in each bin return res @@ -307,23 +293,21 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): sorted_idx = fns.argsort(data_) data = data_[sorted_idx] importance = importance[sorted_idx] - - #data = np.array([data_, importance]) - #data = data[:, data[0, :].argsort()] - data_range=(data.min().item(), data.max().item()) + # data = np.array([data_, importance]) + # data = data[:, data[0, :].argsort()] + + data_range = (data.min().item(), data.max().item()) prev = data_range[0] - while prev < data_range[1]: centers.append(prev + step / 2) prev += step - + if len(centers) > 1: ranges.append(0.5 * (centers[-2] + centers[-1])) ranges.append(centers[-1]) - centers = fns.tensor(centers, backend=data_.backend, dtype=data_.dtype) ranges = fns.tensor(ranges, backend=data_.backend, dtype=data_.dtype) @@ -333,32 +317,41 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): for i in range(centers.size): res[0].append(centers[i]) if i == 0: - KMeansWeighted.add_weighted_data_and_weights(res, data[:ranges_idxs[1].item()], importance[:ranges_idxs[1].item()]) + KMeansWeighted.add_weighted_data_and_weights( + res, data[: ranges_idxs[1].item()], importance[: ranges_idxs[1].item()] + ) elif i == centers.size - 1: - KMeansWeighted.add_weighted_data_and_weights(res, data[ranges_idxs[-2].item():], importance[ranges_idxs[-2].item():]) + KMeansWeighted.add_weighted_data_and_weights( + res, data[ranges_idxs[-2].item() :], importance[ranges_idxs[-2].item() :] + ) else: idx = 2 * i - KMeansWeighted.add_weighted_data_and_weights(res, data[ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()], - importance[ranges_idxs[idx - 1].item():ranges_idxs[idx + 1].item()]) + KMeansWeighted.add_weighted_data_and_weights( + res, + data[ranges_idxs[idx - 1].item() : ranges_idxs[idx + 1].item()], + importance[ranges_idxs[idx - 1].item() : ranges_idxs[idx + 1].item()], + ) - res[0] = centers #fns.tensor(res[0], backend=data_.backend, dtype=data_.dtype) # centers of histogram bins + res[0] = centers # fns.tensor(res[0], backend=data_.backend, dtype=data_.dtype) # centers of histogram bins res[1] = fns.tensor(res[1], backend=data_.backend, dtype=data_.dtype) res[2] = fns.tensor(res[2], backend=data_.backend, dtype=data_.dtype) return res - def fit(self, X_train, importance, init, fixed=[]): + def fit(self, X_train, importance, init, fixed=None): if self.max_iter == 1: self.centroids = deepcopy(init) return - + if fixed is None: + fixed = [0, len(init) // 2, len(init) - 1] + self.hist = KMeansWeighted.create_histogramm_sorted(X_train, importance) init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) init_by_hist[0] = init[0] init_by_hist[-1] = init[-1] zero_idx = fns.argmin(fns.abs(init_by_hist[:])) - init_by_hist[zero_idx] = 0.0 #init[0, zero_idx] + init_by_hist[zero_idx] = 0.0 # init[0, zero_idx] fixed[1] = zero_idx init = init_by_hist @@ -368,7 +361,7 @@ def fit(self, 
X_train, importance, init, fixed=[]): prev_centroids = self.centroids while iteration < self.max_iter: prev_centroids = deepcopy(self.centroids) - + if iteration % 5 == 0: self.variants.append(deepcopy(self.centroids)) @@ -387,9 +380,8 @@ def fit(self, X_train, importance, init, fixed=[]): break # if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): # break - - self.variants.append(deepcopy(self.centroids)) + self.variants.append(deepcopy(self.centroids)) def evaluate(self, X): centroid_idxs = round(self.centroids, X) @@ -397,8 +389,8 @@ def evaluate(self, X): def weights_clusterization_k_means(weight, importance, n_centroids=2**4): - #weight = weight.as_numpy_tensor().data - #importance = importance.as_numpy_tensor().data + # weight = weight.as_numpy_tensor().data + # importance = importance.as_numpy_tensor().data ow = deepcopy(weight) orig_shape = weight.shape @@ -412,12 +404,12 @@ def weights_clusterization_k_means(weight, importance, n_centroids=2**4): kmeans = KMeansWeighted(n_centroids, max_iter=70) kmeans.fit(weight, importance, n_init, fixed=[0, 7, 15]) - codebook, indexes = kmeans.evaluate(weight)#.reshape(-1, 1)) + codebook, indexes = kmeans.evaluate(weight) # .reshape(-1, 1)) indexes = fns.reshape(indexes, orig_shape) - #print(orig_shape, np.mean(np.abs(ow - codebook[indexes]))) - + # print(orig_shape, np.mean(np.abs(ow - codebook[indexes]))) + print(orig_shape, fns.mean(fns.abs(ow - codebook[indexes]))) return codebook, indexes, kmeans.variants diff --git a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6de9476135b..11172dcd4de 100644 --- a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -227,7 +227,7 @@ def calculate_quantization_params( # all weight in group has importance based on corresponding input activations importance = fns.ones_like(original_weight) - #s = s**2 + # s = s**2 importance = importance * s target, zero_mask = get_target_zero_mask(compressed_weights, zp) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index eaef0df726e..0754e2ae44a 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -347,7 +347,11 @@ def compress_weight( ) if not config.is_integer: - if precomputed_compressed_weight is not None and precomputed_compressed_weight.tensor is not None and precomputed_compressed_weight.codebook is not None: + if ( + precomputed_compressed_weight is not None + and precomputed_compressed_weight.tensor is not None + and precomputed_compressed_weight.codebook is not None + ): return precomputed_compressed_weight compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) diff --git a/src/nncf/tensor/functions/__init__.py b/src/nncf/tensor/functions/__init__.py index 3f1a3ad9a30..18a4fdd61ca 100644 --- a/src/nncf/tensor/functions/__init__.py +++ b/src/nncf/tensor/functions/__init__.py @@ -16,8 +16,8 @@ from nncf.tensor.functions.numeric import allclose as allclose from nncf.tensor.functions.numeric import any as any from nncf.tensor.functions.numeric import arange as arange -from nncf.tensor.functions.numeric import argsort as argsort from nncf.tensor.functions.numeric import argmin 
as argmin +from nncf.tensor.functions.numeric import argsort as argsort from nncf.tensor.functions.numeric import as_tensor_like as as_tensor_like from nncf.tensor.functions.numeric import astype as astype from nncf.tensor.functions.numeric import atleast_1d as atleast_1d @@ -54,6 +54,7 @@ from nncf.tensor.functions.numeric import minimum as minimum from nncf.tensor.functions.numeric import moveaxis as moveaxis from nncf.tensor.functions.numeric import multiply as multiply +from nncf.tensor.functions.numeric import nonzero as nonzero from nncf.tensor.functions.numeric import ones_like as ones_like from nncf.tensor.functions.numeric import percentile as percentile from nncf.tensor.functions.numeric import power as power @@ -65,14 +66,12 @@ from nncf.tensor.functions.numeric import squeeze as squeeze from nncf.tensor.functions.numeric import stack as stack from nncf.tensor.functions.numeric import sum as sum -from nncf.tensor.functions.numeric import cumsum as cumsum from nncf.tensor.functions.numeric import tensor as tensor from nncf.tensor.functions.numeric import transpose as transpose from nncf.tensor.functions.numeric import unsqueeze as unsqueeze from nncf.tensor.functions.numeric import unstack as unstack from nncf.tensor.functions.numeric import var as var from nncf.tensor.functions.numeric import where as where -from nncf.tensor.functions.numeric import nonzero as nonzero from nncf.tensor.functions.numeric import zeros as zeros from nncf.tensor.functions.numeric import zeros_like as zeros_like diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index 398a52b0949..bd405757a73 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -562,15 +562,14 @@ def item(a: Tensor) -> Union[int, float, bool]: @tensor_dispatcher -def cumsum(a: Tensor, axis: int) -> Tensor: +def cumsum(a: Tensor, axis: Optional[Union[int, tuple[int, ...]]] = None) -> Tensor: """ - Return the cumulative sum of the elements along a given axis. + Cumulative sum of tensor elements over a given axis. :param a: The input tensor. - :param axis: Axis along which the cumulative sum is computed. - The default (None) is to compute the cumsum over the flattened array. - :return: A new tensor holding the result. The result has the same size as a, - and the same shape as a if axis is not None or a is a 1-d array. + :param axis: Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements + of the input tensor. + :return: Returns the cumulative sum of all elements in the input tensor in the given axis. """ @@ -587,17 +586,6 @@ def sum(a: Tensor, axis: Optional[Union[int, tuple[int, ...]]] = None, keepdims: :return: Returns the sum of all elements in the input tensor in the given axis. """ -@tensor_dispatcher -def cumsum(a: Tensor, axis: Optional[Union[int, tuple[int, ...]]] = None) -> Tensor: - """ - Cumulative sum of tensor elements over a given axis. - - :param a: The input tensor. - :param axis: Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements - of the input tensor. - :return: Returns the cumulative sum of all elements in the input tensor in the given axis. 
- """ - @tensor_dispatcher def multiply(x1: Tensor, x2: Union[Tensor, float]) -> Tensor: diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index 28ad743167e..b92894e562e 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -321,11 +321,6 @@ def _(a: T_NUMPY) -> T_NUMBER: return a.item() -@numeric.cumsum.register -def _(a: T_NUMPY, axis: int) -> T_NUMPY: - return np.cumsum(a, axis=axis) - - @numeric.sum.register def _(a: T_NUMPY, axis: T_AXIS = None, keepdims: bool = False) -> T_NUMPY_ARRAY: return np.array(np.sum(a, axis=axis, keepdims=keepdims)) diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index 7735a8f810d..82138496c08 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -209,9 +209,7 @@ def _( @numeric.nonzero.register -def _( - condition: torch.Tensor -) -> torch.Tensor: +def _(condition: torch.Tensor) -> torch.Tensor: return torch.nonzero(condition, as_tuple=True) @@ -348,11 +346,6 @@ def _(a: torch.Tensor) -> T_NUMBER: return a.item() -@numeric.cumsum.register -def _(a: torch.Tensor, axis: int) -> torch.Tensor: - return torch.cumsum(a, dim=axis) - - @numeric.sum.register def _(a: torch.Tensor, axis: T_AXIS = None, keepdims: bool = False) -> torch.Tensor: return torch.sum(a, dim=axis, keepdim=keepdims) From 8ea394655f96694aa301c4582dbfba3b98146345 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 9 Oct 2025 16:29:32 +0200 Subject: [PATCH 18/25] Removed unused code. --- .../weight_compression/codebook_estimation.py | 55 +++---------------- 1 file changed, 8 insertions(+), 47 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index e9c3e035ea5..2a648087351 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -13,9 +13,6 @@ from dataclasses import dataclass from typing import Optional, TypeVar -import openvino as ov -from openvino.runtime import opset13 as opset - import nncf from nncf.common.graph.graph import NNCFGraph from nncf.common.logging.track_progress import track @@ -39,19 +36,6 @@ TModel = TypeVar("TModel") -def fp8_convert(in_shape): - input = opset.parameter(in_shape, dtype=ov.Type.f32) - scale_convert = opset.convert(input, ov.Type.f8e4m3) - scale_convert = opset.convert(scale_convert, ov.Type.f32) - result = opset.result(scale_convert, name="Result") - result.get_output_tensor(0).set_names(set(["Result"])) - model = ov.Model([result], [input]) - - compiled_model = ov.compile_model(model) - - return compiled_model - - class CodebookEstimation: """ Codebook estimation algorithm implementation. @@ -94,7 +78,6 @@ def apply( all_weight_params: list[WeightCompressionParameters], statistics: dict[str, WCTensorStatistic], backend_entity: Optional[WeightCompressionAlgoBackend] = None, - debug=False, ) -> dict[str, CompressedWeight]: """ Estimates better codebook. 
@@ -134,18 +117,10 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - if debug: - qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) - print("Initial diff:", fns.mean(fns.abs(weight.data - qw.data))) - codebook, scale, indexes = self.calculate_codebook(stats, weight, wp.reduction_axes, config, wp) res[weight_name] = CompressedWeight(indexes, scale, None, codebook) config.codebook_values = codebook - if debug: - qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) - print("kmeans diff:", fns.mean(fns.abs(weight.data - qw.data))) - return res @staticmethod @@ -178,20 +153,19 @@ def calculate_codebook( codebook, indexes, variants = weights_clusterization_k_means(norm_weight, importance) - converter = fp8_convert(codebook.shape) indexes = indexes.reshape(orig_shape) - best_codebook = converter(codebook.as_openvino_tensor().data)[0] + best_codebook = codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3) fp_outs = fns.matmul(weight, X) diff = float("inf") variants[0] = fns.tensor(CB4_QUANTILES, backend=weight.backend, dtype=weight.dtype) - variants[1] = fns.tensor([i for i in range(-8, 8)], backend=weight.backend, dtype=weight.dtype) + variants[1] = fns.tensor(list(range(-8, 8)), backend=weight.backend, dtype=weight.dtype) best_i = -1 for i_var, var in enumerate(variants): - var = converter(var.as_openvino_tensor().data)[0] + var = var.as_openvino_tensor().astype(TensorDataType.f8e4m3) config.codebook_values = Tensor(var) qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) q_outs = fns.matmul(qw, X) @@ -207,7 +181,7 @@ def calculate_codebook( return Tensor(best_codebook), None, None -def round(quantiles, values): +def round_to_left(quantiles, values): center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) return fns.searchsorted(center_of_quantiles, values, side="left", sorter=None) @@ -262,7 +236,7 @@ def create_histogramm(data, granularity=0.01): prev += step centers = fns.tensor(centers) - centroid_idxs = round(centers, data) + centroid_idxs = round_to_left(centers, data) res = [[], [], []] for i in range(centers.size): @@ -294,9 +268,6 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): data = data_[sorted_idx] importance = importance[sorted_idx] - # data = np.array([data_, importance]) - # data = data[:, data[0, :].argsort()] - data_range = (data.min().item(), data.max().item()) prev = data_range[0] @@ -311,7 +282,7 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): centers = fns.tensor(centers, backend=data_.backend, dtype=data_.dtype) ranges = fns.tensor(ranges, backend=data_.backend, dtype=data_.dtype) - ranges_idxs = round(data, ranges) + ranges_idxs = round_to_left(data, ranges) res = [[], [], []] for i in range(centers.size): @@ -365,33 +336,25 @@ def fit(self, X_train, importance, init, fixed=None): if iteration % 5 == 0: self.variants.append(deepcopy(self.centroids)) - centroid_idxs = round(self.centroids, self.hist[0]) + centroid_idxs = round_to_left(self.centroids, self.hist[0]) for i in range(self.n_clusters): idxs = fns.nonzero(centroid_idxs == i) self.centroids[i] = fns.sum(self.hist[1][idxs]).item() / fns.sum(self.hist[2][idxs]).item() - # for i, centroid in enumerate(self.centroids): - # if np.isnan(centroid).any(): # Catch any np.nans, resulting from a centroid having no points - # self.centroids[i] = prev_centroids[i] for idx in fixed: self.centroids[idx] = init[idx] iteration += 1 if 
fns.any(fns.all(fns.abs(self.centroids - prev_centroids) < 0.00001)): break - # if np.all(np.abs(self.centroids - prev_centroids) < 0.00001).any(): - # break self.variants.append(deepcopy(self.centroids)) def evaluate(self, X): - centroid_idxs = round(self.centroids, X) + centroid_idxs = round_to_left(self.centroids, X) return deepcopy(self.centroids).flatten(), centroid_idxs def weights_clusterization_k_means(weight, importance, n_centroids=2**4): - # weight = weight.as_numpy_tensor().data - # importance = importance.as_numpy_tensor().data - ow = deepcopy(weight) orig_shape = weight.shape weight = weight.flatten() @@ -408,8 +371,6 @@ def weights_clusterization_k_means(weight, importance, n_centroids=2**4): indexes = fns.reshape(indexes, orig_shape) - # print(orig_shape, np.mean(np.abs(ow - codebook[indexes]))) - print(orig_shape, fns.mean(fns.abs(ow - codebook[indexes]))) return codebook, indexes, kmeans.variants From 037a255c2ebc3898c638d206ee5c39cb449f3e17 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 10 Oct 2025 11:07:11 +0200 Subject: [PATCH 19/25] Fixed bug with close centroids. --- .../weight_compression/codebook_estimation.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 2a648087351..80fd9a3ccbb 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -162,9 +162,8 @@ def calculate_codebook( variants[0] = fns.tensor(CB4_QUANTILES, backend=weight.backend, dtype=weight.dtype) variants[1] = fns.tensor(list(range(-8, 8)), backend=weight.backend, dtype=weight.dtype) - best_i = -1 - for i_var, var in enumerate(variants): + for var in variants: var = var.as_openvino_tensor().astype(TensorDataType.f8e4m3) config.codebook_values = Tensor(var) qw = float_quantize_dequantize_weight(weight, config, wp.reduction_axes) @@ -174,9 +173,6 @@ def calculate_codebook( if cur_diff < diff: diff = cur_diff best_codebook = var - best_i = i_var - - print("Best codebook:", best_codebook, "diff:", diff, "best_i:", best_i) return Tensor(best_codebook), None, None @@ -211,15 +207,21 @@ def get_init(values, frequencies, n_clusters): n_frequencies = fns.cumsum(n_frequencies) res = fns.zeros((n_clusters,), backend=values.backend, dtype=values.dtype) - for i in range(len(quants)): + for i in range(n_clusters): if i == 0: res[i] = values[0] - elif i == len(quants) - 1: + elif i == n_clusters - 1: res[i] = values[-1] else: - prev = values[fns.nonzero(n_frequencies <= quants[i])[0][-1].item()].item() - next_ = values[fns.nonzero(n_frequencies <= quants[i + 1])[0][-1].item()].item() - res[i] = (prev + next_) / 2 + prev_val = values[fns.nonzero(n_frequencies <= quants[i])[0][-1].item()].item() + next_val = values[fns.nonzero(n_frequencies <= quants[i + 1])[0][-1].item()].item() + res[i] = (prev_val + next_val) / 2 + + # avoid close centroids + th = 0.05 + for i in range(1, n_clusters - 1): + if (res[i] - res[i + 1]).abs() / max(res[i].abs(), res[i + 1].abs()) < th: + res[i] = (res[i - 1] + res[i + 1]) / 2 return res @@ -235,7 +237,7 @@ def create_histogramm(data, granularity=0.01): centers.append(prev + step / 2) prev += step - centers = fns.tensor(centers) + centers = fns.tensor(centers, backend=data.backend) centroid_idxs = round_to_left(centers, data) res = [[], [], []] @@ -247,9 +249,9 @@ def 
create_histogramm(data, granularity=0.01): res[1].append(fns.sum(data[idxs])) res[2].append(len(idxs[0])) - res[0] = fns.tensor(res[0]) # centers of histogram bins - res[1] = fns.tensor(res[1]) # sum of values in each bin - res[2] = fns.tensor(res[2]) # count of values in each bin + res[0] = fns.tensor(res[0], backend=data.backend) # centers of histogram bins + res[1] = fns.tensor(res[1], backend=data.backend) # sum of values in each bin + res[2] = fns.tensor(res[2], backend=data.backend) # count of values in each bin return res From 817a790407691142d287d75a7459cd612e422317 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 13 Oct 2025 13:18:45 +0200 Subject: [PATCH 20/25] Fixed error with argmin/cumsum args. --- .../weight_compression/codebook_estimation.py | 14 ++++++-------- src/nncf/tensor/functions/numeric.py | 13 +++++++------ src/nncf/tensor/functions/numpy_numeric.py | 6 +++--- src/nncf/tensor/functions/torch_numeric.py | 2 +- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py index 80fd9a3ccbb..33f1bd81321 100644 --- a/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/codebook_estimation.py @@ -197,6 +197,7 @@ def __init__(self, n_clusters=8, max_iter=300): self.n_clusters = n_clusters self.max_iter = max_iter self.variants = [] + self.centroids = None @staticmethod def get_init(values, frequencies, n_clusters): @@ -204,7 +205,7 @@ def get_init(values, frequencies, n_clusters): denum = fns.sum(frequencies) quants = [i * step for i in range(n_clusters)] n_frequencies = frequencies / denum - n_frequencies = fns.cumsum(n_frequencies) + n_frequencies = fns.cumsum(n_frequencies, axis=0) res = fns.zeros((n_clusters,), backend=values.backend, dtype=values.dtype) for i in range(n_clusters): @@ -305,7 +306,7 @@ def create_histogramm_sorted(data_, importance, granularity=0.01): importance[ranges_idxs[idx - 1].item() : ranges_idxs[idx + 1].item()], ) - res[0] = centers # fns.tensor(res[0], backend=data_.backend, dtype=data_.dtype) # centers of histogram bins + res[0] = centers res[1] = fns.tensor(res[1], backend=data_.backend, dtype=data_.dtype) res[2] = fns.tensor(res[2], backend=data_.backend, dtype=data_.dtype) @@ -323,8 +324,8 @@ def fit(self, X_train, importance, init, fixed=None): init_by_hist = self.get_init(self.hist[0], self.hist[2], self.n_clusters) init_by_hist[0] = init[0] init_by_hist[-1] = init[-1] - zero_idx = fns.argmin(fns.abs(init_by_hist[:])) - init_by_hist[zero_idx] = 0.0 # init[0, zero_idx] + zero_idx = fns.argmin(fns.abs(init_by_hist[:]), axis=0).item() + init_by_hist[zero_idx] = 0.0 # to have zero in codebook fixed[1] = zero_idx init = init_by_hist @@ -357,7 +358,6 @@ def evaluate(self, X): def weights_clusterization_k_means(weight, importance, n_centroids=2**4): - ow = deepcopy(weight) orig_shape = weight.shape weight = weight.flatten() importance = importance.flatten() @@ -369,10 +369,8 @@ def weights_clusterization_k_means(weight, importance, n_centroids=2**4): kmeans = KMeansWeighted(n_centroids, max_iter=70) kmeans.fit(weight, importance, n_init, fixed=[0, 7, 15]) - codebook, indexes = kmeans.evaluate(weight) # .reshape(-1, 1)) + codebook, indexes = kmeans.evaluate(weight) indexes = fns.reshape(indexes, orig_shape) - print(orig_shape, fns.mean(fns.abs(ow - codebook[indexes]))) - return codebook, indexes, 
kmeans.variants diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index bd405757a73..1c34850c1d1 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -562,14 +562,15 @@ def item(a: Tensor) -> Union[int, float, bool]: @tensor_dispatcher -def cumsum(a: Tensor, axis: Optional[Union[int, tuple[int, ...]]] = None) -> Tensor: +def cumsum(a: Tensor, axis: int) -> Tensor: """ - Cumulative sum of tensor elements over a given axis. + Return the cumulative sum of the elements along a given axis. :param a: The input tensor. - :param axis: Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements - of the input tensor. - :return: Returns the cumulative sum of all elements in the input tensor in the given axis. + :param axis: Axis along which the cumulative sum is computed. + The default (None) is to compute the cumsum over the flattened array. + :return: A new tensor holding the result. The result has the same size as a, + and the same shape as a if axis is not None or a is a 1-d array. """ @@ -672,7 +673,7 @@ def argsort(a: Tensor, axis: int = -1, descending: bool = False, stable: bool = @tensor_dispatcher -def argmin(a: Tensor, axis: T_AXIS = None) -> Tensor: +def argmin(a: Tensor, axis: None) -> Tensor: """ Returns the indices of the minimum values along an axis. diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index b92894e562e..e673da134d8 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -327,8 +327,8 @@ def _(a: T_NUMPY, axis: T_AXIS = None, keepdims: bool = False) -> T_NUMPY_ARRAY: @numeric.cumsum.register -def _(a: T_NUMPY, axis: T_AXIS = None) -> T_NUMPY_ARRAY: - return np.array(np.cumsum(a, axis=axis)) +def _(a: T_NUMPY, axis: int) -> T_NUMPY: + return np.cumsum(a, axis=axis) @numeric.multiply.register @@ -376,7 +376,7 @@ def _(a: T_NUMPY, axis: int = -1, descending: bool = False, stable: bool = False @numeric.argmin.register -def _(a: T_NUMPY, axis: T_AXIS = None) -> T_NUMPY: +def _(a: T_NUMPY, axis: None) -> T_NUMPY: return np.argmin(a, axis=axis) diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index 82138496c08..fb484b20f77 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -352,7 +352,7 @@ def _(a: torch.Tensor, axis: T_AXIS = None, keepdims: bool = False) -> torch.Ten @numeric.cumsum.register -def _(a: torch.Tensor, axis: T_AXIS = None) -> torch.Tensor: +def _(a: torch.Tensor, axis: int) -> torch.Tensor: return torch.cumsum(a, dim=axis) From be6029a9de91ac1be40ec8bac280832f3cbb63a0 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 13 Oct 2025 13:32:13 +0200 Subject: [PATCH 21/25] Removed unused fuction. 
--- src/nncf/tensor/functions/torch_numeric.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index fb484b20f77..9000be673f3 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -208,11 +208,6 @@ def _( return torch.where(condition, x, y) -@numeric.nonzero.register -def _(condition: torch.Tensor) -> torch.Tensor: - return torch.nonzero(condition, as_tuple=True) - - @numeric.zeros_like.register def _(a: torch.Tensor) -> torch.Tensor: return torch.zeros_like(a) From 58a64d87fb8d7bac50c860e232f2b9a4f26d2028 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 13 Oct 2025 13:50:42 +0200 Subject: [PATCH 22/25] Fix. --- src/nncf/tensor/functions/numeric.py | 2 +- src/nncf/tensor/functions/numpy_numeric.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index 1c34850c1d1..d4f0c947e4f 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -325,7 +325,7 @@ def where(condition: Tensor, x: Union[Tensor, float], y: Union[Tensor, float]) - @tensor_dispatcher -def nonzero(condition: Tensor) -> Tensor: +def nonzero(condition: Tensor) -> tuple[Tensor, ...]: """ Return the indices of the elements that are non-zero. diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index e673da134d8..75cae23b4e2 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -201,7 +201,7 @@ def _( @numeric.zeros_like.register -def _(a: T_NUMPY) -> T_NUMPY_ARRAY: +def _(a: T_NUMPY) -> tuple[T_NUMPY_ARRAY]: return np.zeros_like(a) From ffe0cf4592f881972a190129fb4193a59c5749e8 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 13 Oct 2025 14:00:47 +0200 Subject: [PATCH 23/25] Fix. --- src/nncf/tensor/functions/numpy_numeric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index 75cae23b4e2..d6562f5b268 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -196,12 +196,12 @@ def _( @numeric.nonzero.register def _( condition: T_NUMPY, -) -> T_NUMPY_ARRAY: +) -> tuple[T_NUMPY_ARRAY, ...]: return np.nonzero(condition) @numeric.zeros_like.register -def _(a: T_NUMPY) -> tuple[T_NUMPY_ARRAY]: +def _(a: T_NUMPY) -> T_NUMPY_ARRAY: return np.zeros_like(a) From 72af4fddff254fc2f3b5e96e5a2fcb84d9087bf9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 13 Oct 2025 14:40:48 +0200 Subject: [PATCH 24/25] Fix. 
--- .../algorithms/weight_compression/scale_estimation.py | 2 +- .../algorithms/weight_compression/weight_lowering.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 11172dcd4de..e74320af17e 100644 --- a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -227,7 +227,7 @@ def calculate_quantization_params( # all weight in group has importance based on corresponding input activations importance = fns.ones_like(original_weight) - # s = s**2 + importance = importance * s target, zero_mask = get_target_zero_mask(compressed_weights, zp) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0754e2ae44a..1d462b019b6 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -78,7 +78,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, signed: bool = True + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, signed: bool = False ) -> Tensor: """ Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization. From c6f72eed071bd34f857b6ade8b9c08c1cdefe731 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 13 Oct 2025 16:37:43 +0200 Subject: [PATCH 25/25] Fixed bug with codebook type.. --- src/nncf/openvino/quantization/quantize_model.py | 2 ++ .../algorithms/weight_compression/algorithm.py | 13 ++++++++++--- .../weight_compression/openvino_backend.py | 2 +- src/nncf/quantization/quantize_model.py | 7 ++++++- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/nncf/openvino/quantization/quantize_model.py b/src/nncf/openvino/quantization/quantize_model.py index 4ac077a17d7..e329a7f3154 100644 --- a/src/nncf/openvino/quantization/quantize_model.py +++ b/src/nncf/openvino/quantization/quantize_model.py @@ -376,6 +376,7 @@ def compress_weights_impl( scale_estimation: bool, gptq: bool, lora_correction: bool, + codebook_estimation: bool, backup_mode: BackupMode, compression_format: CompressionFormat, advanced_parameters: Optional[AdvancedCompressionParameters] = None, @@ -397,6 +398,7 @@ def compress_weights_impl( scale_estimation, gptq, lora_correction, + codebook_estimation, backup_mode, compression_format, advanced_parameters, diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 8a84e88402a..d02de7e33ab 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -87,6 +87,7 @@ def get_weight_compression_configuration( scale_estimation: Optional[bool] = None, gptq: Optional[bool] = None, lora_correction: Optional[bool] = None, + codebook_estimation: Optional[bool] = None, ignored_scope: Optional[IgnoredScope] = None, sensitivity_metric: Optional[SensitivityMetric] = None, backup_mode: Optional[BackupMode] = None, @@ -112,6 +113,7 @@ def get_weight_compression_configuration( "scale_estimation": scale_estimation or False, "gptq": gptq or False, "lora_correction": 
lora_correction or False, + "codebook_estimation": codebook_estimation or False, "ignored_scope": ignored_scope or IgnoredScope(), "sensitivity_metric": ( ( @@ -138,6 +140,7 @@ def check_user_compression_configuration( scale_estimation: Optional[bool], gptq: Optional[bool], lora_correction: Optional[bool], + codebook_estimation: Optional[bool], ignored_scope: Optional[IgnoredScope], sensitivity_metric: Optional[SensitivityMetric], backup_mode: Optional[BackupMode], @@ -168,6 +171,7 @@ def check_user_compression_configuration( "gptq": gptq, "lora_correction": lora_correction, "backup_mode": backup_mode, + "codebook_estimation": codebook_estimation, } unsupported_for_int8 = [name for name, value in unsupported_options.items() if value is not None] if unsupported_for_int8: @@ -281,6 +285,7 @@ def __init__( scale_estimation: bool, gptq: bool, lora_correction: bool, + codebook_estimation: bool, backup_mode: BackupMode = BackupMode.INT8_ASYM, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: Optional[AdvancedCompressionParameters] = None, @@ -340,6 +345,7 @@ def __init__( self._scale_estimation = scale_estimation self._gptq = gptq self._lora_correction = lora_correction + self._codebook_estimation = codebook_estimation self._backup_mode = backup_mode self._compression_format = compression_format self._advanced_parameters = ( @@ -380,7 +386,8 @@ def __init__( scale_estimation_params.weight_penalty, ) - self._codebook_estimation_algo = CodebookEstimation() + if self._codebook_estimation: + self._codebook_estimation_algo = CodebookEstimation() self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -390,7 +397,7 @@ def __init__( or self._scale_estimation or self._lora_correction or self._gptq - or self._codebook_estimation_algo + or self._codebook_estimation ) @property @@ -942,7 +949,7 @@ def apply( lora_correction_algo = None description = "Applying Weight Compression" - if self._mode == CompressWeightsMode.CODEBOOK: + if self._codebook_estimation: precomputed_compressed_weights = self._codebook_estimation_algo.apply( model=model, graph=graph, diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 700b64a83ed..eebd7600cd6 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -259,7 +259,7 @@ def _create_compression_subgraph( n_quants = compressed_weight.codebook.size - 1 compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) converted_const = create_ov_codebook_subgraph( - compressed_weight.codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3), + compressed_weight.codebook.as_openvino_tensor(), indexes=compressed_weight.tensor, dtype=compression_dtype, name=const_node_name, diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index 8d89bd19821..f0cc32e69c1 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -436,6 +436,7 @@ def compress_weights( scale_estimation: Optional[bool] = None, gptq: Optional[bool] = None, lora_correction: Optional[bool] = None, + codebook_estimation: Optional[bool] = None, backup_mode: Optional[BackupMode] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: 
Optional[AdvancedCompressionParameters] = None, @@ -580,6 +581,7 @@ def compress_weights( options = { "gptq": gptq, "lora_correction": lora_correction, + "codebook_estimation": codebook_estimation, } unsupported_options = [name for name, value in options.items() if value is not None] if unsupported_options: @@ -606,7 +608,7 @@ def compress_weights( elif backend == BackendType.OPENVINO: from nncf.openvino.quantization.quantize_model import compress_weights_impl as ov_compress_weights_impl - if any((scale_estimation, gptq, lora_correction)) and dataset is None: + if any((scale_estimation, gptq, lora_correction, codebook_estimation)) and dataset is None: msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) @@ -645,6 +647,7 @@ def compress_weights( options = { "gptq": gptq, "lora_correction": lora_correction, + "codebook_estimation": codebook_estimation, } unsupported_options = [name for name, value in options.items() if value is not None] if unsupported_options: @@ -670,6 +673,7 @@ def compress_weights( scale_estimation, gptq, lora_correction, + codebook_estimation, ignored_scope, sensitivity_metric, backup_mode, @@ -686,6 +690,7 @@ def compress_weights( scale_estimation, gptq, lora_correction, + codebook_estimation, ignored_scope, sensitivity_metric, backup_mode,
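
As a usage reference for the option these patches introduce: `compress_weights` gains a `codebook_estimation` flag that, like scale estimation and GPTQ, is rejected when no dataset is given, and when enabled it runs `CodebookEstimation.apply()` and feeds the result to `compress_weight()` as `precomputed_compressed_weights` (see the algorithm.py hunks above). A minimal invocation sketch follows, assuming an OpenVINO model `ov_model` and an iterable of calibration inputs `calibration_items`; both names are placeholders for this sketch and do not come from the patches.

    import nncf
    from nncf import CompressWeightsMode

    # Placeholders: `ov_model` is an openvino.Model, `calibration_items` yields
    # example model inputs; neither is defined by the patch series itself.
    dataset = nncf.Dataset(calibration_items)

    compressed_model = nncf.compress_weights(
        ov_model,
        mode=CompressWeightsMode.CODEBOOK,  # codebook-based weight compression
        codebook_estimation=True,           # flag added by this patch series
        dataset=dataset,                    # required: codebook estimation is data-aware
    )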