From 488cacc2be70b7ae7e417c555d2aeea29163f5b6 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Mon, 10 Jun 2024 19:17:08 +0400 Subject: [PATCH 01/68] Support scale estimation inside GPTQ --- .../algorithms/layerwise/scheduler.py | 34 +- .../weight_compression/activation_stats.py | 7 +- .../weight_compression/algorithm.py | 59 ++-- .../algorithms/weight_compression/gptq.py | 41 ++- .../weight_compression/scale_estimation.py | 316 ++++++++++-------- nncf/quantization/quantize_model.py | 5 - .../openvino/native/quantization/test_gptq.py | 5 +- .../quantization/test_weights_compression.py | 5 +- 8 files changed, 271 insertions(+), 201 deletions(-) diff --git a/nncf/quantization/algorithms/layerwise/scheduler.py b/nncf/quantization/algorithms/layerwise/scheduler.py index 8eee99fad28..8abc03400c0 100644 --- a/nncf/quantization/algorithms/layerwise/scheduler.py +++ b/nncf/quantization/algorithms/layerwise/scheduler.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass from dataclasses import field @@ -177,26 +178,31 @@ def schedule( old_input_nodes = set() new_input_nodes = set() for p in paths: - target_output_nodes = set() + target_outputs = [] additional_output_nodes = set() for output_node in p.output_nodes: - if output_node in target_nodes: - target_output_nodes.add(output_node) - elif output_node in p.input_nodes: - reuse_input_nodes.add(output_node) - else: - # filter additional output nodes - for prev_node in inference_graph.get_previous_nodes(output_node): - if prev_node not in p.output_nodes: - additional_output_nodes.add(output_node) - break - if not target_output_nodes: + try: + target_node_index = target_nodes.index(output_node) + target_outputs.append((target_node_index, output_node)) + except ValueError: + if output_node in p.input_nodes: + reuse_input_nodes.add(output_node) + else: + # filter additional output nodes + for prev_node in inference_graph.get_previous_nodes(output_node): + if prev_node not in p.output_nodes: + additional_output_nodes.add(output_node) + break + if not target_outputs: continue + target_outputs.sort(key=lambda target_output: target_output[0]) + target_output_nodes = [output[1] for output in target_outputs] + old_input_nodes |= p.input_nodes - new_input_nodes |= target_output_nodes | additional_output_nodes + new_input_nodes |= set(target_output_nodes) | additional_output_nodes subgraph_inputs = list(p.inputs) - step_target_nodes = {} + step_target_nodes = OrderedDict() subgraph_outputs = [] for node in target_output_nodes: target_edge = {} diff --git a/nncf/quantization/algorithms/weight_compression/activation_stats.py b/nncf/quantization/algorithms/weight_compression/activation_stats.py index eb8286e6383..359887e7769 100644 --- a/nncf/quantization/algorithms/weight_compression/activation_stats.py +++ b/nncf/quantization/algorithms/weight_compression/activation_stats.py @@ -9,14 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Tuple, TypeVar +from typing import List, Tuple +from nncf.tensor import Tensor from nncf.tensor import functions as fns -TTensor = TypeVar("TTensor") - -def process_stats(stats: List[TTensor], subset_size: int) -> Tuple[TTensor, TTensor]: +def process_stats(stats: List[Tensor], subset_size: int) -> Tuple[Tensor, Tensor]: """ It's a processing of activations shared between AWQ, Scale Estimation and LoRA Correction algorithms. diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 3499521bce3..1b2af0fd9a3 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -124,7 +124,12 @@ def __init__( if self._gptq: gptq_params = self._advanced_parameters.gptq_params - self._gptq_algo = GPTQ(gptq_params.damp_percent, gptq_params.block_size, gptq_params.subset_size) + self._gptq_algo = GPTQ( + damp_percent=gptq_params.damp_percent, + block_size=gptq_params.block_size, + subset_size=gptq_params.subset_size, + scale_estimation=self._scale_estimation, + ) self._gptq_statistics = None @property @@ -379,25 +384,8 @@ def apply( scales = {} zero_points = {} - if ( - self._scale_estimation - and activations is not None - and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] - ): - scale_estimation_params = self._advanced_parameters.scale_estimation_params - scale_algo = ScaleEstimation( - model, - self._backend_entity.name_to_node_mapping, - all_weight_params, - nodes_to_compress, - activations, - scale_estimation_params.subset_size, - scale_estimation_params.initial_steps, - scale_estimation_params.scale_steps, - scale_estimation_params.weight_penalty, - ) - scales = scale_algo.apply(model, graph) - + lora_correction_algo = None + description = "Applying Weight Compression" if self._gptq: model, scales, zero_points = self._gptq_algo.apply( model=model, @@ -407,13 +395,30 @@ def apply( statistic_points=self._gptq_statistics, backend_entity=self._backend_entity, ) + else: + if ( + self._scale_estimation + and activations is not None + and self._mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + ): + scale_estimation_params = self._advanced_parameters.scale_estimation_params + scale_algo = ScaleEstimation( + model, + self._backend_entity.name_to_node_mapping, + all_weight_params, + nodes_to_compress, + activations, + scale_estimation_params.subset_size, + scale_estimation_params.initial_steps, + scale_estimation_params.scale_steps, + scale_estimation_params.weight_penalty, + ) + scales = scale_algo.apply(model, graph) - lora_correction_algo = None - description = "Applying Weight Compression" - if self._lora_correction: - lora_correction_params = self._advanced_parameters.lora_correction_params - lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) - description += " with correction of low-rank adapters" + if self._lora_correction: + lora_correction_params = self._advanced_parameters.lora_correction_params + lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params) + description += " with correction of low-rank adapters" # Sort weight params to start compression with the bigger constants. This lowers peak memory footprint. 
all_weight_params = sorted(all_weight_params, key=lambda wp: wp.num_weights, reverse=True) @@ -542,7 +547,7 @@ def _get_activations( statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset) statistics_aggregator.register_statistic_points(statistic_container) - if self._gptq: + if self._gptq and not self._awq: self._gptq_statistics = self._gptq_algo.get_statistic_points( model, graph, nodes_to_compress, self._backend_entity ) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b595e080533..b1101916da3 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -25,6 +25,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight @@ -44,10 +45,7 @@ class GPTQ: """ def __init__( - self, - damp_percent: float = 0.1, - block_size: int = 128, - subset_size: int = 128, + self, damp_percent: float = 0.1, block_size: int = 128, subset_size: int = 128, scale_estimation: bool = False ): """ :param damp_percent: The percent of the average Hessian diagonal to use for dampening, @@ -58,6 +56,7 @@ def __init__( self._damp_percent = damp_percent self._block_size = block_size self._subset_size = subset_size + self._scale_estimation = scale_estimation self._backend = None self._backend_entity = None @@ -124,10 +123,9 @@ def apply( CompressWeightsMode.INT8_SYM, ]: continue - assert len(inputs) == 1 _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) - scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian) + scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) scales[wc_params.weight_name] = scale zero_points[wc_params.weight_name] = zero_point @@ -193,7 +191,12 @@ def _calculate_hessian(self, node: NNCFNode, inputs: List[Tensor]) -> Tensor: return hessian def _quantize_weights( - self, model: TModel, graph: NNCFGraph, wc_params: WeightCompressionParameters, hessian: Tensor + self, + model: TModel, + graph: NNCFGraph, + wc_params: WeightCompressionParameters, + hessian: Tensor, + inputs: List[Tensor], ): """ Quantizes the weights of the model based on the calculated Hessian matrix. 
@@ -260,11 +263,25 @@ def _quantize_weights( scale = calculate_nf4_scale(weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes) scales.append(scale) else: - scale, zero_point = calculate_integer_quantization_params( - weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config - ) - scales.append(scale) - zero_points.append(zero_point) + if self._scale_estimation and block_compression_config.num_bits == 4: + activations = [inp.squeeze()[:, (i1 + i) : (i1 + i + group_size)] for inp in inputs] + scale, zero_point = ScaleEstimation.calculate_quantization_params( + self._backend_entity, + activations, + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + wc_params.compression_config, + ) + scales.append(scale.squeeze(axis=1)) + zero_points.append(zero_point) + else: + scale, zero_point = calculate_integer_quantization_params( + weight_tensor[:, (i1 + i) : (i1 + i + group_size)], + reduction_axes, + block_compression_config, + ) + scales.append(scale) + zero_points.append(zero_point) if block_compression_config.mode == CompressWeightsMode.NF4: compressed_weights = do_nf4_quantization( fns.unsqueeze(weight_col, 1), scales[-1], is_normalized_weight=False diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 6d1110c108f..712c5fd955d 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,16 +20,17 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns TModel = TypeVar("TModel") -TTensor = TypeVar("TTensor") -TWeightType = TypeVar("TWeightType") class ScaleEstimation: @@ -37,13 +38,15 @@ class ScaleEstimation: Scale estimation algorithm implementation. """ + compress_decompress_cache = {} + def __init__( self, model: TModel, name_to_node_mapping: Dict[str, Any], all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode], - activations: Optional[Dict[str, TTensor]] = None, + activations: Optional[Dict[str, List[Tensor]]] = None, subset_size: int = 32, initial_steps: int = 5, scale_steps: int = 10, @@ -103,7 +106,7 @@ def apply( graph: NNCFGraph, statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, - ) -> Dict[str, TTensor]: + ) -> Dict[str, Tensor]: """ Estimates better scale for the int4 nodes in the model. Minimizes per-group difference between floating point MatMul and @@ -118,8 +121,7 @@ def apply( :return: Dict with pairs (weight name, estimated scale). 
""" - compress_decompress_cache = {} - res = dict() + scales = dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -127,11 +129,10 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in self._activations: - res[weight_name] = None + scales[weight_name] = None continue - s, X = process_stats(self._activations[node_name], self._subset_size) - reduction_axis = wp.reduction_axes[0] + stats = self._activations[node_name] weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm @@ -139,162 +140,211 @@ def apply( _, weight_port_id = weight_data[0] weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - weight = weight.astype(TensorDataType.float32) - eps = fns.finfo(weight).eps - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 + scales[weight_name], _ = self.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size + return scales - original_weight = fns.zeros_like(weight) + weight + @staticmethod + def calculate_quantization_params( + backend_entity: WeightCompressionAlgoBackend, + activations: List[Tensor], + weight: Tensor, + reduction_axes: Tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + """ + Calculates the quantization parameters for a given set of weights and activations. + This function estimates the optimal quantization scale for weight compression by + minimizing the difference between floating-point operations and operations with + quantized weights. + + The function uses an iterative process: + 1. Initial scale rectification based on activation statistics. + 2. A grid search to further refine the scale parameters. + + :param backend_entity: The backend-specific implementation of the weight compression algorithm. + :param activations: List of activation tensors corresponding to the layers being quantized. + :param weight: The weight tensor that is being quantized. + :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. + :param config: Configuration parameters for the weight compression, including quantization settings. + :param subset_size: The number of samples to use for scale estimation. Defaults to 32. + :param initial_steps: The number of steps for initial scale rectification using activation statistics. + Defaults to 5. + :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. + :param weight_penalty: Penalty coefficient applied to the difference between floating-point + and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. + :return: A tensor containing the calculated quantization scales and zero points if applicable. 
+ """ + reduction_axis = reduction_axes[0] - compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) - if zp is not None: - zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) + s, X = process_stats(activations, subset_size) - s = fns.unsqueeze(s, 0) - s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) + weight = weight.astype(TensorDataType.float32) + eps = fns.finfo(weight).eps - original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 - # all weight in group has importance based on corresponding input activations - importance = fns.ones_like(original_weight) - importance = importance * s + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - importance = fns.where(zero_mask, 0.0, importance) - - # normalize importances for every group of weights to make sum of them equal to 1.0 - denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) - - X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) - best_diffs = None - result_scale = None - - fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) - q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - - # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE - min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - min_max_scale_diffs += self._weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - - zp_shape = zp.shape if zp is not None else None - key = [(wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape] - if zp is not None: - key += zp_shape - key = tuple(key) - if key in compress_decompress_cache: - compress_decompress_model = compress_decompress_cache[key]["compress_decompress_model"] - compress_model = compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = self._backend_entity.get_compress_decompress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = self._backend_entity.get_compress_pipeline( - wp.compression_config, q_weights.shape, scale.shape, zp_shape - ) - compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } - - scale_sign = scale / fns.abs(scale) - zero_scale = 0.001 - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + original_weight = fns.zeros_like(weight) + weight - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) - # iterative rectification of initial scale - for i in range(self._initial_steps): - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data + compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) + if zp is not None: + zp = 
zp.astype(scale.dtype) + q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + s = fns.unsqueeze(s, 0) + s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) - if best_diffs is None: - best_diffs = min_max_scale_diffs + # all weight in group has importance based on corresponding input activations + importance = fns.ones_like(original_weight) + importance = importance * s - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + importance = fns.where(zero_mask, 0.0, importance) - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + # normalize importances for every group of weights to make sum of them equal to 1.0 + denum = fns.sum(importance, axis=2, keepdims=True) + importance = importance / (denum + eps) - mask = fns.unsqueeze(mask, axis=2) + X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) + best_diffs = None + result_scale = None - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data + fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) + q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - if i < self._initial_steps - 1: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE + min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - # iterative rectification of scale based on grid search - for scale_steps in range(self._scale_steps): - factor = 1.0 - 0.05 * scale_steps - scaled_scale = factor * scale + zp_shape = zp.shape if zp is not None else None + key = (config.mode, config.num_bits) + q_weights.shape + scale.shape + if zp is not None: + key += zp_shape + if key in ScaleEstimation.compress_decompress_cache: + compress_decompress_model = ScaleEstimation.compress_decompress_cache[key]["compress_decompress_model"] + compress_model = ScaleEstimation.compress_decompress_cache[key]["compress_model"] + else: + compress_decompress_model = backend_entity.get_compress_decompress_pipeline( + config, q_weights.shape, scale.shape, zp_shape + ) + compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) + ScaleEstimation.compress_decompress_cache[key] = { + 
"compress_decompress_model": compress_decompress_model, + "compress_model": compress_model, + } + scale_sign = scale / fns.abs(scale) + zero_scale = 0.001 + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) + # iterative rectification of initial scale + for i in range(initial_steps): + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + input_tensors[1] = near_to_ideal_scale.data + + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + if best_diffs is None: + best_diffs = min_max_scale_diffs + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data - input_tensors[1] = scaled_scale.data + if i < initial_steps - 1: out = compress_model(input_tensors) compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out + # iterative rectification of scale based on grid search + for scale_steps in range(scale_steps): + factor = 1.0 - 0.05 * scale_steps + scaled_scale = factor * scale + + input_tensors[1] = scaled_scale.data + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if self._weight_penalty > 0.0: - ideal_scale_diffs += self._weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + + input_tensors[1] = near_to_ideal_scale.data + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) 
- best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - mask = fns.unsqueeze(mask, axis=2) + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale - if config.group_size == -1: - result_scale = fns.squeeze(result_scale, axis=1) - res[weight_name] = result_scale + if config.group_size == -1: + result_scale = fns.squeeze(result_scale, axis=1) - return res + return result_scale, zp -def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = None) -> Tuple[TTensor, TTensor]: +def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: """ Computes the target values and a mask indicating zero values in the target. @@ -310,7 +360,7 @@ def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = No return target, zero_mask -def estimate_scales(weight: TTensor, target: TTensor, zero_mask: TTensor, importance: TTensor) -> TTensor: +def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: """ Estimates scales for the given weight, target, zero mask, and importance. diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index e96c4526c51..60baeacc48e 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -482,11 +482,6 @@ def compress_weights( if any((gptq, lora_correction)) and (dataset is None or mode == CompressWeightsMode.E2M1): raise AttributeError("GPTQ or Lora Correction algorithm is defined, but dataset is None or mode is E2M1.") - if gptq and scale_estimation: - raise AttributeError( - "Simultaneous use of Scale estimation and GPTQ algorithms is not supported. Select one of them." - ) - if gptq and lora_correction: raise AttributeError( "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them." 
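With the gptq/scale_estimation exclusivity check removed above, both algorithms can now be requested together; scale estimation is then performed per weight group inside GPTQ instead of as a separate pass. A minimal usage sketch (nncf's public compress_weights API as referenced in this patch; model and calibration_dataset are placeholders):

    import nncf

    compressed_model = nncf.compress_weights(
        model,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        dataset=calibration_dataset,  # calibration data is required for GPTQ
        gptq=True,
        scale_estimation=True,  # previously rejected in combination with gptq=True
    )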
diff --git a/tests/openvino/native/quantization/test_gptq.py b/tests/openvino/native/quantization/test_gptq.py index 1202b216ec7..ad19990eac0 100644 --- a/tests/openvino/native/quantization/test_gptq.py +++ b/tests/openvino/native/quantization/test_gptq.py @@ -341,7 +341,8 @@ def test_calculate_scale_linear(): gptq._set_backend_entity(ov_model) nodes = graph.get_all_nodes() - H = gptq._calculate_hessian(nodes[1], [Tensor(inp) for inp in inputs]) + wrapped_inputs = [Tensor(inp) for inp in inputs] + H = gptq._calculate_hessian(nodes[1], wrapped_inputs) ref_H = ref_gptq.H.numpy() assert np.all(np.isclose(ref_H, H.data)) @@ -351,7 +352,7 @@ def test_calculate_scale_linear(): ) wc_params.compression_config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_SYM, group_size=16) - scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H) + scale, _ = gptq._quantize_weights(ov_model, graph, wc_params, H, wrapped_inputs) ref_scale = ref_scale.numpy() scale = scale.reshape(ref_scale.shape) assert np.all(np.isclose(ref_scale, scale.data)) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index bb9b5c373c7..c51cf667ca2 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -713,10 +713,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize("mode", INT4_MODES) @pytest.mark.parametrize( "params", - ( - {"dataset": "anything", "scale_estimation": True, "gptq": True}, - {"dataset": "anything", "lora_correction": True, "gptq": True}, - ), + ({"dataset": "anything", "lora_correction": True, "gptq": True},), ) def test_raise_error_with_unsupported_params_for_int4(mode, params): with pytest.raises(AttributeError): From ee648777dcb951f4c7bdadd3997680a5083645a7 Mon Sep 17 00:00:00 2001 From: Aleksandr Suslov Date: Wed, 4 Sep 2024 13:25:22 +0400 Subject: [PATCH 02/68] fix for INT4_ASYM --- nncf/quantization/algorithms/weight_compression/gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index b1101916da3..bd6518c86ad 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -273,7 +273,7 @@ def _quantize_weights( wc_params.compression_config, ) scales.append(scale.squeeze(axis=1)) - zero_points.append(zero_point) + zero_points.append(zero_point if zero_point is None else zero_point.squeeze(axis=1)) else: scale, zero_point = calculate_integer_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], From 2fc8f9cd1f93b43ccb65e31a4795cd93761bbeba Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 9 May 2025 18:48:42 +0200 Subject: [PATCH 03/68] Draft. 
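This draft replaces the separate precomputed_scales and precomputed_zero_points dictionaries with a single mapping from weight name to CompressedWeight: GPTQ and ScaleEstimation now return dict[str, CompressedWeight], the dataclass itself moves from weight_lowering.py to common.py, and transform_model/compress_weight consume the combined object. A rough sketch of the intended flow, using only the signatures visible in the hunks below (the new common.py module is not part of this diff, so its exact field list is assumed from the call sites):

    # weight name -> CompressedWeight(tensor, scale, zero_point, ...)
    compressed_weights = scale_estimation_algo.apply(model, graph, all_weight_params, statistics)
    backend_entity.transform_model(
        model,
        graph,
        all_weight_params,
        compressed_weights=compressed_weights,
    )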
--- .../weight_compression/algorithm.py | 10 ++++---- .../algorithms/weight_compression/backend.py | 4 ++-- .../algorithms/weight_compression/gptq.py | 11 ++++----- .../weight_compression/onnx_backend.py | 7 +++--- .../weight_compression/openvino_backend.py | 4 ++-- .../weight_compression/scale_estimation.py | 12 ++++++---- .../weight_compression/weight_lowering.py | 23 ++++--------------- 7 files changed, 27 insertions(+), 44 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 9ac54e144b9..05ffe54b725 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -649,13 +649,12 @@ def apply( # del is used to prematurely mark non-necessary data as free for garbage collection del self.awq_algo - scales = {} - zero_points = {} + compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" if self._gptq: del statistics - model, scales, zero_points = self._gptq_algo.apply( + model, compressed_weights = self._gptq_algo.apply( model=model, graph=graph, dataset=dataset, @@ -664,7 +663,7 @@ def apply( ) else: if self._scale_estimation: - scales, zero_points = self._scale_estimation_algo.apply( + compressed_weights = self._scale_estimation_algo.apply( model=model, graph=graph, all_weight_params=all_weight_params, @@ -687,8 +686,7 @@ def apply( model, graph, track(all_weight_params, description=description, weights=all_weight_sizes), - scales, - zero_points, + compressed_weights, lora_correction_algo, self._compression_format, self._advanced_parameters, diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 62d0745a0f4..2d928ff2908 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -26,6 +26,7 @@ from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic from nncf.parameters import CompressionFormat from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.tensor import Tensor @@ -148,8 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 76674fd9288..bcca525bd75 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -23,6 +23,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine from nncf.quantization.algorithms.weight_compression.backend import 
WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation @@ -85,7 +86,7 @@ def apply( weight_compression_parameters: list[WeightCompressionParameters], statistic_points: Optional[StatisticPointsContainer] = None, backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> tuple[TModel, dict[str, Tensor], dict[str, Tensor]]: + ) -> tuple[TModel, dict[str, CompressedWeight]]: """ Applies the GPTQ algorithm to quantize the weights of the given model. @@ -101,8 +102,7 @@ def apply( if self._backend_entity is None: self._set_backend_entity(model) - scales = {} - zero_points = {} + res = {} target_nodes = [] target_nodes_wc_params_map = {} @@ -125,10 +125,9 @@ def apply( _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) - scales[wc_params.weight_name] = scale - zero_points[wc_params.weight_name] = zero_point + res[wc_params.weight_name] = CompressedWeight(None, scale, zero_point, None) - return model, scales, zero_points + return model, res def get_statistic_points( self, diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index a962cf163bc..bef2160aa7e 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -43,6 +43,7 @@ from nncf.parameters import CompressionFormat from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -191,8 +192,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -206,8 +206,7 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization compressed_weight, scale, zero_point = self._preprocess_compressed_weight_shapes( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py 
b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 7c1838eb8d2..5df95ff40aa 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -53,6 +53,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error @@ -282,8 +283,7 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: LoraCorrectionAlgorithm = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 4aea4633ebb..3ee49a2bd83 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -21,6 +21,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error @@ -99,7 +100,7 @@ def apply( all_weight_params: list[WeightCompressionParameters], statistics: dict[str, WCTensorStatistic], backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> tuple[dict[str, Tensor], dict[str, Tensor]]: + ) -> dict[str, CompressedWeight]: """ Estimates better scale for the int4 nodes in the model. 
Minimizes per-group difference between floating point MatMul and @@ -119,7 +120,7 @@ def apply( self._backend_entity = backend_entity if self._backend_entity is None: self._set_backend_entity(model) - scales, zero_points = dict(), dict() + res = dict() invalid_node_names = [] first_caught_error = None @@ -129,7 +130,7 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in statistics: - scales[weight_name] = None + res[weight_name] = CompressedWeight() continue stats = statistics[node_name] @@ -142,7 +143,7 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) try: - scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( + scale, zero_point = self.calculate_quantization_params( stats, weight, wp.reduction_axes, @@ -152,6 +153,7 @@ def apply( self._scale_steps, self._weight_penalty, ) + res[weight_name] = CompressedWeight(None, scale, zero_point, None) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) @@ -159,7 +161,7 @@ def apply( if first_caught_error: handle_invalid_group_size_error(first_caught_error, invalid_node_names) - return scales, zero_points + return res @staticmethod def calculate_quantization_params( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index f72a05193b9..0ce1c316746 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -9,7 +9,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from dataclasses import dataclass from typing import Optional, Union import numpy as np @@ -19,6 +18,7 @@ from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor @@ -72,22 +72,6 @@ ) -@dataclass -class CompressedWeight: - """ - Compressed weight and decompression parameters. - - :param tensor: The tensor with compressed weight. - :param scale: The decompression scale, in practice it is dequantization scale for the INT quantization. - :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 - in the non-compression realm. Applicable for INT quantization. - """ - - tensor: Tensor - scale: Tensor - zero_point: Optional[Tensor] = None - - def reshape_weight_for_grouped_quantization( weight: Tensor, reduction_axes: ReductionAxes, group_size: int ) -> tuple[Tensor, int]: @@ -386,8 +370,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, + compressed_weight: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight using compression configuration. @@ -399,6 +382,8 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. 
:return: The compressed weight and decompression parameters as instance of CompressedWeight """ + + precomputed_scale, precomputed_zero_point = compressed_weight.scale, compressed_weight.zero_point if compressed_weight else (None, None) if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() From 7c6795e00d4ba2d3f55c8060f4cd8c1e160bb43d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 9 May 2025 19:46:36 +0200 Subject: [PATCH 04/68] Draft. --- .../algorithms/weight_compression/onnx_backend.py | 4 ++-- .../weight_compression/openvino_backend.py | 14 ++++---------- .../algorithms/weight_compression/torch_backend.py | 8 +++----- .../weight_compression/torch_fx_backend.py | 7 +++---- .../weight_compression/weight_lowering.py | 6 ++++-- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index bef2160aa7e..6d3633284cc 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -43,8 +43,8 @@ from nncf.parameters import CompressionFormat from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight @@ -206,7 +206,7 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization compressed_weight, scale, zero_point = self._preprocess_compressed_weight_shapes( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5df95ff40aa..c85c7ea8b1f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -218,8 +218,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - layer_scales: Optional[Tensor] = None, - layer_zero_points: Optional[Tensor] = None, + compressed_weight: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -245,8 +244,7 @@ def _create_compression_subgraph( weight, reduction_axes, compression_config, - layer_scales, - layer_zero_points, + compressed_weight, ) compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name @@ -308,10 +306,7 @@ def transform_model( should_add_convert_node = True break - layer_scales = None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name) - layer_zero_points = ( - None if 
precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) - ) + compressed_weight = None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -321,8 +316,7 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - layer_scales=layer_scales, - layer_zero_points=layer_zero_points, + compressed_weight=compressed_weight, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 50f765c35c3..79869c49d46 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -45,10 +45,10 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType @@ -432,8 +432,7 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: LoraCorrectionAlgorithm = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -470,8 +469,7 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 2650f16600c..80597096346 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -40,6 +40,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from 
nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -189,8 +190,7 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + compressed_weights: dict[str, CompressedWeight] = None, lora_correction_algo: LoraCorrectionAlgorithm = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -218,8 +218,7 @@ def transform_model( weight, wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0ce1c316746..045dbc418eb 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -382,8 +382,10 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - - precomputed_scale, precomputed_zero_point = compressed_weight.scale, compressed_weight.zero_point if compressed_weight else (None, None) + precomputed_scale, precomputed_zero_point = ( + compressed_weight.scale, + compressed_weight.zero_point if compressed_weight else (None, None), + ) if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() From 1dcdd7598e7b13a1bc94fac9eb649f004dfc5a75 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 12 May 2025 13:01:30 +0200 Subject: [PATCH 05/68] Draft for codebook. --- nncf/parameters.py | 1 + .../algorithms/weight_compression/codebook.py | 169 ++++++++++++++++++ .../weight_compression/weight_lowering.py | 28 +-- 3 files changed, 187 insertions(+), 11 deletions(-) create mode 100644 nncf/quantization/algorithms/weight_compression/codebook.py diff --git a/nncf/parameters.py b/nncf/parameters.py index 92b158fa9a6..6a6e6883ab4 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -94,6 +94,7 @@ class CompressWeightsMode(StrEnum): NF4 = "nf4" INT8 = "int8" # Deprecated mode E2M1 = "e2m1" + CODEBOOK = "codebook" @api(canonical_alias="nncf.CompressionFormat") diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py new file mode 100644 index 00000000000..a4ad22498cb --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -0,0 +1,169 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import Optional, TypeVar
+
+import nncf
+from nncf.common.graph.graph import NNCFGraph
+from nncf.common.logging.track_progress import track
+from nncf.common.utils.backend import BackendType
+from nncf.common.utils.backend import get_backend
+from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
+from nncf.quantization.algorithms.weight_compression.common import CompressedWeight
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_scale
+from nncf.tensor import Tensor
+from nncf.tensor import TensorDataType
+from nncf.tensor import functions as fns
+
+TModel = TypeVar("TModel")
+
+
+class Codebook:
+    """
+    Codebook estimation algorithm implementation.
+    """
+
+    def __init__(
+        self,
+        initial_codebook: Tensor,
+    ):
+        """
+        :param initial_codebook: codebook for compression.
+        """
+        super().__init__()
+        self._initial_codebook = initial_codebook.flatten()
+
+    @property
+    def available_backends(self) -> list[BackendType]:
+        return [BackendType.OPENVINO]
+
+    def _set_backend_entity(self, model: TModel) -> None:
+        """
+        Creates a helper class with a backend-specific logic of the algorithm.
+
+        :param model: Backend-specific input model.
+        """
+        model_backend = get_backend(model)
+        if model_backend == BackendType.OPENVINO:
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+
+            self._backend_entity = OVWeightCompressionAlgoBackend(model)
+        else:
+            msg = (
+                "Cannot return backend-specific Codebook entity because"
+                f" {model_backend.value} is not supported!"
+            )
+            raise nncf.UnsupportedBackendError(msg)
+
+    def apply(
+        self,
+        model: TModel,
+        graph: NNCFGraph,
+        all_weight_params: list[WeightCompressionParameters],
+        backend_entity: Optional[WeightCompressionAlgoBackend] = None,
+    ) -> dict[str, CompressedWeight]:
+        """
+        Applies codebook (look-up table) compression to the weights of the supported nodes.
+        Each weight is normalized by a per-group scale, and every normalized value is
+        replaced by the index of the nearest codebook entry, so the weight is stored as
+        a tensor of indexes together with the shared codebook and the scales required
+        for decompression.
+
+        :param model: Model for applying algorithm.
+        :param graph: Model graph.
+        :param all_weight_params: List of all weight parameters.
+        :param backend_entity: Weight compression algorithm backend.
+        :return: A dictionary mapping weight names to compressed weights (codebook indexes, scale and codebook).
+        """
+        self._backend_entity = backend_entity
+        if self._backend_entity is None:
+            self._set_backend_entity(model)
+
+        res = {}
+        invalid_node_names = []
+        first_caught_error = None
+        for wp in track(all_weight_params, description="Applying Codebook Compression"):
+            weight_name = wp.weight_name
+            config = wp.compression_config
+
+            weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph)
+            if len(weight_data) != 1:  # not supported by the algorithm
+                continue
+            _, weight_port_id = weight_data[0]
+
+            weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)
+
+            try:
+                indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config)
+                res[weight_name] = CompressedWeight(indexes, scale, None, codebook)
+            except nncf.InvalidGroupSizeError as error:
+                first_caught_error = error
+                invalid_node_names.append(wp.node_with_weight.node_name)
+
+        if first_caught_error:
+            handle_invalid_group_size_error(first_caught_error, invalid_node_names)
+
+        return res
+
+    def calculate_quantization_params(
+        self,
+        weight: Tensor,
+        reduction_axes: tuple[int, ...],
+        config: WeightCompressionConfig,
+    ) -> tuple[Tensor, Tensor, Tensor]:
+        """
+        Calculates the codebook quantization parameters for the given weight tensor.
+        The weight is normalized by a per-group scale derived from the weight values
+        and the codebook range, and every normalized value is then mapped to the
+        index of the closest codebook entry. The returned indexes, the shared
+        codebook and the per-group scales are sufficient to reconstruct an
+        approximation of the original weight.
+
+        The computation consists of two steps:
+        1. Per-group scale calculation and weight normalization.
+        2. Nearest-neighbor search of every normalized value in the codebook.
+
+        :param weight: The weight tensor that is being quantized.
+        :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization.
+        :param config: Configuration parameters for the weight compression, including quantization settings.
+        :return: A tuple of the codebook indexes, the per-group scales and the codebook itself.
+ """ + reduction_axis = reduction_axes[0] + + weight = weight.astype(TensorDataType.float32) + + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 + + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size + + max_val = fns.max(fns.abs(weight)) + norm_weight, scale = calculate_normalized_weight_and_scale( + weight, reduction_axis, cur_config.group_size, max_val=max_val + ) + + orig_shape = norm_weight.shape + + norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) + + dist = (norm_weight - fns.unsqueeze(self._initial_codebook, 0)) ** 2 + + indexes = fns.argmin(dist, axis=1)[0] + indexes = fns.reshape(indexes, orig_shape) + + return indexes, scale, self._initial_codebook diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 045dbc418eb..b209e3a93a7 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -104,7 +104,7 @@ def reshape_weight_for_grouped_quantization( return reshaped_weight, reduction_axes -def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes) -> Tensor: +def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val=1.0) -> Tensor: """ Calculates the scale for nf4 quantization. @@ -115,7 +115,7 @@ def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes) -> Tensor if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) + scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) / max_val # NOTE: adding machine epsilon to avoid division by zero eps = fns.finfo(weight).eps @@ -134,7 +134,7 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= :param to_e8m0: Defines convert scale to e8m0 or not. :return: Scale tensor of float32 type for e2m1 quantization. """ - scale = calculate_nf4_scale(weight, reduction_axes) / max_val + scale = calculate_nf4_scale(weight, reduction_axes, max_val) scale = fns.log2(scale) scale = fns.ceil(scale) @@ -219,12 +219,13 @@ def do_nf4_dequantization(nf4_weight: Tensor, scale: Tensor, reduction_axis: int return decompressed_weight -def calculate_normalized_weight_and_fp4_scale( +def calculate_normalized_weight_and_scale( weight: Tensor, reduction_axes: ReductionAxes, group_size: int = -1, precomputed_scale: Tensor = None, mode: CompressWeightsMode = CompressWeightsMode.NF4, + max_val=1.0, ) -> tuple[Tensor, Tensor]: """ Calculates scale for fp4 (nf4, e2m1) quantization and normalizes weights by the scale. @@ -235,9 +236,10 @@ def calculate_normalized_weight_and_fp4_scale( :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. :param precomputed_scale: Precomputed scale. + :parm max_val: Max value of compressed type for normalization. :return: Normalized weight tensor of float32 type and nf4 scale tensor of float32 type. 
""" - assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) @@ -245,10 +247,14 @@ def calculate_normalized_weight_and_fp4_scale( # weights are reshaped: [a1, r, a2] -> [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - if mode == CompressWeightsMode.NF4: - scale = calculate_nf4_scale(weight, reduction_axes) if precomputed_scale is None else precomputed_scale - if mode == CompressWeightsMode.E2M1: + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK]: + scale = calculate_nf4_scale(weight, reduction_axes, max_val) if precomputed_scale is None else precomputed_scale + elif mode == CompressWeightsMode.E2M1: scale = calculate_e2m1_scale(weight, reduction_axes) if precomputed_scale is None else precomputed_scale + else: + msg = f"Unsupported mode {mode} for weight compression." + raise ValueError(msg) + norm_weight = calculate_normalized_weight(weight, scale) return norm_weight, scale @@ -383,14 +389,14 @@ def compress_weight( :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( - compressed_weight.scale, - compressed_weight.zero_point if compressed_weight else (None, None), + (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) ) + if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() - compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( + compressed_weight, scale = calculate_normalized_weight_and_scale( weight, reduction_axes, config.group_size, precomputed_scale, config.mode ) return CompressedWeight(compressed_weight, scale) From b870d8d9b242df262afe0859f6b7e0778ed8652b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 13 May 2025 14:00:13 +0200 Subject: [PATCH 06/68] Compression for default codebook. --- .ci/cspell_dict.txt | 3 +- nncf/openvino/graph/node_utils.py | 23 +++++++++ nncf/quantization/advanced_parameters.py | 37 ++++++++++++++ .../weight_compression/algorithm.py | 15 ++++++ .../algorithms/weight_compression/codebook.py | 24 ++++++--- .../weight_compression/openvino_backend.py | 50 ++++++++++++------- .../weight_compression/scale_estimation.py | 4 +- 7 files changed, 129 insertions(+), 27 deletions(-) diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt index 8d7bf519804..2dd19aafa41 100644 --- a/.ci/cspell_dict.txt +++ b/.ci/cspell_dict.txt @@ -72,6 +72,7 @@ ckpt clusterization cmap cnode +codebook coeffs concr confs @@ -492,4 +493,4 @@ yolov yscale yujie yury -zfnet \ No newline at end of file +zfnet diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 32ed821b7d1..96f4958f959 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -685,3 +685,26 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = return opset.constant(x.data, name=name, shared_memory=True) const = opset.constant(x.data, dtype=dtype, name=name) return const + + +def create_ov_codebook_subgraph( + codebook: Tensor, indexes: Tensor, dtype: ov.Type, codebook_dtype: ov.Type, name: Optional[str] = None +) -> op.Constant: + """ + Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. 
+ :param codebook: Codebook tensor. + :param indexes: Indexes tensor. + :param dtype: Data type of the indexes. + :param codebook_dtype: Data type of the codebook. + :param name: Optional name of the constant. + :return: OpenVINO subgraph. + """ + cobebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + if codebook_dtype != ov.Type.f16: + cobebook_const = opset.convert(cobebook_const, destination_type=ov.Type.f16) + codebook_indexes = opset.constant(indexes.data, dtype=dtype) + if dtype == ov.Type.u4: + codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) + + const = opset.gather(cobebook_const, codebook_indexes, 0, name=name) + return const diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 10f18b34eae..a041c8da25c 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -20,6 +20,8 @@ from enum import Enum from typing import Any, Optional, Union +import openvino.runtime as ov + import nncf from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule from nncf.common.quantization.structs import QuantizationScheme as QuantizationMode @@ -359,6 +361,40 @@ class AdvancedLoraCorrectionParameters: use_int8_adapters: bool = True +@api() +@dataclass +class AdvancedCodebookParameters: + """ + Contains advanced parameters for codebook compression algorithm. + :param codebook: The codebook (LUT) for the weight compression. + Applicable for vector quantization. + :type codebook: list[Any] + :param dts_type: The type of the codebook. + """ + + codebook: list[Any] = field( + default_factory=lambda: [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ] + ) + dst_type: Any = ov.Type.f8e4m3 + + @api() @dataclass class AdvancedCompressionParameters: @@ -390,6 +426,7 @@ class AdvancedCompressionParameters: lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) lora_adapter_rank: int = 256 backend_params: dict[str, Any] = field(default_factory=dict) + codebook_params: AdvancedCodebookParameters = field(default_factory=AdvancedCodebookParameters) @api() diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 05ffe54b725..631be6f17e2 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -38,6 +38,7 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ +from nncf.quantization.algorithms.weight_compression.codebook import Codebook from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -288,6 +289,7 @@ def __init__( self._advanced_parameters = ( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) + self._codebook = mode == CompressWeightsMode.CODEBOOK primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) @@ -320,6 
+322,12 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) + if self._codebook: + codebook_params = self._advanced_parameters.codebook_params + self._codebook_algo = Codebook( + initial_codebook=codebook_params.codebook, + dst_type=codebook_params.dst_type, + ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -652,6 +660,13 @@ def apply( compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" + if self._codebook: + compressed_weights = self._codebook_algo.apply( + model=model, + graph=graph, + all_weight_params=all_weight_params, + backend_entity=self._backend_entity, + ) if self._gptq: del statistics model, compressed_weights = self._gptq_algo.apply( diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index a4ad22498cb..17f30a34cd7 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -10,13 +10,14 @@ # limitations under the License. from copy import deepcopy -from typing import Optional, TypeVar +from typing import Any, Optional, TypeVar import nncf from nncf.common.graph.graph import NNCFGraph from nncf.common.logging.track_progress import track from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend +from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig @@ -38,12 +39,14 @@ class Codebook: def __init__( self, initial_codebook: Tensor, + dst_type: Any, ): """ :param initial_codebook: codebook for compression. 
""" super().__init__() - self._initial_codebook = initial_codebook.flatten() + self._initial_codebook = initial_codebook + self._dst_type = dst_type @property def available_backends(self) -> list[BackendType]: @@ -95,7 +98,10 @@ def apply( invalid_node_names = [] first_caught_error = None for wp in track(all_weight_params, description="Applying Codebook Compression"): + if wp.compression_config.mode != CompressWeightsMode.CODEBOOK: + continue weight_name = wp.weight_name + print(weight_name) config = wp.compression_config weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) @@ -107,7 +113,7 @@ def apply( try: indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config) - res[weight_name] = CompressedWeight(indexes, scale, None, codebook) + res[weight_name] = CompressedWeight(indexes, scale, None, (codebook, self._dst_type)) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) @@ -144,6 +150,10 @@ def calculate_quantization_params( weight = weight.astype(TensorDataType.float32) + codebook = fns.tensor( + self._initial_codebook, backend=weight.backend, dtype=TensorDataType.float32, device=weight.device + ) + if reduction_axis == 0: weight = fns.transpose(weight) reduction_axis = 1 @@ -152,7 +162,7 @@ def calculate_quantization_params( cur_config = deepcopy(config) cur_config.group_size = group_size - max_val = fns.max(fns.abs(weight)) + max_val = fns.max(fns.abs(codebook)) norm_weight, scale = calculate_normalized_weight_and_scale( weight, reduction_axis, cur_config.group_size, max_val=max_val ) @@ -161,9 +171,9 @@ def calculate_quantization_params( norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - dist = (norm_weight - fns.unsqueeze(self._initial_codebook, 0)) ** 2 + dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 - indexes = fns.argmin(dist, axis=1)[0] + indexes = dist.data.argmin(-1) indexes = fns.reshape(indexes, orig_shape) - return indexes, scale, self._initial_codebook + return indexes, scale, codebook diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c85c7ea8b1f..f18dc045537 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -32,6 +32,7 @@ from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer from nncf.openvino.graph.node_utils import convert_op +from nncf.openvino.graph.node_utils import create_ov_codebook_subgraph from nncf.openvino.graph.node_utils import create_ov_const_from_tensor from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor @@ -234,31 +235,46 @@ def _create_compression_subgraph( compression_dtype = ov.Type.i8 elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 + elif compression_config.mode == CompressWeightsMode.CODEBOOK: + if compressed_weight is None or not compressed_weight.is_codebook(): + msg = "Codebook compression requires pre-computed codebook." + raise nncf.ValidationError(msg) + compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 4 else ov.Type.u4 else: msg = f"{compression_config.mode.value} is not supported." 
raise nncf.ParameterNotSupportedError(msg) original_shape = weight.shape - with disable_results_caching(OV_MODEL_CACHE): - compressed_weight = compress_weight( - weight, - reduction_axes, - compression_config, - compressed_weight, - ) - compressed_const = create_ov_const_from_tensor( - compressed_weight.tensor, compression_dtype, name=const_node_name - ) - converted_const = opset.convert(compressed_const, ov.Type.f16) - if compressed_weight.zero_point is not None: - zero_point_const = create_ov_const_from_tensor( - compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" + if compression_config.mode == CompressWeightsMode.CODEBOOK: + converted_const = create_ov_codebook_subgraph( + codebook=compressed_weight.codebook[0], + indexes=compressed_weight.tensor, + dtype=compression_dtype, + codebook_dtype=compressed_weight.codebook[1], + name=const_node_name, ) - zero_point_const = opset.convert(zero_point_const, ov.Type.f16) - converted_const = opset.subtract( - converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" + else: + with disable_results_caching(OV_MODEL_CACHE): + compressed_weight = compress_weight( + weight, + reduction_axes, + compression_config, + compressed_weight, + ) + compressed_const = create_ov_const_from_tensor( + compressed_weight.tensor, compression_dtype, name=const_node_name ) + converted_const = opset.convert(compressed_const, ov.Type.f16) + + if compressed_weight.zero_point is not None: + zero_point_const = create_ov_const_from_tensor( + compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" + ) + zero_point_const = opset.convert(zero_point_const, ov.Type.f16) + converted_const = opset.subtract( + converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" + ) scale_const = create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") scale_const = convert_op(scale_const, ov.Type.f16) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 3ee49a2bd83..5f05db2cfd8 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -25,7 +25,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization @@ -215,7 +215,7 @@ def calculate_quantization_params( original_weight = fns.zeros_like(weight) + weight if config.mode == CompressWeightsMode.NF4: - norm_weight, scale = calculate_normalized_weight_and_fp4_scale( + norm_weight, scale = calculate_normalized_weight_and_scale( original_weight, reduction_axis, cur_config.group_size ) compressed_weights = do_nf4_quantization(norm_weight, scale, 
is_normalized_weight=True) From ac26b8aec3d6f74a65669d36942cb9b1d7d089d0 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 13 May 2025 14:21:50 +0200 Subject: [PATCH 07/68] Reverted change in spell check. --- .ci/cspell_dict.txt | 1 - nncf/openvino/graph/node_utils.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt index 2dd19aafa41..74d2f7ca9ce 100644 --- a/.ci/cspell_dict.txt +++ b/.ci/cspell_dict.txt @@ -72,7 +72,6 @@ ckpt clusterization cmap cnode -codebook coeffs concr confs diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 96f4958f959..db6f344fc23 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,12 +699,12 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. """ - cobebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) if codebook_dtype != ov.Type.f16: - cobebook_const = opset.convert(cobebook_const, destination_type=ov.Type.f16) + codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype) if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) - const = opset.gather(cobebook_const, codebook_indexes, 0, name=name) + const = opset.gather(codebook_const, codebook_indexes, 0, name=name) return const From 16d7a9e5ea3487fda500d8989c015ab9af3b75f9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 13 May 2025 16:53:42 +0200 Subject: [PATCH 08/68] Fixed compression to 4bit for codebook indexes. --- nncf/openvino/graph/node_utils.py | 4 ++-- .../quantization/algorithms/weight_compression/algorithm.py | 4 ++-- nncf/quantization/algorithms/weight_compression/codebook.py | 5 +++-- .../algorithms/weight_compression/openvino_backend.py | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index db6f344fc23..1a4fcb06303 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -700,11 +700,11 @@ def create_ov_codebook_subgraph( :return: OpenVINO subgraph. 
""" codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) - if codebook_dtype != ov.Type.f16: - codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype) if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) const = opset.gather(codebook_const, codebook_indexes, 0, name=name) + if codebook_dtype != ov.Type.f16: + const = opset.convert(const, destination_type=ov.Type.f16) return const diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 631be6f17e2..30d0ae719ed 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -38,7 +38,7 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ -from nncf.quantization.algorithms.weight_compression.codebook import Codebook +from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -324,7 +324,7 @@ def __init__( ) if self._codebook: codebook_params = self._advanced_parameters.codebook_params - self._codebook_algo = Codebook( + self._codebook_algo = CodebookCompression( initial_codebook=codebook_params.codebook, dst_type=codebook_params.dst_type, ) diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index 17f30a34cd7..880062c04e6 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -19,6 +19,7 @@ from nncf.common.utils.backend import get_backend from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.common import Codebook from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters @@ -31,7 +32,7 @@ TModel = TypeVar("TModel") -class Codebook: +class CodebookCompression: """ Codebook estimation algorithm implementation. 
""" @@ -113,7 +114,7 @@ def apply( try: indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config) - res[weight_name] = CompressedWeight(indexes, scale, None, (codebook, self._dst_type)) + res[weight_name] = CompressedWeight(indexes, scale, None, Codebook(codebook, self._dst_type)) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index f18dc045537..f48bff5519b 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -239,7 +239,7 @@ def _create_compression_subgraph( if compressed_weight is None or not compressed_weight.is_codebook(): msg = "Codebook compression requires pre-computed codebook." raise nncf.ValidationError(msg) - compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 4 else ov.Type.u4 + compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 else: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) @@ -248,10 +248,10 @@ def _create_compression_subgraph( if compression_config.mode == CompressWeightsMode.CODEBOOK: converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook[0], + codebook=compressed_weight.codebook.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=compressed_weight.codebook[1], + codebook_dtype=compressed_weight.codebook.dst_type, name=const_node_name, ) else: From 87280cc4067c65a81106f3b6c22fa25311281c8a Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 15 May 2025 14:01:39 +0200 Subject: [PATCH 09/68] Added tests and example. --- .../openvino/smollm2_360m_codebook/main.py | 114 ++++++++++++++++++ nncf/parameters.py | 1 + nncf/quantization/quantize_model.py | 19 +-- .../quantization/test_weights_compression.py | 75 ++++++++++++ 4 files changed, 201 insertions(+), 8 deletions(-) create mode 100644 examples/llm_compression/openvino/smollm2_360m_codebook/main.py diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py new file mode 100644 index 00000000000..7a37c0e3d42 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import openvino as ov +from datasets import load_dataset +from optimum.intel.openvino import OVModelForCausalLM +from transformers import AutoTokenizer + +import nncf + + +def transform_fn(data, model, tokenizer): + tokenized_text = tokenizer(data["text"], return_tensors="np") + input_ids = tokenized_text["input_ids"] + attention_mask = tokenized_text["attention_mask"] + + inputs = {} + inputs["input_ids"] = input_ids + inputs["attention_mask"] = tokenized_text["attention_mask"] + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + + # The magic forms KV cache as model inputs + batch_size = input_ids.shape[0] + for input_name in model.key_value_input_names: + model_inputs = model.model.input(input_name) + shape = model_inputs.get_partial_shape() + shape[0] = batch_size + if shape[2].is_dynamic: + shape[2] = 0 + else: + shape[1] = 0 + inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) + + inputs["position_ids"] = position_ids + return inputs + + +def generate_answers(questions, model, tokenizer, max_new_tokens=50): + messages = [ + {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, + {"role": "user", "content": "What is the capital of Spain?"}, + {"role": "assistant", "content": "Madrid."}, + ] + answers_by_questions = {} + model.request = None + + for question in questions: + messages.append({"role": "user", "content": question}) + input_ids = tokenizer.apply_chat_template( + messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" + ).to(device=model.device) + input_len = len(input_ids[0]) + + output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] + answer = tokenizer.decode(output[input_len:], skip_special_tokens=True) + answers_by_questions[question] = answer + messages.append({"role": "assistant", "content": answer}) + + model.request = None + return answers_by_questions + + +def main(): + MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" + OUTPUT_DIR = "smollm2_360m_compressed_codebook" + + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + # Filtering to remove empty samples from the dataset + dataset = dataset.filter(lambda example: len(example["text"]) > 1) + + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + model = OVModelForCausalLM.from_pretrained( + MODEL_ID, + export=True, + load_in_8bit=False, + compile=False, + stateful=False, + ov_config={"INFERENCE_PRECISION_HINT": "f32"}, + ) + + questions = [ + "What is the capital of France?", + "What is the highest peak in the Alps?", + "What is the largest city in Canada?", + "What is the most visited city in Japan?", + ] + + answers_by_questions = generate_answers(questions, model, tokenizer) + print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + + model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + model.save_pretrained(OUTPUT_DIR) + tokenizer.save_pretrained(OUTPUT_DIR) + + model = OVModelForCausalLM.from_pretrained( + OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + ) + answers_by_questions = generate_answers(questions, model, tokenizer) + print(f"Optimized model outputs:\n{answers_by_questions}\n") + return answers_by_questions + + +if __name__ == "__main__": + main() diff --git a/nncf/parameters.py b/nncf/parameters.py index 6a6e6883ab4..f1bf44dcb91 100644 --- a/nncf/parameters.py +++ 
b/nncf/parameters.py @@ -85,6 +85,7 @@ class CompressWeightsMode(StrEnum): :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + :param CODEBOOK: Codebook (LUT) quantization format. """ INT8_SYM = "int8_sym" diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index c8921a07063..a22ea74dec8 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -516,8 +516,8 @@ def compress_weights( from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "Torch backend does not support NF4 and E2M1 modes for weight compression." + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -560,8 +560,8 @@ def compress_weights( compress_weights_impl as fx_compression_weights_impl, ) - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "Torch backend does not support NF4 and E2M1 modes for weight compression." + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { @@ -597,8 +597,11 @@ def compress_weights( msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) - if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: - msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is E2M1." + if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + ]: + msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: @@ -614,8 +617,8 @@ def compress_weights( elif backend == BackendType.ONNX: from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "ONNX backend does not support NF4 and E2M1 modes for weight compression." + if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + msg = "ONNX backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." 
raise nncf.ParameterNotSupportedError(msg) options = { diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5935b2265b7..fd48852a19e 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -36,6 +36,7 @@ from nncf.parameters import BackupMode from nncf.parameters import CompressionFormat from nncf.quantization import compress_weights +from nncf.quantization.advanced_parameters import AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams @@ -696,6 +697,20 @@ def test_raise_error_with_unsupported_params_for_e2m1(algo): compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.E2M1, **{algo: True}) +@pytest.mark.parametrize( + "algo", + ( + "lora_correction", + "awq", + "scale_estimation", + "gptq", + ), +) +def test_raise_error_with_unsupported_params_for_codebook(algo): + with pytest.raises(nncf.ParameterNotSupportedError): + compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.CODEBOOK, **{algo: True}) + + @pytest.mark.parametrize("mode", INT4_NF4_MODES) @pytest.mark.parametrize( "algo", @@ -1023,6 +1038,66 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): assert ref_e8m0_nodes == names_e8m0 +@pytest.mark.parametrize( + ("mode", "all_layers", "ratio", "ref_ids"), + ( + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), + ), +) +def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): + model = SequentialMatmulModel().ov_model + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + ratio=ratio, + group_size=1, + all_layers=all_layers, + sensitivity_metric=mode, + ) + names_codebook = { + op.get_friendly_name() + for op in compressed_model.get_ordered_ops() + if op.get_element_type() == ov.Type.f8e4m3 and not op.get_friendly_name().startswith("Const") + } + ref_codebook_nodes = {f"weights_{i}" for i in ref_ids} + + assert ref_codebook_nodes == names_codebook + + +@pytest.mark.parametrize( + ("codebook", "dst_type", "n_layers"), + ( + ([i for i in range(-8, 8)], ov.Type.i4, 2 * 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 2 * 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 2 * 5), + ), +) +def test_codebook(codebook, dst_type, n_layers): + model = SequentialMatmulModel().ov_model + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + ratio=1.0, + group_size=1, + all_layers=True, + advanced_parameters=AdvancedCompressionParameters( + codebook_params=AdvancedCodebookParameters(codebook=codebook, dst_type=dst_type) + ), + ) + names_codebook = [ + op.get_friendly_name() for op in compressed_model.get_ordered_ops() if 
op.get_element_type() == dst_type + ] + + assert len(names_codebook) == n_layers + + @pytest.mark.parametrize( ("mode", "data"), ( From 4ab1470c68035ffedd64214fc5c2efdc93816459 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 15 May 2025 16:13:28 +0200 Subject: [PATCH 10/68] Added file with compression data structures. --- .../algorithms/weight_compression/common.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 nncf/quantization/algorithms/weight_compression/common.py diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py new file mode 100644 index 00000000000..ff1c737ff19 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Any +from nncf.tensor import Tensor + + +@dataclass +class Codebook: + """ + Codebook parameters for weight compression. + :param codebook: The initial codebook for compression. + :param dst_type: The destination type for the codebook. + """ + codebook: Optional[Tensor] = None + dst_type: Optional[Any] = None + + +@dataclass +class CompressedWeight: + """ + Compressed weight and decompression parameters. + + :param tensor: The tensor with compressed weight. + :param scale: The decompression scale, in practice it is dequantization scale for the quantization. + :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 + in the non-compression realm. Applicable for INT quantization. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization + """ + + tensor: Optional[Tensor] = None + scale: Optional[Tensor] = None + zero_point: Optional[Tensor] = None + codebook: Optional[Codebook] = None + + def is_codebook(self): + """ + Check if the compressed weight is a codebook. + + :return: True if the compressed weight is a codebook, False otherwise. + """ + return not (self.codebook is None or self.tensor is None or self.scale is None) From 6ccd252b343523f6d0db7e0efe12bbfdb5604755 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 15 May 2025 16:37:27 +0200 Subject: [PATCH 11/68] Removed debug information. 
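Besides dropping the debug print, the codebook path now exchanges its results through the `common.py` containers added in the previous patch. A minimal sketch of how they fit together (the tensors below are invented illustration values, not taken from the patches):

```python
import numpy as np

from nncf.quantization.algorithms.weight_compression.common import Codebook
from nncf.quantization.algorithms.weight_compression.common import CompressedWeight
from nncf.tensor import Tensor

codebook_values = Tensor(np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32))
indexes = Tensor(np.array([[0, 4], [2, 3]], dtype=np.uint8))   # positions in the codebook
scale = Tensor(np.array([[1.5], [0.25]], dtype=np.float32))    # per-group dequantization scale

compressed = CompressedWeight(
    tensor=indexes,      # codebook indexes instead of quantized values
    scale=scale,
    zero_point=None,     # not used for codebook compression
    codebook=Codebook(codebook=codebook_values, dst_type=None),
)
assert compressed.is_codebook()  # True only when tensor, scale and codebook are all set
```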
--- nncf/quantization/advanced_parameters.py | 2 +- nncf/quantization/algorithms/weight_compression/codebook.py | 1 - nncf/quantization/algorithms/weight_compression/common.py | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index a041c8da25c..91f95d60303 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -20,7 +20,7 @@ from enum import Enum from typing import Any, Optional, Union -import openvino.runtime as ov +import openvino as ov import nncf from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index 903bbbb47ba..b1dec46275a 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -102,7 +102,6 @@ def apply( if wp.compression_config.mode != CompressWeightsMode.CODEBOOK: continue weight_name = wp.weight_name - print(weight_name) config = wp.compression_config weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py index ff1c737ff19..a172899374f 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -10,7 +10,8 @@ # limitations under the License. from dataclasses import dataclass -from typing import Optional, Any +from typing import Any, Optional + from nncf.tensor import Tensor @@ -21,6 +22,7 @@ class Codebook: :param codebook: The initial codebook for compression. :param dst_type: The destination type for the codebook. """ + codebook: Optional[Tensor] = None dst_type: Optional[Any] = None From 22308e931b5db4a6b2acda4a5c274a913815b75e Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 10:32:21 +0200 Subject: [PATCH 12/68] Added custom codebook to example. --- .../openvino/smollm2_360m_codebook/README.md | 26 ++++++ .../openvino/smollm2_360m_codebook/main.py | 85 +++++++++++-------- .../smollm2_360m_codebook/requirements.txt | 4 + nncf/__init__.py | 1 + nncf/version.py | 2 +- 5 files changed, 80 insertions(+), 38 deletions(-) create mode 100644 examples/llm_compression/openvino/smollm2_360m_codebook/README.md create mode 100644 examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/README.md b/examples/llm_compression/openvino/smollm2_360m_codebook/README.md new file mode 100644 index 00000000000..c82045d6261 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/README.md @@ -0,0 +1,26 @@ +# Large Language Models FP8 Compression Example + +This example demonstrates how to apply codebook compression to [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes. 
+ +## Prerequisites + +To use this example: + +- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate` +- Install dependencies: + +```bash +pip install -U pip +pip install -r requirements.txt +pip install ../../../../ +``` + +## Run Example + +To run example: + +```bash +python main.py +``` + +It will automatically download the dataset and baseline model and save the resulting model. diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 7a37c0e3d42..67fbfccb26c 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -9,42 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import openvino as ov -from datasets import load_dataset from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer import nncf -def transform_fn(data, model, tokenizer): - tokenized_text = tokenizer(data["text"], return_tensors="np") - input_ids = tokenized_text["input_ids"] - attention_mask = tokenized_text["attention_mask"] - - inputs = {} - inputs["input_ids"] = input_ids - inputs["attention_mask"] = tokenized_text["attention_mask"] - position_ids = np.cumsum(attention_mask, axis=1) - 1 - position_ids[attention_mask == 0] = 1 - - # The magic forms KV cache as model inputs - batch_size = input_ids.shape[0] - for input_name in model.key_value_input_names: - model_inputs = model.model.input(input_name) - shape = model_inputs.get_partial_shape() - shape[0] = batch_size - if shape[2].is_dynamic: - shape[2] = 0 - else: - shape[1] = 0 - inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) - - inputs["position_ids"] = position_ids - return inputs - - def generate_answers(questions, model, tokenizer, max_new_tokens=50): messages = [ {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, @@ -70,14 +41,39 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions -def main(): - MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" - OUTPUT_DIR = "smollm2_360m_compressed_codebook" +def default_codebook_example(MODEL_ID, OUTPUT_DIR): + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + model = OVModelForCausalLM.from_pretrained( + MODEL_ID, + export=True, + load_in_8bit=False, + compile=False, + stateful=False, + ov_config={"INFERENCE_PRECISION_HINT": "f32"}, + ) - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - # Filtering to remove empty samples from the dataset - dataset = dataset.filter(lambda example: len(example["text"]) > 1) + questions = [ + "What is the capital of France?", + "What is the highest peak in the Alps?", + "What is the largest city in Canada?", + "What is the most visited city in Japan?", + ] + answers_by_questions = generate_answers(questions, model, tokenizer) + print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + + model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + model.save_pretrained(OUTPUT_DIR) + tokenizer.save_pretrained(OUTPUT_DIR) + + model = OVModelForCausalLM.from_pretrained( + OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + ) + answers_by_questions = generate_answers(questions, 
model, tokenizer) + print(f"Optimized model outputs:\n{answers_by_questions}\n") + + +def custom_codebook_example(MODEL_ID, OUTPUT_DIR): tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = OVModelForCausalLM.from_pretrained( MODEL_ID, @@ -98,7 +94,15 @@ def main(): answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + codebook_params = nncf.AdvancedCodebookParameters([-8, -4, -2, -1, 0, 1, 2, 4, 8], ov.Type.i8) + + model.model = nncf.compress_weights( + model.model, + mode=nncf.CompressWeightsMode.CODEBOOK, + ratio=1.0, + group_size=64, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + ) model.save_pretrained(OUTPUT_DIR) tokenizer.save_pretrained(OUTPUT_DIR) @@ -107,7 +111,14 @@ def main(): ) answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") - return answers_by_questions + + +def main(): + MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" + OUTPUT_DIR = "smollm2_360m_compressed_codebook" + + default_codebook_example(MODEL_ID, OUTPUT_DIR) + custom_codebook_example(MODEL_ID, OUTPUT_DIR + "_custom") if __name__ == "__main__": diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt new file mode 100644 index 00000000000..feab3bfd695 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt @@ -0,0 +1,4 @@ +openvino==2025.1 +optimum-intel[openvino]>=1.22.0 +transformers>=4.48.0 +onnx==1.17.0 diff --git a/nncf/__init__.py b/nncf/__init__.py index 77cd6fbb09a..a0f9a45183f 100644 --- a/nncf/__init__.py +++ b/nncf/__init__.py @@ -52,6 +52,7 @@ ) from nncf.quantization.advanced_parameters import AdvancedAWQParameters as AdvancedAWQParameters from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters +from nncf.quantization.advanced_parameters import AdvancedCodebookParameters as AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters diff --git a/nncf/version.py b/nncf/version.py index 3769834a0b7..cec4ea22fb5 100644 --- a/nncf/version.py +++ b/nncf/version.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.17.0" +__version__ = "2.17.0.dev0+6ccd252b3dirty" BKC_TORCH_SPEC = "==2.7.*" From fb259fc03beabe0e36c2c82d2ed21361c6c5dfee Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 12:30:38 +0200 Subject: [PATCH 13/68] Fixed bug with group_size=-1. 
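The fix leans on the lowering helpers treating `group_size=-1` as a single group spanning the whole reduced dimension, so the codebook pass no longer has to expand it manually. A rough numpy sketch of the grouping (shapes and values are invented; the real helpers additionally normalize by the codebook's maximum absolute value and add a machine epsilon to avoid zero scales):

```python
import numpy as np

w = np.arange(32, dtype=np.float32).reshape(4, 8) - 16.0  # toy [out_ch, in_ch] weight

group_size = 4  # -1 would mean "one group per row", i.e. group_size = w.shape[1]
gs = w.shape[1] if group_size == -1 else group_size

grouped = w.reshape(w.shape[0], w.shape[1] // gs, gs)  # [out_ch, in_ch // gs, gs]
scale = np.abs(grouped).max(axis=-1, keepdims=True)    # one max-abs scale per group
norm_weight = grouped / scale                          # normalized values, ready for codebook lookup
```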
--- .../quantization/algorithms/weight_compression/codebook.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index b1dec46275a..55dc9f1a583 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -9,7 +9,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from copy import deepcopy from typing import Any, Optional, TypeVar import nncf @@ -158,12 +157,8 @@ def calculate_quantization_params( weight = fns.transpose(weight) reduction_axis = 1 - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size - max_val = fns.max(fns.abs(codebook)) - norm_weight, scale = do_float_quantization(weight, cur_config, reduction_axis, max_val=max_val) + norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val) orig_shape = norm_weight.shape From 86acc8ee55f1d687d76073ea4340bfcfa3602361 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 12:57:00 +0200 Subject: [PATCH 14/68] Moved convert before gather. --- .../llm_compression/openvino/smollm2_360m_codebook/main.py | 2 +- nncf/openvino/graph/node_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 67fbfccb26c..b53bc5433d8 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -100,7 +100,7 @@ def custom_codebook_example(MODEL_ID, OUTPUT_DIR): model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, - group_size=64, + group_size=-1, advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), ) model.save_pretrained(OUTPUT_DIR) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 1a4fcb06303..75e5208ac43 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -700,11 +700,12 @@ def create_ov_codebook_subgraph( :return: OpenVINO subgraph. """ codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + if codebook_dtype != ov.Type.f16: + codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) + codebook_indexes = opset.constant(indexes.data, dtype=dtype) if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) const = opset.gather(codebook_const, codebook_indexes, 0, name=name) - if codebook_dtype != ov.Type.f16: - const = opset.convert(const, destination_type=ov.Type.f16) return const From b54606c856999037415b834e5405f43d3011bb21 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 13:05:33 +0200 Subject: [PATCH 15/68] Removed backend specific parameter from advanced parameters. 
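With `dst_type` now defaulting to `None`, the OpenVINO backend falls back to `f8e4m3` for the codebook constant. Numerically, the gather-based subgraph built by `create_ov_codebook_subgraph` plus the per-group scale multiplication reconstructs the weight as in this sketch (values are illustrative):

```python
import numpy as np

codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.float32)  # stored in dst_type, f8e4m3 by default
indexes = np.array([[0, 8], [4, 5]], dtype=np.uint8)  # stored as u4 for codebooks with at most 16 entries, u8 otherwise
scale = np.array([[0.02], [0.5]], dtype=np.float32)   # per-group scale constant

# Gather(codebook, indexes) followed by a multiply with the scale.
decompressed = codebook[indexes] * scale
```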
--- nncf/quantization/advanced_parameters.py | 4 +--- .../algorithms/weight_compression/openvino_backend.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 91f95d60303..ba2bcccfad6 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -20,8 +20,6 @@ from enum import Enum from typing import Any, Optional, Union -import openvino as ov - import nncf from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule from nncf.common.quantization.structs import QuantizationScheme as QuantizationMode @@ -392,7 +390,7 @@ class AdvancedCodebookParameters: 3.5, ] ) - dst_type: Any = ov.Type.f8e4m3 + dst_type: Any = None @api() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index f48bff5519b..109fb7fcbc1 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -251,7 +251,9 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=compressed_weight.codebook.dst_type, + codebook_dtype=compressed_weight.codebook.dst_type + if compressed_weight.codebook.dst_type + else ov.Type.f8e4m3, name=const_node_name, ) else: From 72b803e9a3957e41be2b8657c1021669b34dfc0f Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 16 May 2025 13:26:55 +0200 Subject: [PATCH 16/68] Fixed tests. --- .../quantization/test_weights_compression.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index dab7125e1d6..54f1cb2124f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1041,14 +1041,14 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): @pytest.mark.parametrize( ("mode", "all_layers", "ratio", "ref_ids"), ( - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, 5), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, 3), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, 1), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, 0), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, 4), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, 3), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, 1), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, 0), ), ) def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): @@ -1064,28 +1064,28 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): names_codebook = { op.get_friendly_name() for op in 
compressed_model.get_ordered_ops() - if op.get_element_type() == ov.Type.f8e4m3 and not op.get_friendly_name().startswith("Const") + if op.get_element_type() == ov.Type.f8e4m3 and op.get_friendly_name().startswith("Const") } - ref_codebook_nodes = {f"weights_{i}" for i in ref_ids} - assert ref_codebook_nodes == names_codebook + assert ref_ids == len(names_codebook) @pytest.mark.parametrize( ("codebook", "dst_type", "n_layers"), ( - ([i for i in range(-8, 8)], ov.Type.i4, 2 * 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 2 * 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 2 * 5), + ([i for i in range(-8, 8)], ov.Type.i4, 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 5), + ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 5), ), ) -def test_codebook(codebook, dst_type, n_layers): +@pytest.mark.parametrize("group_size", (1, -1)) +def test_codebook(codebook, dst_type, n_layers, group_size): model = SequentialMatmulModel().ov_model compressed_model = compress_weights( model, mode=CompressWeightsMode.CODEBOOK, ratio=1.0, - group_size=1, + group_size=group_size, all_layers=True, advanced_parameters=AdvancedCompressionParameters( codebook_params=AdvancedCodebookParameters(codebook=codebook, dst_type=dst_type) From 79f34a78b41b0343993296964f8701b3c41703c1 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 20 May 2025 11:58:04 +0200 Subject: [PATCH 17/68] Fix for prevent Gather from low-precision types be recognized as input for graph. --- nncf/openvino/graph/metatypes/openvino_metatypes.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nncf/openvino/graph/metatypes/openvino_metatypes.py b/nncf/openvino/graph/metatypes/openvino_metatypes.py index c7726276e00..fe433739237 100644 --- a/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -817,7 +817,16 @@ def _is_embedding(node: ov.Node) -> bool: allowed_types_list = ["f16", "f32", "f64"] const_port_id = 0 input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: + input_type = input_tensor.get_element_type().get_type_name() + + try: + input_node = node.input(const_port_id).get_source_output().node + if input_node.get_type_info().name == "Convert": + input_type = input_node.input_value(0).get_element_type().get_type_name() + except AttributeError: + # Handle the case where input_node is not available + pass + if input_type in allowed_types_list: const_node = get_operation_const_op(node, const_port_id) if const_node is not None: return True From 93233815815c4a5e91908ecca91b0d13b8f81c1b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 21 May 2025 16:24:19 +0200 Subject: [PATCH 18/68] Extend test for codebook. 
--- .../quantization/test_weights_compression.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 54f1cb2124f..0a14c3e4cbb 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,6 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams +from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -1119,6 +1120,30 @@ def test_compressed_weighs_range(mode, data): assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data)) +@pytest.mark.parametrize( + ("data"), + ( + ([-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0]), + ([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]), + ([-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]), + ([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5]), + ), +) +def test_codebook_weighs_range(data): + data = np.array(data).astype(np.float32) + max_diff = 0.1 + w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK) + codebook_compression = CodebookCompression(initial_codebook=data, dst_type=None) + indexes, scale, codebook = codebook_compression.calculate_quantization_params(w, [-1], config) + uncompressed_data = codebook[indexes] * scale + + indexes = indexes.flatten() + target = np.arange(indexes.shape[0]) + assert np.allclose(indexes.data, target) + assert np.all(np.abs(uncompressed_data.data - data) <= max_diff) + + @pytest.mark.parametrize( ("config", "precompute_scale", "precompute_zero_point", "raises"), [ From 464c0974b4c144e16303d03127ffbd55ab804754 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 23 May 2025 13:47:55 +0200 Subject: [PATCH 19/68] Refactoring. --- nncf/openvino/graph/node_utils.py | 2 +- .../openvino/optimized_functions/functions.py | 2 +- .../weight_compression/algorithm.py | 26 ++++++------ .../algorithms/weight_compression/codebook.py | 37 +++++++++++++--- .../algorithms/weight_compression/config.py | 2 +- .../weight_compression/openvino_backend.py | 23 ++++++---- .../weight_compression/weight_lowering.py | 42 +++++++++++++++---- 7 files changed, 99 insertions(+), 35 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 75e5208ac43..0ba2ab1b970 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,7 +699,7 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. 
""" - codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) + codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) #create_ov_const_from_tensor(codebook, codebook_dtype)# if codebook_dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 2a11e4c3608..282a43f9d2b 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -151,7 +151,7 @@ def do_float_quantization( compressed_weight = model([weight, precomputed_scale])[0] scale = precomputed_scale - return compressed_weight, scale + return compressed_weight, scale, None def integer_quantize_dequantize_weight( diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index aa962be7f90..b5826678d3b 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -322,12 +322,12 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) - if self._codebook: - codebook_params = self._advanced_parameters.codebook_params - self._codebook_algo = CodebookCompression( - initial_codebook=codebook_params.codebook, - dst_type=codebook_params.dst_type, - ) + # if self._codebook: + # codebook_params = self._advanced_parameters.codebook_params + # self._codebook_algo = CodebookCompression( + # initial_codebook=codebook_params.codebook, + # dst_type=codebook_params.dst_type, + # ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -660,13 +660,13 @@ def apply( compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" - if self._codebook: - compressed_weights = self._codebook_algo.apply( - model=model, - graph=graph, - all_weight_params=all_weight_params, - backend_entity=self._backend_entity, - ) + # if self._codebook: + # compressed_weights = self._codebook_algo.apply( + # model=model, + # graph=graph, + # all_weight_params=all_weight_params, + # backend_entity=self._backend_entity, + # ) if self._gptq: del statistics model, compressed_weights = self._gptq_algo.apply( diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py index 55dc9f1a583..bcb463e4d67 100644 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ b/nncf/quantization/algorithms/weight_compression/codebook.py @@ -158,15 +158,42 @@ def calculate_quantization_params( reduction_axis = 1 max_val = fns.max(fns.abs(codebook)) - norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val) + if True: + norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - orig_shape = norm_weight.shape + orig_shape = norm_weight.shape - norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) + # norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 + # dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 + + # indexes = dist.data.argmin(-1) + else: + norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) + + orig_shape = norm_weight.shape + + norm_weight = 
fns.unsqueeze(norm_weight.flatten(), 1) + + dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 + + indexes = dist.data.argmin(-1) + + + # norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) + + # norm_weight_, scale_ = do_float_quantization(weight, config, reduction_axis, max_val=max_val) + + # orig_shape = norm_weight_.shape + + # norm_weight_ = fns.unsqueeze(norm_weight_.flatten(), 1) + + # dist = (norm_weight_ - fns.unsqueeze(codebook, 0)) ** 2 + + # indexes_ = dist.data.argmin(-1) + # import numpy as np + # print(np.count_nonzero(indexes_ != indexes.flatten())) - indexes = dist.data.argmin(-1) indexes = fns.reshape(indexes, orig_shape) return indexes, scale, codebook diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 63ed892c472..36879412d45 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -49,7 +49,7 @@ def is_integer(self): """ :return: True if compression type in integer, else False. """ - return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 109fb7fcbc1..432a7eeee32 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -220,6 +220,7 @@ def _create_compression_subgraph( const_dtype, should_add_convert_node: bool, compressed_weight: Optional[CompressedWeight] = None, + advanced_parameters: Optional[AdvancedCompressionParameters] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -236,10 +237,7 @@ def _create_compression_subgraph( elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 elif compression_config.mode == CompressWeightsMode.CODEBOOK: - if compressed_weight is None or not compressed_weight.is_codebook(): - msg = "Codebook compression requires pre-computed codebook." - raise nncf.ValidationError(msg) - compression_dtype = ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 + compression_dtype = None #ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 else: msg = f"{compression_config.mode.value} is not supported." 
raise nncf.ParameterNotSupportedError(msg) @@ -247,12 +245,22 @@ def _create_compression_subgraph( original_shape = weight.shape if compression_config.mode == CompressWeightsMode.CODEBOOK: + codebook_params = advanced_parameters.codebook_params + if compressed_weight is None: + compressed_weight = CompressedWeight(codebook=codebook_params.codebook) + compressed_weight = compress_weight( + weight, + reduction_axes, + compression_config, + compressed_weight, + ) + converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook.codebook, + codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=compressed_weight.codebook.dst_type - if compressed_weight.codebook.dst_type + codebook_dtype=codebook_params.dst_type + if codebook_params.dst_type else ov.Type.f8e4m3, name=const_node_name, ) @@ -335,6 +343,7 @@ def transform_model( const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, compressed_weight=compressed_weight, + advanced_parameters=advanced_parameters ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 70affaa9745..a22d1e3f871 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -159,8 +159,9 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, + quantiles: Optional[Tensor] = None, max_val: float = 6.0, -) -> tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. @@ -172,7 +173,7 @@ def do_float_quantization( :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. :param max_val: Maximal value of destination type. - :return: Returns quantized (for codebook and e2m1 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized (for codebook and e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. 
""" assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] @@ -194,6 +195,8 @@ def do_float_quantization( scale = precomputed_scale if scale is None: + if quantiles is not None: + max_val = max(quantiles) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -202,10 +205,13 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) + elif config.mode == CompressWeightsMode.CODEBOOK and quantiles is not None: + compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=quantiles) + return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved compressed_weight = norm_weight - return compressed_weight, scale + return compressed_weight, scale, None def float_quantize_dequantize_weight( @@ -346,13 +352,16 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - precomputed_scale, precomputed_zero_point = ( - (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) + precomputed_scale, precomputed_zero_point, quantiles = ( + (compressed_weight.scale, compressed_weight.zero_point, compressed_weight.codebook) if compressed_weight else (None, None, None) ) if not config.is_integer: - compressed_weight, scale = do_float_quantization(weight, config, reduction_axes, precomputed_scale) - return CompressedWeight(compressed_weight, scale) + compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale, quantiles=quantiles) + if quantiles is not None and indexes is not None: + return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(quantiles), backend=compressed_weight.backend)) + else: + return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) @@ -524,6 +533,25 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: return quantized_weight +def _calculate_codebook_quantized_weight(norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None) -> Tensor: + """ + Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to "round" or "quantize" to the closest quant. + + :param norm_weight: Weight tensor to quantize already normalized to quantiles range. + :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. 
+ """ + assert quantiles is not None or center_of_quantiles is not None, "Either quantiles or center_of_quantiles should be provided" + + if center_of_quantiles is None: + quantiles = np.array(quantiles) + center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) + center_of_quantiles = fns.from_numpy(center_of_quantiles, backend=norm_weight.backend) + indexes = fns.searchsorted(center_of_quantiles, norm_weight) + quantiles = fns.from_numpy(quantiles, backend=indexes.backend) + quantized_weight = quantiles[indexes] + return quantized_weight, indexes + + def _calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: """ Normalizes the weight tensor using the provided scale. From b964c0c5f264037db67e1aa01098451f22e8368c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 23 May 2025 14:30:47 +0200 Subject: [PATCH 20/68] Delete codebook algo. --- .../weight_compression/algorithm.py | 16 +- .../algorithms/weight_compression/codebook.py | 199 ------------------ 2 files changed, 1 insertion(+), 214 deletions(-) delete mode 100644 nncf/quantization/algorithms/weight_compression/codebook.py diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index b5826678d3b..6cec4d1f6d0 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -38,7 +38,6 @@ from nncf.quantization.advanced_parameters import convert_to_dict_recursively from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ -from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm @@ -289,7 +288,6 @@ def __init__( self._advanced_parameters = ( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) - self._codebook = mode == CompressWeightsMode.CODEBOOK primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) @@ -322,12 +320,6 @@ def __init__( scale_estimation_params.scale_steps, scale_estimation_params.weight_penalty, ) - # if self._codebook: - # codebook_params = self._advanced_parameters.codebook_params - # self._codebook_algo = CodebookCompression( - # initial_codebook=codebook_params.codebook, - # dst_type=codebook_params.dst_type, - # ) self._data_aware_mixed_precision = ( self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0 @@ -660,13 +652,7 @@ def apply( compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" - # if self._codebook: - # compressed_weights = self._codebook_algo.apply( - # model=model, - # graph=graph, - # all_weight_params=all_weight_params, - # backend_entity=self._backend_entity, - # ) + if self._gptq: del statistics model, compressed_weights = self._gptq_algo.apply( diff --git a/nncf/quantization/algorithms/weight_compression/codebook.py b/nncf/quantization/algorithms/weight_compression/codebook.py deleted file mode 100644 index bcb463e4d67..00000000000 --- a/nncf/quantization/algorithms/weight_compression/codebook.py +++ /dev/null @@ -1,199 +0,0 @@ 
-# Copyright (c) 2025 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Optional, TypeVar - -import nncf -from nncf.common.graph.graph import NNCFGraph -from nncf.common.logging.track_progress import track -from nncf.common.utils.backend import BackendType -from nncf.common.utils.backend import get_backend -from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import Codebook -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization -from nncf.tensor import Tensor -from nncf.tensor import TensorDataType -from nncf.tensor import functions as fns - -TModel = TypeVar("TModel") - - -class CodebookCompression: - """ - Codebook estimation algorithm implementation. - """ - - def __init__( - self, - initial_codebook: Tensor, - dst_type: Any, - ): - """ - :param initial_codebook: codebook for compression. - """ - super().__init__() - self._initial_codebook = initial_codebook - self._dst_type = dst_type - - @property - def available_backends(self) -> list[BackendType]: - return [BackendType.OPENVINO] - - def _set_backend_entity(self, model: TModel) -> None: - """ - Creates a helper class with a backed-specific logic of the algorithm. - - :param model: Backend-specific input model. - """ - model_backend = get_backend(model) - if model_backend == BackendType.OPENVINO: - from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend - - self._backend_entity = OVWeightCompressionAlgoBackend(model) - else: - msg = ( - "Cannot return backend-specific Scale Estimation entity because" - f" {model_backend.value} is not supported!" - ) - raise nncf.UnsupportedBackendError(msg) - - def apply( - self, - model: TModel, - graph: NNCFGraph, - all_weight_params: list[WeightCompressionParameters], - backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> dict[str, CompressedWeight]: - """ - Estimates better scale for the int4 nodes in the model. - Minimizes per-group difference between floating point MatMul and - MatMul with compressed weights. - The algorithm computes weighted scale for the group of weights in MatMul, which - shared the same scale. - - :param model: Model for applying algorithm. - :param graph: Model graph. - :param all_weight_params: List of all weight parameters. - :param backend_entity: Weight compression algorithm backend. - :return: Two dictionaries for estimated scales and zero points for each weight name. 
- """ - self._backend_entity = backend_entity - if self._backend_entity is None: - self._set_backend_entity(model) - - res = {} - invalid_node_names = [] - first_caught_error = None - for wp in track(all_weight_params, description="Applying Codebook Compression"): - if wp.compression_config.mode != CompressWeightsMode.CODEBOOK: - continue - weight_name = wp.weight_name - config = wp.compression_config - - weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) - if len(weight_data) != 1: # not supported by the algorithm - continue - _, weight_port_id = weight_data[0] - - weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - - try: - indexes, scale, codebook = self.calculate_quantization_params(weight, wp.reduction_axes, config) - res[weight_name] = CompressedWeight(indexes, scale, None, Codebook(codebook, self._dst_type)) - except nncf.InvalidGroupSizeError as error: - first_caught_error = error - invalid_node_names.append(wp.node_with_weight.node_name) - - if first_caught_error: - handle_invalid_group_size_error(first_caught_error, invalid_node_names) - - return res - - def calculate_quantization_params( - self, - weight: Tensor, - reduction_axes: tuple[int, ...], - config: WeightCompressionConfig, - ) -> Tensor: - """ - Calculates the quantization parameters for a given set of weights and activations. - This function estimates the optimal quantization scale for weight compression by - minimizing the difference between floating-point operations and operations with - quantized weights. - - The function uses an iterative process: - 1. Initial scale rectification based on activation statistics. - 2. A grid search to further refine the scale parameters. - - :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, - together with original activation tensor shapes. - :param weight: The weight tensor that is being quantized. - :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. - :param config: Configuration parameters for the weight compression, including quantization settings. - :return: A tensor containing the calculated quantization scales and zero points if applicable. 
- """ - reduction_axis = reduction_axes[0] - - weight = weight.astype(TensorDataType.float32) - - codebook = fns.tensor( - self._initial_codebook, backend=weight.backend, dtype=TensorDataType.float32, device=weight.device - ) - - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 - - max_val = fns.max(fns.abs(codebook)) - if True: - norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - - orig_shape = norm_weight.shape - - # norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - - # dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 - - # indexes = dist.data.argmin(-1) - else: - norm_weight, scale = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - - orig_shape = norm_weight.shape - - norm_weight = fns.unsqueeze(norm_weight.flatten(), 1) - - dist = (norm_weight - fns.unsqueeze(codebook, 0)) ** 2 - - indexes = dist.data.argmin(-1) - - - # norm_weight, scale, indexes = do_float_quantization(weight, config, reduction_axis, max_val=max_val, quantiles=codebook) - - # norm_weight_, scale_ = do_float_quantization(weight, config, reduction_axis, max_val=max_val) - - # orig_shape = norm_weight_.shape - - # norm_weight_ = fns.unsqueeze(norm_weight_.flatten(), 1) - - # dist = (norm_weight_ - fns.unsqueeze(codebook, 0)) ** 2 - - # indexes_ = dist.data.argmin(-1) - # import numpy as np - # print(np.count_nonzero(indexes_ != indexes.flatten())) - - indexes = fns.reshape(indexes, orig_shape) - - return indexes, scale, codebook From 145fbf3a984f581e6d73533afef593fdbc3373c5 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 09:44:33 +0200 Subject: [PATCH 21/68] Refactoring. --- .../weight_compression/algorithm.py | 2 +- .../algorithms/weight_compression/config.py | 3 +- .../weight_compression/openvino_backend.py | 28 ++++++++----------- .../weight_compression/scale_estimation.py | 4 +-- .../weight_compression/weight_lowering.py | 21 +++++++------- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 4a932b9774a..1c0a6d93336 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -445,7 +445,7 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) + primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook) if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 36879412d45..80709f53d95 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -10,7 +10,7 @@ # limitations under the License. 
from dataclasses import dataclass from dataclasses import field -from typing import Optional, TypeVar +from typing import Optional, TypeVar, Any import numpy as np @@ -32,6 +32,7 @@ class WeightCompressionConfig: mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 + user_data: Optional[Any] = None @property def num_bits(self): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 432a7eeee32..8db1a463cb7 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -237,24 +237,25 @@ def _create_compression_subgraph( elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 elif compression_config.mode == CompressWeightsMode.CODEBOOK: - compression_dtype = None #ov.Type.u8 if compressed_weight.tensor.max() > 15 else ov.Type.u4 + compression_dtype = None else: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) original_shape = weight.shape - if compression_config.mode == CompressWeightsMode.CODEBOOK: - codebook_params = advanced_parameters.codebook_params - if compressed_weight is None: - compressed_weight = CompressedWeight(codebook=codebook_params.codebook) + with disable_results_caching(OV_MODEL_CACHE): compressed_weight = compress_weight( - weight, - reduction_axes, - compression_config, - compressed_weight, - ) + weight, + reduction_axes, + compression_config, + compressed_weight, + ) + if compression_config.mode == CompressWeightsMode.CODEBOOK: + n_quants = compressed_weight.tensor.max() + compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) + codebook_params = advanced_parameters.codebook_params converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, @@ -265,13 +266,6 @@ def _create_compression_subgraph( name=const_node_name, ) else: - with disable_results_caching(OV_MODEL_CACHE): - compressed_weight = compress_weight( - weight, - reduction_axes, - compression_config, - compressed_weight, - ) compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index a772dd107b2..8701250bace 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -301,7 +301,7 @@ def calculate_quantization_params( if i < initial_steps - 1: if config.mode == CompressWeightsMode.NF4: - out, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) + out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out, _, _ = do_integer_quantization( original_weight, @@ -319,7 +319,7 @@ def calculate_quantization_params( scaled_scale = factor * scale if config.mode == CompressWeightsMode.NF4: - out, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) + out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) else: out, _, _ = do_integer_quantization( original_weight, diff --git 
a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index a22d1e3f871..97d21bbdd05 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -159,7 +159,6 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, - quantiles: Optional[Tensor] = None, max_val: float = 6.0, ) -> tuple[Tensor, Tensor, Tensor]: """ @@ -195,8 +194,8 @@ def do_float_quantization( scale = precomputed_scale if scale is None: - if quantiles is not None: - max_val = max(quantiles) + if config.mode == CompressWeightsMode.CODEBOOK: + max_val = max(config.user_data) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -205,8 +204,8 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) - elif config.mode == CompressWeightsMode.CODEBOOK and quantiles is not None: - compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=quantiles) + elif config.mode == CompressWeightsMode.CODEBOOK: + compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.user_data) return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved @@ -250,7 +249,7 @@ def float_quantize_dequantize_weight( ) # Reference implementation - compressed_weight, scale = do_float_quantization(weight, config, reduction_axes, precomputed_scale) + compressed_weight, scale, _ = do_float_quantization(weight, config, reduction_axes, precomputed_scale) decompressed_weight = do_float_dequantization(compressed_weight, scale) if return_compressed_weight: return decompressed_weight, compressed_weight, scale @@ -352,14 +351,14 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - precomputed_scale, precomputed_zero_point, quantiles = ( - (compressed_weight.scale, compressed_weight.zero_point, compressed_weight.codebook) if compressed_weight else (None, None, None) + precomputed_scale, precomputed_zero_point = ( + (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) ) if not config.is_integer: - compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale, quantiles=quantiles) - if quantiles is not None and indexes is not None: - return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(quantiles), backend=compressed_weight.backend)) + compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) + if indexes is not None: + return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend)) else: return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( From d4e8578c8db4c66c32a015a04c66d60cf40fcdb4 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 13:44:15 +0200 Subject: [PATCH 22/68] Data aware codebook. 
--- nncf/openvino/graph/node_utils.py | 2 +- .../weight_compression/algorithm.py | 4 +++- .../algorithms/weight_compression/awq.py | 3 +-- .../algorithms/weight_compression/config.py | 2 +- .../algorithms/weight_compression/gptq.py | 8 ++++--- .../weight_compression/openvino_backend.py | 6 ++--- .../weight_compression/scale_estimation.py | 11 +++++----- .../weight_compression/weight_lowering.py | 22 +++++++++++++------ nncf/quantization/quantize_model.py | 1 - 9 files changed, 33 insertions(+), 26 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 0ba2ab1b970..75e5208ac43 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,7 +699,7 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. """ - codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) #create_ov_const_from_tensor(codebook, codebook_dtype)# + codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) if codebook_dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 1c0a6d93336..89eb99d8fa8 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -445,7 +445,9 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook) + primary_config = WeightCompressionConfig( + mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook + ) if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py index fbab09a1fdf..fa423828fc1 100644 --- a/nncf/quantization/algorithms/weight_compression/awq.py +++ b/nncf/quantization/algorithms/weight_compression/awq.py @@ -25,7 +25,6 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend @@ -250,7 +249,7 @@ def _data_aware_step(self, wp, weight, statistics): for _ in range(self._steps): cur_scale = gscale**alpha weights_to_fake_quantize = gweight * cur_scale - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: g_decompressed_weighs = float_quantize_dequantize_weight( weights_to_fake_quantize, awq_config, reduction_axis ) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 80709f53d95..2939e2af609 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -10,7 +10,7 @@ # limitations under the License. 
from dataclasses import dataclass from dataclasses import field -from typing import Optional, TypeVar, Any +from typing import Any, Optional, TypeVar import numpy as np diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 70a340b36b2..015b2628cee 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -235,7 +235,9 @@ def _quantize_weights( else weight_tensor.shape[1] ) reduction_axes = wc_params.reduction_axes - block_compression_config = WeightCompressionConfig(mode=wc_params.compression_config.mode) + block_compression_config = WeightCompressionConfig( + mode=wc_params.compression_config.mode, user_data=wc_params.compression_config.user_data + ) damp = self._damp_percent * fns.mean(fns.diag(hessian)) diag_indices = fns.arange(columns, backend=hessian.backend, device=hessian.device) @@ -260,7 +262,7 @@ def _quantize_weights( hessian_diag_val = hessian_inv_block[i, i] if (i1 + i) % group_size == 0: - if block_compression_config.mode == CompressWeightsMode.NF4: + if not block_compression_config.is_integer: scale = calculate_float_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config ) @@ -289,7 +291,7 @@ def _quantize_weights( # optimized OV compression performs worse than numpy compression. # TODO(nikita-savelyevv): Remove this workaround by introducing logic that will control whether to # execute optimized compression based on input size. - if block_compression_config.mode == CompressWeightsMode.NF4: + if not block_compression_config.is_integer: quantized_col = float_quantize_dequantize_weight( fns.unsqueeze(weight_col, 1), block_compression_config, diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 8db1a463cb7..447db197855 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -260,9 +260,7 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=codebook_params.dst_type - if codebook_params.dst_type - else ov.Type.f8e4m3, + codebook_dtype=codebook_params.dst_type if codebook_params.dst_type else ov.Type.f8e4m3, name=const_node_name, ) else: @@ -337,7 +335,7 @@ def transform_model( const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, compressed_weight=compressed_weight, - advanced_parameters=advanced_parameters + advanced_parameters=advanced_parameters, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 8701250bace..020a42c8f16 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -18,7 +18,6 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend 
import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.common import CompressedWeight @@ -213,7 +212,7 @@ def calculate_quantization_params( cur_config.group_size = group_size original_weight = fns.zeros_like(weight) + weight - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: q_weights, compressed_weights, scale = float_quantize_dequantize_weight( original_weight, cur_config, reduction_axis, return_compressed_weight=True ) @@ -262,7 +261,7 @@ def calculate_quantization_params( near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out = float_quantize_dequantize_weight( original_weight, config, @@ -300,7 +299,7 @@ def calculate_quantization_params( result_scale = near_to_ideal_scale if i < initial_steps - 1: - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out, _, _ = do_integer_quantization( @@ -318,7 +317,7 @@ def calculate_quantization_params( factor = 1.0 - 0.05 * scale_step scaled_scale = factor * scale - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) else: out, _, _ = do_integer_quantization( @@ -334,7 +333,7 @@ def calculate_quantization_params( near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out = float_quantize_dequantize_weight(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out = integer_quantize_dequantize_weight( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 97d21bbdd05..774d720b65b 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -172,7 +172,8 @@ def do_float_quantization( :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. :param max_val: Maximal value of destination type. - :return: Returns quantized (for codebook and e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. + :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and + optional indexes for codebook. """ assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] @@ -231,11 +232,11 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK] # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved # Optimized implementation - if _can_run_optimized(weight.backend): + if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight.backend): from nncf.openvino.optimized_functions import ( float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov, ) @@ -358,7 +359,9 @@ def compress_weight( if not config.is_integer: compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) if indexes is not None: - return CompressedWeight(indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend)) + return CompressedWeight( + indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend) + ) else: return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( @@ -532,14 +535,19 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: return quantized_weight -def _calculate_codebook_quantized_weight(norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None) -> Tensor: +def _calculate_codebook_quantized_weight( + norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None +) -> Tensor: """ - Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to "round" or "quantize" to the closest quant. + Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to + "round" or "quantize" to the closest quant. :param norm_weight: Weight tensor to quantize already normalized to quantiles range. :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. """ - assert quantiles is not None or center_of_quantiles is not None, "Either quantiles or center_of_quantiles should be provided" + assert quantiles is not None or center_of_quantiles is not None, ( + "Either quantiles or center_of_quantiles should be provided" + ) if center_of_quantiles is None: quantiles = np.array(quantiles) diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 6dc78e45f4a..0595a46d5f0 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -599,7 +599,6 @@ def compress_weights( if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ CompressWeightsMode.E2M1, - CompressWeightsMode.CODEBOOK, ]: msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." raise nncf.ParameterNotSupportedError(msg) From ac0346dea67b4854e6ccdf2e022eabbb9fc5c034 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 14:10:24 +0200 Subject: [PATCH 23/68] Fixed test. 
--- .../openvino/smollm2_360m_codebook/main.py | 32 +++++++++---------- .../test_compression_functions.py | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index b53bc5433d8..923829f08b2 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -41,10 +41,10 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions -def default_codebook_example(MODEL_ID, OUTPUT_DIR): - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +def default_codebook_example(model_id, output_dir): + tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( - MODEL_ID, + model_id, export=True, load_in_8bit=False, compile=False, @@ -63,20 +63,20 @@ def default_codebook_example(MODEL_ID, OUTPUT_DIR): print(f"Non-optimized model outputs:\n{answers_by_questions}\n") model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) - model.save_pretrained(OUTPUT_DIR) - tokenizer.save_pretrained(OUTPUT_DIR) + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) model = OVModelForCausalLM.from_pretrained( - OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") -def custom_codebook_example(MODEL_ID, OUTPUT_DIR): - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +def custom_codebook_example(model_id, output_dir): + tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( - MODEL_ID, + model_id, export=True, load_in_8bit=False, compile=False, @@ -103,22 +103,22 @@ def custom_codebook_example(MODEL_ID, OUTPUT_DIR): group_size=-1, advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), ) - model.save_pretrained(OUTPUT_DIR) - tokenizer.save_pretrained(OUTPUT_DIR) + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) model = OVModelForCausalLM.from_pretrained( - OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"} + output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) answers_by_questions = generate_answers(questions, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") def main(): - MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" - OUTPUT_DIR = "smollm2_360m_compressed_codebook" + model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" + output_dir = "smollm2_360m_compressed_codebook" - default_codebook_example(MODEL_ID, OUTPUT_DIR) - custom_codebook_example(MODEL_ID, OUTPUT_DIR + "_custom") + default_codebook_example(model_id, output_dir) + custom_codebook_example(model_id, output_dir + "_custom") if __name__ == "__main__": diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 77eaaf9364c..b57c3bb9281 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -215,7 +215,7 @@ 
def test_quantization_alignment(weight_shape, config, quantization_task, tensor_ if config.is_integer: compressed_weight, scale, zero_point = outputs else: - compressed_weight, scale = outputs + compressed_weight, scale, _ = outputs elif quantization_task == QuantizationTask.Q_DQ: decompressed_weight = outputs else: From 5fb55e42fc6a0dd846ce4203c0f607facff002cd Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 26 May 2025 15:21:23 +0200 Subject: [PATCH 24/68] Fixed tests. --- .../openvino/smollm2_360m_codebook/main.py | 30 ++++++++----------- .../weight_compression/algorithm.py | 11 ++++--- .../weight_compression/weight_lowering.py | 2 +- .../quantization/test_weights_compression.py | 24 ++++----------- 4 files changed, 25 insertions(+), 42 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 923829f08b2..5c797f8c691 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -41,6 +41,14 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions +QUESTIONS = [ + "What is the capital of France?", + "What is the highest peak in the Alps?", + "What is the largest city in Canada?", + "What is the most visited city in Japan?", +] + + def default_codebook_example(model_id, output_dir): tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( @@ -52,14 +60,7 @@ def default_codebook_example(model_id, output_dir): ov_config={"INFERENCE_PRECISION_HINT": "f32"}, ) - questions = [ - "What is the capital of France?", - "What is the highest peak in the Alps?", - "What is the largest city in Canada?", - "What is the most visited city in Japan?", - ] - - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) @@ -69,7 +70,7 @@ def default_codebook_example(model_id, output_dir): model = OVModelForCausalLM.from_pretrained( output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") @@ -84,14 +85,7 @@ def custom_codebook_example(model_id, output_dir): ov_config={"INFERENCE_PRECISION_HINT": "f32"}, ) - questions = [ - "What is the capital of France?", - "What is the highest peak in the Alps?", - "What is the largest city in Canada?", - "What is the most visited city in Japan?", - ] - - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") codebook_params = nncf.AdvancedCodebookParameters([-8, -4, -2, -1, 0, 1, 2, 4, 8], ov.Type.i8) @@ -109,7 +103,7 @@ def custom_codebook_example(model_id, output_dir): model = OVModelForCausalLM.from_pretrained( output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} ) - answers_by_questions = generate_answers(questions, model, tokenizer) + answers_by_questions = generate_answers(QUESTIONS, model, 
tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 89eb99d8fa8..a24222f9581 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -293,7 +293,7 @@ def __init__( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) + primary_config = self._get_primary_config() criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) self._mixed_precision_algo = criterion_cls(primary_config, self._ratio, self._subset_size) self._statistics_path = self._advanced_parameters.statistics_path @@ -429,6 +429,11 @@ def _get_ratio_defining_params( return ratio_defining_params + def _get_primary_config(self): + return WeightCompressionConfig( + mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook + ) + def _set_weight_compression_config( self, ratio_defining_params: list[WeightCompressionParameters], @@ -445,9 +450,7 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig( - mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook - ) + primary_config = self._get_primary_config() if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index a7edf80cab0..c18abe82f8a 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -199,7 +199,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: if config.mode == CompressWeightsMode.CODEBOOK: - max_val = max(config.user_data) + max_val = max(np.abs(np.array(config.user_data))) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index dece9e25e8f..44cc8ca62c1 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,7 +41,6 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams -from nncf.quantization.algorithms.weight_compression.codebook import CodebookCompression from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -49,6 +48,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import 
MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization @@ -699,20 +699,6 @@ def test_raise_error_with_unsupported_params_for_e2m1(algo): compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.E2M1, **{algo: True}) -@pytest.mark.parametrize( - "algo", - ( - "lora_correction", - "awq", - "scale_estimation", - "gptq", - ), -) -def test_raise_error_with_unsupported_params_for_codebook(algo): - with pytest.raises(nncf.ParameterNotSupportedError): - compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.CODEBOOK, **{algo: True}) - - @pytest.mark.parametrize("mode", INT4_NF4_MODES) @pytest.mark.parametrize( "algo", @@ -1132,12 +1118,12 @@ def test_compressed_weighs_range(mode, data): ) def test_codebook_weighs_range(data): data = np.array(data).astype(np.float32) + codebook = data max_diff = 0.1 w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) - config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK) - codebook_compression = CodebookCompression(initial_codebook=data, dst_type=None) - indexes, scale, codebook = codebook_compression.calculate_quantization_params(w, [-1], config) - uncompressed_data = codebook[indexes] * scale + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, user_data=data) + _, scale, indexes = do_float_quantization(w, config, -1) + uncompressed_data = codebook[indexes.data] * scale.data indexes = indexes.flatten() target = np.arange(indexes.shape[0]) From bf94228a9f945f77200ddd04f32246c0155294c3 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 28 May 2025 10:21:39 +0200 Subject: [PATCH 25/68] Added CB4_F8E4M3 type. 
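CB4_F8E4M3 is a codebook mode with a fixed 16-entry lookup table of f8e4m3 values, so unlike CODEBOOK it needs no user-supplied table. A minimal usage sketch, mirroring the example change in this patch (the `model` object is assumed to be an optimum-intel OVModelForCausalLM):

    import nncf

    # The 16 fp8 codebook values are built in, so no advanced_parameters are required.
    model.model = nncf.compress_weights(
        model.model,
        mode=nncf.CompressWeightsMode.CB4_F8E4M3,
        ratio=1.0,
        group_size=64,
    )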
--- .../openvino/smollm2_360m_codebook/main.py | 6 ++-- nncf/parameters.py | 2 ++ .../weight_compression/algorithm.py | 7 +++- .../algorithms/weight_compression/config.py | 14 +++++++- .../weight_compression/openvino_backend.py | 4 +-- .../weight_compression/weight_lowering.py | 32 ++++++++++++++++--- 6 files changed, 54 insertions(+), 11 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 5c797f8c691..feaa3fe8fec 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -63,7 +63,7 @@ def default_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=64) + model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) @@ -88,7 +88,9 @@ def custom_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - codebook_params = nncf.AdvancedCodebookParameters([-8, -4, -2, -1, 0, 1, 2, 4, 8], ov.Type.i8) + codebook_params = nncf.AdvancedCodebookParameters( + [-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8 + ) model.model = nncf.compress_weights( model.model, diff --git a/nncf/parameters.py b/nncf/parameters.py index 50567733098..b8966210d75 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -86,6 +86,7 @@ class CompressWeightsMode(StrEnum): :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. :param CODEBOOK: Codebook (LUT) quantization format. + :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values. 
""" INT8_SYM = "int8_sym" @@ -96,6 +97,7 @@ class CompressWeightsMode(StrEnum): INT8 = "int8" # Deprecated mode E2M1 = "e2m1" CODEBOOK = "codebook" + CB4_F8E4M3 = "cb4_f8e4m3" @api(canonical_alias="nncf.CompressionFormat") diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index a24222f9581..bff4e9abc38 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -43,6 +43,7 @@ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation +from nncf.quantization.algorithms.weight_compression.weight_lowering import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig from nncf.scopes import IgnoredScope from nncf.scopes import get_ignored_node_names_from_ignored_scope @@ -431,7 +432,11 @@ def _get_ratio_defining_params( def _get_primary_config(self): return WeightCompressionConfig( - mode=self._mode, group_size=self._group_size, user_data=self._advanced_parameters.codebook_params.codebook + mode=self._mode, + group_size=self._group_size, + user_data=CB4_QUANTILES + if self._mode == CompressWeightsMode.CB4_F8E4M3 + else self._advanced_parameters.codebook_params.codebook, ) def _set_weight_compression_config( diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 2939e2af609..ff9b3eb10e9 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -50,7 +50,19 @@ def is_integer(self): """ :return: True if compression type in integer, else False. """ - return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] + return self.mode not in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ] + + @property + def is_codebook(self): + """ + :return: True if compression type is codebook, else False. + """ + return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 447db197855..cd848c4120a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -236,7 +236,7 @@ def _create_compression_subgraph( compression_dtype = ov.Type.i8 elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 - elif compression_config.mode == CompressWeightsMode.CODEBOOK: + elif compression_config.is_codebook: compression_dtype = None else: msg = f"{compression_config.mode.value} is not supported." 
@@ -252,7 +252,7 @@ def _create_compression_subgraph( compressed_weight, ) - if compression_config.mode == CompressWeightsMode.CODEBOOK: + if compression_config.is_codebook: n_quants = compressed_weight.tensor.max() compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) codebook_params = advanced_parameters.codebook_params diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c18abe82f8a..bb43205d276 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -50,6 +50,28 @@ dtype=np.float32, ) +CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, +) + CENTER_OF_NF4_QUANTILES = np.array( [ -0.84809643, @@ -119,13 +141,13 @@ def calculate_float_quantization_params( :param max_val: Maximal value of e2m1 type. :return: Scale tensor of float32 type for float quantization. """ - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] + assert not config.is_integer if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) - if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero @@ -178,7 +200,7 @@ def do_float_quantization( :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. """ - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK] + assert not config.is_integer if config.group_size != -1 and reduction_axes is not None: # weights are reshaped: [a1, r, a2] -> [a1, r//gs, gs, a2] @@ -198,7 +220,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: - if config.mode == CompressWeightsMode.CODEBOOK: + if config.is_codebook: max_val = max(np.abs(np.array(config.user_data))) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) @@ -208,7 +230,7 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) - elif config.mode == CompressWeightsMode.CODEBOOK: + elif config.is_codebook: compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.user_data) return compressed_weight, scale, indexes else: From 37a7c590fd2cf11aca030a402deb627794ac5d8c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 28 May 2025 10:37:51 +0200 Subject: [PATCH 26/68] Fixed pre-commit. 
--- nncf/quantization/advanced_parameters.py | 1 + nncf/quantization/algorithms/weight_compression/onnx_backend.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index ba2bcccfad6..9d67d90a56f 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -364,6 +364,7 @@ class AdvancedLoraCorrectionParameters: class AdvancedCodebookParameters: """ Contains advanced parameters for codebook compression algorithm. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization. :type codebook: list[Any] diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 478ddd076c1..cdcef81d77c 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -50,7 +50,6 @@ from nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType From 6006be654e065e0ce5058aabaa5fed6fd53c43c1 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 12:04:35 +0200 Subject: [PATCH 27/68] Applied suggestions. --- nncf/quantization/advanced_parameters.py | 21 +------------------ .../algorithms/weight_compression/backend.py | 2 +- .../algorithms/weight_compression/common.py | 2 +- .../weight_compression/onnx_backend.py | 2 +- .../weight_compression/openvino_backend.py | 4 ++-- .../weight_compression/torch_backend.py | 4 ++-- .../weight_compression/torch_fx_backend.py | 4 ++-- 7 files changed, 10 insertions(+), 29 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 9d67d90a56f..fa9746900d4 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -371,26 +371,7 @@ class AdvancedCodebookParameters: :param dts_type: The type of the codebook. 
""" - codebook: list[Any] = field( - default_factory=lambda: [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.2812, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ] - ) + codebook: list[Any] = None dst_type: Any = None diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 2d928ff2908..cee8763995b 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -149,7 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py index a172899374f..8c1d60fd400 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -50,4 +50,4 @@ def is_codebook(self): :return: True if the compressed weight is a codebook, False otherwise. """ - return not (self.codebook is None or self.tensor is None or self.scale is None) + return self.codebook is not None and self.tensor is not None and self.scale is not None diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index cdcef81d77c..c0a2ab73849 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -201,7 +201,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index cd848c4120a..6a3a3bf1c56 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -299,8 +299,8 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> ov.Model: diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 92306f0a24b..f4254bfb0c5 100644 --- 
a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -456,8 +456,8 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> NNCFNetwork: diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 80597096346..2172a6a5e37 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -190,8 +190,8 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: dict[str, CompressedWeight] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> torch.fx.GraphModule: From caed8a8569451f1ea2648cea544ee6d6f32693ca Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 12:25:44 +0200 Subject: [PATCH 28/68] Fixed tests. --- nncf/quantization/advanced_parameters.py | 2 +- tests/openvino/native/quantization/test_weights_compression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index fa9746900d4..945ae9bd68c 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -371,7 +371,7 @@ class AdvancedCodebookParameters: :param dts_type: The type of the codebook. """ - codebook: list[Any] = None + codebook: Optional[list[Any]] = None dst_type: Any = None diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 44cc8ca62c1..cf07a96d486 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1043,7 +1043,7 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): model = SequentialMatmulModel().ov_model compressed_model = compress_weights( model, - mode=CompressWeightsMode.CODEBOOK, + mode=CompressWeightsMode.CB4_F8E4M3, ratio=ratio, group_size=1, all_layers=all_layers, From 0a36b5169e6a1e92f1da034d389397cf75442270 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 18:38:57 +0200 Subject: [PATCH 29/68] Added codebook parameters validation.
--- .../weight_compression/algorithm.py | 6 ++++++ nncf/quantization/quantize_model.py | 21 ++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index bff4e9abc38..6c90bda6f44 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -208,6 +208,12 @@ def check_user_compression_configuration( msg = "LoRA Correction algorithm is not compatible with FQ, FQ_LORA and FQ_LORA_NLS compression formats." raise nncf.ValidationError(msg) + if mode == CompressWeightsMode.CODEBOOK and ( + advanced_parameters is None or advanced_parameters.codebook_params.codebook is not None + ): + msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters." + raise nncf.ValidationError(msg) + class WeightCompression(Algorithm): """ diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 0595a46d5f0..5b69e52359d 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -516,7 +516,12 @@ def compress_weights( from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) @@ -560,7 +565,12 @@ def compress_weights( compress_weights_impl as fx_compression_weights_impl, ) - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) @@ -616,7 +626,12 @@ def compress_weights( elif backend == BackendType.ONNX: from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK]: + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: msg = "ONNX backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) From 68d633bbf65a0414f54250f5fb128cd27f45564c Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 2 Jun 2025 19:05:25 +0200 Subject: [PATCH 30/68] Fixed bug. 
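The check introduced in the previous patch had its condition inverted and rejected exactly the configurations that did supply a codebook; after this fix the error fires only when CODEBOOK mode is requested without one. A sketch of the intended behaviour (using a hypothetical `ov_model`):

    import nncf

    # Raises nncf.ValidationError: CODEBOOK mode without a codebook in advanced_parameters.
    nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.CODEBOOK)

    # Passes validation: the codebook is provided explicitly.
    nncf.compress_weights(
        ov_model,
        mode=nncf.CompressWeightsMode.CODEBOOK,
        group_size=64,
        advanced_parameters=nncf.AdvancedCompressionParameters(
            codebook_params=nncf.AdvancedCodebookParameters(codebook=[-1.0, -0.5, 0.0, 0.5, 1.0]),
        ),
    )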
--- nncf/quantization/algorithms/weight_compression/algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 6c90bda6f44..d21a851e0eb 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -209,7 +209,7 @@ def check_user_compression_configuration( raise nncf.ValidationError(msg) if mode == CompressWeightsMode.CODEBOOK and ( - advanced_parameters is None or advanced_parameters.codebook_params.codebook is not None + advanced_parameters is None or advanced_parameters.codebook_params.codebook is None ): msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters." raise nncf.ValidationError(msg) From 508aec426b795ce1f8911015c0fa8b75607032c6 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 6 Jun 2025 10:26:00 +0200 Subject: [PATCH 31/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 17 +++++++++-------- .../algorithms/weight_compression/algorithm.py | 2 +- .../algorithms/weight_compression/config.py | 2 +- .../algorithms/weight_compression/gptq.py | 2 +- .../weight_compression/weight_lowering.py | 14 ++++++++++---- nncf/version.py | 2 +- tests/cross_fw/examples/example_scope.json | 17 +++++++++++++++++ tests/cross_fw/examples/run_example.py | 6 ++++++ .../quantization/test_weights_compression.py | 2 +- 9 files changed, 47 insertions(+), 17 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index feaa3fe8fec..4eb3ef98612 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -67,12 +67,12 @@ def default_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained( - output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} - ) + model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") + return list(answers_by_questions.values()) + def custom_codebook_example(model_id, output_dir): tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -102,19 +102,20 @@ def custom_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained( - output_dir, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "64", "INFERENCE_PRECISION_HINT": "f32"} - ) + model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") + return list(answers_by_questions.values()) + def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" output_dir = "smollm2_360m_compressed_codebook" - default_codebook_example(model_id, output_dir) - custom_codebook_example(model_id, output_dir + "_custom") + res = default_codebook_example(model_id, output_dir) + res += custom_codebook_example(model_id, output_dir + "_custom") + return res if __name__ == "__main__": diff --git 
a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index d21a851e0eb..7070311f8e4 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -440,7 +440,7 @@ def _get_primary_config(self): return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - user_data=CB4_QUANTILES + codebook_values=CB4_QUANTILES if self._mode == CompressWeightsMode.CB4_F8E4M3 else self._advanced_parameters.codebook_params.codebook, ) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index ff9b3eb10e9..519be93dee4 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -32,7 +32,7 @@ class WeightCompressionConfig: mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 - user_data: Optional[Any] = None + codebook_values: Optional[Any] = None @property def num_bits(self): diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 963c9feb252..767fe2f5127 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -235,7 +235,7 @@ def _quantize_weights( ) reduction_axes = wc_params.reduction_axes block_compression_config = WeightCompressionConfig( - mode=wc_params.compression_config.mode, user_data=wc_params.compression_config.user_data + mode=wc_params.compression_config.mode, codebook_values=wc_params.compression_config.codebook_values ) damp = self._damp_percent * fns.mean(fns.diag(hessian)) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index bb43205d276..bd1031227ba 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -221,7 +221,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: if config.is_codebook: - max_val = max(np.abs(np.array(config.user_data))) + max_val = max(np.abs(np.array(config.codebook_values))) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -231,7 +231,7 @@ def do_float_quantization( else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) elif config.is_codebook: - compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.user_data) + compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.codebook_values) return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved @@ -257,7 +257,7 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK] + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved # Optimized implementation @@ -385,7 +385,10 @@ def compress_weight( compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) if indexes is not None: return CompressedWeight( - indexes, scale, None, fns.from_numpy(np.array(config.user_data), backend=compressed_weight.backend) + indexes, + scale, + None, + fns.from_numpy(np.array(config.codebook_values), backend=compressed_weight.backend), ) else: return CompressedWeight(compressed_weight, scale) @@ -568,6 +571,9 @@ def _calculate_codebook_quantized_weight( "round" or "quantize" to the closest quant. :param norm_weight: Weight tensor to quantize already normalized to quantiles range. + :param quantiles: Quantiles to use for quantization. If None, the center_of_quantiles must be provided. + :param center_of_quantiles: Center of quantiles to use for quantization. If None, it is calculated as the average + of adjacent quantiles. :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. """ assert quantiles is not None or center_of_quantiles is not None, ( diff --git a/nncf/version.py b/nncf/version.py index cec4ea22fb5..3769834a0b7 100644 --- a/nncf/version.py +++ b/nncf/version.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.17.0.dev0+6ccd252b3dirty" +__version__ = "2.17.0" BKC_TORCH_SPEC = "==2.7.*" diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index 36e627b1f00..565a77d468c 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -275,6 +275,23 @@ ] } }, + "codebook_llm_compression": { + "backend": "openvino", + "requirements": "examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt", + "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz", + "accuracy_metrics": { + "answers": [ + "Paris.", + "Mont Blanc.", + "Toronto.", + "Tokyo.", + "Paris.", + "Mont Blanc.", + "Toronto.", + "Tokyo." 
+ ] + } + }, "llm_compression_qat_with_lora": { "backend": "torch", "device": "cuda", diff --git a/tests/cross_fw/examples/run_example.py b/tests/cross_fw/examples/run_example.py index b44e130b283..09dd29fb719 100644 --- a/tests/cross_fw/examples/run_example.py +++ b/tests/cross_fw/examples/run_example.py @@ -192,6 +192,12 @@ def fp8_llm_quantization() -> dict[str, float]: return {"answers": list(result.values())} +def codebook_llm_compression() -> list[str]: + from examples.llm_compression.openvino.smollm2_360m_codebook.main import main as codebook_llm_compression_main + + return codebook_llm_compression_main() + + def llm_compression_qat_with_lora() -> float: from examples.llm_compression.torch.qat_with_lora.main import main as qat_with_lora_main diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index cf07a96d486..4a364751e83 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1121,7 +1121,7 @@ def test_codebook_weighs_range(data): codebook = data max_diff = 0.1 w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) - config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, user_data=data) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=data) _, scale, indexes = do_float_quantization(w, config, -1) uncompressed_data = codebook[indexes.data] * scale.data From 79f93680d146b1fc74898f48b226c8e60d77462a Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 10 Jun 2025 10:51:21 +0200 Subject: [PATCH 32/68] Added description for codebook parameter. --- nncf/quantization/advanced_parameters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 945ae9bd68c..3b521a6144a 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -395,6 +395,8 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] + :param codebook_params: Advanced parameters for codebook compression. + :type codebook_params: AdvancedCodebookParameters """ statistics_path: Optional[str] = None From 8c9b7b527ae19aecddfdcc08349b46201fc4cfa3 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 10 Jun 2025 12:05:37 +0200 Subject: [PATCH 33/68] Renamed global parameter for codebook. 
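The public dataclass is now exported as nncf.CodebookParameters rather than nncf.AdvancedCodebookParameters; the fields are unchanged. The custom-codebook example in this patch now reads (sketch):

    import openvino as ov
    import nncf

    # Same custom codebook as before, built with the renamed class.
    codebook_params = nncf.CodebookParameters(
        [-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8
    )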
--- .../openvino/smollm2_360m_codebook/main.py | 4 +--- nncf/__init__.py | 2 +- nncf/openvino/graph/node_utils.py | 1 + nncf/quantization/advanced_parameters.py | 8 ++++---- .../native/quantization/test_weights_compression.py | 4 ++-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 4eb3ef98612..68a0ea50980 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -88,9 +88,7 @@ def custom_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - codebook_params = nncf.AdvancedCodebookParameters( - [-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8 - ) + codebook_params = nncf.CodebookParameters([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8) model.model = nncf.compress_weights( model.model, diff --git a/nncf/__init__.py b/nncf/__init__.py index a0f9a45183f..14e1c38740f 100644 --- a/nncf/__init__.py +++ b/nncf/__init__.py @@ -52,13 +52,13 @@ ) from nncf.quantization.advanced_parameters import AdvancedAWQParameters as AdvancedAWQParameters from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters as AdvancedBiasCorrectionParameters -from nncf.quantization.advanced_parameters import AdvancedCodebookParameters as AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as AdvancedGPTQParameters from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as AdvancedLoraCorrectionParameters from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters as AdvancedSmoothQuantParameters +from nncf.quantization.advanced_parameters import CodebookParameters as CodebookParameters from nncf.quantization.advanced_parameters import OverflowFix as OverflowFix from nncf.scopes import IgnoredScope as IgnoredScope from nncf.scopes import Subgraph as Subgraph diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 75e5208ac43..f1c05fdb86e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -692,6 +692,7 @@ def create_ov_codebook_subgraph( ) -> op.Constant: """ Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. + :param codebook: Codebook tensor. :param indexes: Indexes tensor. :param dtype: Data type of the indexes. diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 3b521a6144a..0ae6762a475 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -361,7 +361,7 @@ class AdvancedLoraCorrectionParameters: @api() @dataclass -class AdvancedCodebookParameters: +class CodebookParameters: """ Contains advanced parameters for codebook compression algorithm. 
@@ -395,8 +395,8 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] - :param codebook_params: Advanced parameters for codebook compression. - :type codebook_params: AdvancedCodebookParameters + :param codebook_params: Parameters for codebook compression. + :type codebook_params: CodebookParameters """ statistics_path: Optional[str] = None @@ -408,7 +408,7 @@ class AdvancedCompressionParameters: lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) lora_adapter_rank: int = 256 backend_params: dict[str, Any] = field(default_factory=dict) - codebook_params: AdvancedCodebookParameters = field(default_factory=AdvancedCodebookParameters) + codebook_params: CodebookParameters = field(default_factory=CodebookParameters) @api() diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 4a364751e83..ee6f1bab7e4 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -36,11 +36,11 @@ from nncf.parameters import BackupMode from nncf.parameters import CompressionFormat from nncf.quantization import compress_weights -from nncf.quantization.advanced_parameters import AdvancedCodebookParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams +from nncf.quantization.advanced_parameters import CodebookParameters from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -1076,7 +1076,7 @@ def test_codebook(codebook, dst_type, n_layers, group_size): group_size=group_size, all_layers=True, advanced_parameters=AdvancedCompressionParameters( - codebook_params=AdvancedCodebookParameters(codebook=codebook, dst_type=dst_type) + codebook_params=CodebookParameters(codebook=codebook, dst_type=dst_type) ), ) names_codebook = [ From 9bd8c4b54f830c160a2bb34c2d2d26ebf9f1464d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 11 Jun 2025 13:23:23 +0200 Subject: [PATCH 34/68] Removed tensor type. --- .../openvino/smollm2_360m_codebook/main.py | 8 +++++--- nncf/openvino/graph/node_utils.py | 7 +++---- nncf/quantization/advanced_parameters.py | 1 - .../algorithms/weight_compression/openvino_backend.py | 2 -- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 68a0ea50980..ab6268a8b7f 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import openvino as ov +import numpy as np from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer @@ -88,7 +88,9 @@ def custom_codebook_example(model_id, output_dir): answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") - codebook_params = nncf.CodebookParameters([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], ov.Type.i8) + codebook_params = nncf.CodebookParameters( + np.array([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], dtype=np.int8) + ) model.model = nncf.compress_weights( model.model, @@ -109,7 +111,7 @@ def custom_codebook_example(model_id, output_dir): def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - output_dir = "smollm2_360m_compressed_codebook" + output_dir = "smollm2_360m_compressed_codebook_" res = default_codebook_example(model_id, output_dir) res += custom_codebook_example(model_id, output_dir + "_custom") diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index f1c05fdb86e..edccbcfff0e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -688,7 +688,7 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = def create_ov_codebook_subgraph( - codebook: Tensor, indexes: Tensor, dtype: ov.Type, codebook_dtype: ov.Type, name: Optional[str] = None + codebook: Tensor, indexes: Tensor, dtype: ov.Type, name: Optional[str] = None ) -> op.Constant: """ Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. @@ -696,12 +696,11 @@ def create_ov_codebook_subgraph( :param codebook: Codebook tensor. :param indexes: Indexes tensor. :param dtype: Data type of the indexes. - :param codebook_dtype: Data type of the codebook. :param name: Optional name of the constant. :return: OpenVINO subgraph. 
""" - codebook_const = opset.constant(codebook.data, dtype=codebook_dtype) - if codebook_dtype != ov.Type.f16: + codebook_const = opset.constant(codebook.data) + if codebook.dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 0ae6762a475..7279d0d4ab3 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -372,7 +372,6 @@ class CodebookParameters: """ codebook: Optional[list[Any]] = None - dst_type: Any = None @api() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 6a3a3bf1c56..96e8ffe4f62 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -255,12 +255,10 @@ def _create_compression_subgraph( if compression_config.is_codebook: n_quants = compressed_weight.tensor.max() compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) - codebook_params = advanced_parameters.codebook_params converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - codebook_dtype=codebook_params.dst_type if codebook_params.dst_type else ov.Type.f8e4m3, name=const_node_name, ) else: From 8f6eb33dda4bda0f5a24230d4ebce626e0c600ff Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Thu, 12 Jun 2025 14:43:22 +0200 Subject: [PATCH 35/68] 1) Applied suggestions. 2) Removed codebook gather from input nodes by name given in compression. 
--- nncf/openvino/graph/metatypes/openvino_metatypes.py | 10 +++------- nncf/parameters.py | 2 +- nncf/quantization/advanced_parameters.py | 2 +- .../algorithms/weight_compression/openvino_backend.py | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/nncf/openvino/graph/metatypes/openvino_metatypes.py b/nncf/openvino/graph/metatypes/openvino_metatypes.py index fe433739237..e2b95afc241 100644 --- a/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -819,13 +819,9 @@ def _is_embedding(node: ov.Node) -> bool: input_tensor = node.input_value(const_port_id) input_type = input_tensor.get_element_type().get_type_name() - try: - input_node = node.input(const_port_id).get_source_output().node - if input_node.get_type_info().name == "Convert": - input_type = input_node.input_value(0).get_element_type().get_type_name() - except AttributeError: - # Handle the case where input_node is not available - pass + if node.friendly_name.endswith("nncf_codebook"): + return False + if input_type in allowed_types_list: const_node = get_operation_const_op(node, const_port_id) if const_node is not None: diff --git a/nncf/parameters.py b/nncf/parameters.py index b8966210d75..55ef80046de 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -94,10 +94,10 @@ class CompressWeightsMode(StrEnum): INT4_SYM = "int4_sym" INT4_ASYM = "int4_asym" NF4 = "nf4" + CB4_F8E4M3 = "cb4_f8e4m3" INT8 = "int8" # Deprecated mode E2M1 = "e2m1" CODEBOOK = "codebook" - CB4_F8E4M3 = "cb4_f8e4m3" @api(canonical_alias="nncf.CompressionFormat") diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 0ae6762a475..93b02f2adec 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -363,7 +363,7 @@ class AdvancedLoraCorrectionParameters: @dataclass class CodebookParameters: """ - Contains advanced parameters for codebook compression algorithm. + Contains parameters for codebook compression algorithm. :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization. diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 6a3a3bf1c56..6175cf9857e 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -261,7 +261,7 @@ def _create_compression_subgraph( indexes=compressed_weight.tensor, dtype=compression_dtype, codebook_dtype=codebook_params.dst_type if codebook_params.dst_type else ov.Type.f8e4m3, - name=const_node_name, + name=const_node_name + "_nncf_codebook", ) else: compressed_const = create_ov_const_from_tensor( From 8c7f42866b768d49a7e5e693a49ba30265b5dab1 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 15:42:51 +0200 Subject: [PATCH 36/68] Removed data type from codebook parameters. 
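The codebook is now carried on WeightCompressionConfig as an nncf Tensor (codebook_values) and read back as numpy via get_numpy_codebook(), while the predefined CB4/NF4 tables move to a constants module. A sketch of the internal usage with the names from this patch (the codebook values below are hypothetical):

    import numpy as np

    from nncf.parameters import CompressWeightsMode
    from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
    from nncf.tensor import Tensor

    config = WeightCompressionConfig(
        mode=CompressWeightsMode.CODEBOOK,
        group_size=64,
        codebook_values=Tensor(np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)),
    )
    quantiles = config.get_numpy_codebook()  # plain numpy array consumed by weight_lowering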
--- .../weight_compression/algorithm.py | 5 +- .../algorithms/weight_compression/config.py | 3 + .../weight_compression/openvino_backend.py | 1 - .../weight_compression/weight_lowering.py | 75 ++----------------- 4 files changed, 13 insertions(+), 71 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 7070311f8e4..339c2612851 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -39,14 +39,15 @@ from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation -from nncf.quantization.algorithms.weight_compression.weight_lowering import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig from nncf.scopes import IgnoredScope from nncf.scopes import get_ignored_node_names_from_ignored_scope +from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType TModel = TypeVar("TModel") @@ -442,7 +443,7 @@ def _get_primary_config(self): group_size=self._group_size, codebook_values=CB4_QUANTILES if self._mode == CompressWeightsMode.CB4_F8E4M3 - else self._advanced_parameters.codebook_params.codebook, + else Tensor(self._advanced_parameters.codebook_params.codebook), ) def _set_weight_compression_config( diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 519be93dee4..d8d8c5e879e 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -64,6 +64,9 @@ def is_codebook(self): """ return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + def get_numpy_codebook(self): + return self.codebook_values.as_numpy_tensor().data + def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index fd2cb101e49..d464f894209 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -259,7 +259,6 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - name=const_node_name, name=const_node_name + "_nncf_codebook", ) else: diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index bd1031227ba..d328efb7c57 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -20,6 +20,8 @@ from nncf.parameters import CompressWeightsMode from 
nncf.quantization.algorithms.weight_compression.common import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns @@ -28,71 +30,6 @@ ReductionAxes = Union[int, tuple[int, ...]] -NF4_QUANTILES = np.array( - [ - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 0.7229568362236023, - 1.0, - ], - dtype=np.float32, -) - -CB4_QUANTILES = np.array( - [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.2812, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ], - dtype=np.float32, -) - -CENTER_OF_NF4_QUANTILES = np.array( - [ - -0.84809643, - -0.6106329, - -0.45999527, - -0.33967942, - -0.2346074, - -0.13791174, - -0.045525018, - 0.03979015, - 0.120255254, - 0.20352125, - 0.29201376, - 0.38931254, - 0.5016634, - 0.6427869, - 0.8614784, - ], - dtype=np.float32, -) - MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 @@ -221,7 +158,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: if config.is_codebook: - max_val = max(np.abs(np.array(config.codebook_values))) + max_val = max(np.abs(config.get_numpy_codebook())) scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: @@ -231,7 +168,9 @@ def do_float_quantization( else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) elif config.is_codebook: - compressed_weight, indexes = _calculate_codebook_quantized_weight(norm_weight, quantiles=config.codebook_values) + compressed_weight, indexes = _calculate_codebook_quantized_weight( + norm_weight, quantiles=config.get_numpy_codebook() + ) return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved @@ -388,7 +327,7 @@ def compress_weight( indexes, scale, None, - fns.from_numpy(np.array(config.codebook_values), backend=compressed_weight.backend), + config.codebook_values, ) else: return CompressedWeight(compressed_weight, scale) From db4399114ea8c696b5824dc9e6fc02990c134dd6 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 17:13:50 +0200 Subject: [PATCH 37/68] Removed circular imports. 
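reshape_weight_for_grouped_quantization() moves from weight_lowering to common.py so that optimized_functions and scale_estimation can import it without the circular dependency. For reference, a small sketch of the relocated helper (shapes follow its docstring; the tensor values are arbitrary):

    import numpy as np

    from nncf.quantization.algorithms.weight_compression.common import (
        reshape_weight_for_grouped_quantization,
    )
    from nncf.tensor import Tensor

    weight = Tensor(np.zeros((8, 256), dtype=np.float32))
    # [c_out, c_in] with group_size=128 -> [c_out, c_in // 128, 128];
    # the axis used for per-group statistics shifts from 1 to 2.
    reshaped, axis = reshape_weight_for_grouped_quantization(weight, reduction_axes=1, group_size=128)
    assert reshaped.shape == (8, 2, 128) and axis == 2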
--- .../openvino/smollm2_360m_codebook/main.py | 2 +- .../openvino/optimized_functions/functions.py | 2 +- .../algorithms/weight_compression/common.py | 38 ++++++++++++++++++- .../weight_compression/scale_estimation.py | 2 +- .../weight_compression/weight_lowering.py | 33 +--------------- .../quantization/test_weights_compression.py | 28 +++++++++----- .../test_compression_functions.py | 2 +- 7 files changed, 61 insertions(+), 46 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index ab6268a8b7f..eb2eaff0a25 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -111,7 +111,7 @@ def custom_codebook_example(model_id, output_dir): def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - output_dir = "smollm2_360m_compressed_codebook_" + output_dir = "smollm2_360m_compressed_codebook" res = default_codebook_example(model_id, output_dir) res += custom_codebook_example(model_id, output_dir + "_custom") diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 282a43f9d2b..bc34e6a023c 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -21,8 +21,8 @@ from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model from nncf.openvino.optimized_functions.models import get_integer_quantization_model from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorBackend from nncf.tensor import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/common.py index 8c1d60fd400..94128b615de 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/common.py @@ -10,10 +10,14 @@ # limitations under the License. from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Optional, Union +from nncf.errors import InvalidGroupSizeError +from nncf.errors import UnsupportedModelError from nncf.tensor import Tensor +ReductionAxes = Union[int, tuple[int, ...]] + @dataclass class Codebook: @@ -51,3 +55,35 @@ def is_codebook(self): :return: True if the compressed weight is a codebook, False otherwise. """ return self.codebook is not None and self.tensor is not None and self.scale is not None + + +def reshape_weight_for_grouped_quantization( + weight: Tensor, reduction_axes: ReductionAxes, group_size: int +) -> tuple[Tensor, int]: + """ + Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group + dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is + [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. + + :param weight: Weight array to compress. + :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). 
+ :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). + :return: reshaped weight and new reduction axis. + """ + assert group_size != -1 + if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: + reduction_axes = reduction_axes[0] + if not isinstance(reduction_axes, int): + msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." + raise UnsupportedModelError(msg) + channel_size = weight.shape[reduction_axes] + if channel_size % group_size != 0: + msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." + raise InvalidGroupSizeError(msg) + + num_groups_per_channel = channel_size // group_size + shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis + shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) + reshaped_weight = weight.reshape(shape) + reduction_axes += 1 + return reshaped_weight, reduction_axes diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 020a42c8f16..8b9b460df38 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -21,6 +21,7 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error @@ -28,7 +29,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d328efb7c57..9421adc8ff2 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -19,6 +19,7 @@ from nncf.common.utils.backend import is_openvino_available from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES @@ 
-34,38 +35,6 @@ MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 -def reshape_weight_for_grouped_quantization( - weight: Tensor, reduction_axes: ReductionAxes, group_size: int -) -> tuple[Tensor, int]: - """ - Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group - dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is - [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). - :return: reshaped weight and new reduction axis. - """ - assert group_size != -1 - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - raise nncf.UnsupportedModelError(msg) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." - raise nncf.InvalidGroupSizeError(msg) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - reshaped_weight = weight.reshape(shape) - reduction_axes += 1 - return reshaped_weight, reduction_axes - - def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 ) -> Tensor: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index ee6f1bab7e4..ecc9e92343f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,6 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import CodebookParameters +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -51,7 +52,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.scopes import IgnoredScope from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -1061,13 +1061,17 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): @pytest.mark.parametrize( ("codebook", "dst_type", "n_layers"), ( - ([i 
for i in range(-8, 8)], ov.Type.i4, 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.i8, 5), - ([i for i in range(-(2**6), 2**6)], ov.Type.f8e4m3, 5), + (np.array([i for i in range(-8, 8)], np.int8), ov.Type.i8, 5), + (np.array([i for i in range(-(2**6), 2**6)], np.int8), ov.Type.i8, 5), + ( + Tensor(np.array([i for i in range(-(2**6), 2**6)])).as_openvino_tensor().astype(TensorDataType.f8e4m3), + ov.Type.f8e4m3, + 5, + ), ), ) @pytest.mark.parametrize("group_size", (1, -1)) -def test_codebook(codebook, dst_type, n_layers, group_size): +def test_codebook(codebook, n_layers, dst_type, group_size): model = SequentialMatmulModel().ov_model compressed_model = compress_weights( model, @@ -1075,10 +1079,16 @@ def test_codebook(codebook, dst_type, n_layers, group_size): ratio=1.0, group_size=group_size, all_layers=True, - advanced_parameters=AdvancedCompressionParameters( - codebook_params=CodebookParameters(codebook=codebook, dst_type=dst_type) - ), + advanced_parameters=AdvancedCompressionParameters(codebook_params=CodebookParameters(codebook=codebook)), ) + names_codebook = [ + op.get_friendly_name() + for op in compressed_model.get_ordered_ops() + if op.get_friendly_name().endswith("nncf_codebook") + ] + + assert len(names_codebook) == n_layers + names_codebook = [ op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == dst_type ] @@ -1121,7 +1131,7 @@ def test_codebook_weighs_range(data): codebook = data max_diff = 0.1 w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) - config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=data) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=Tensor(data)) _, scale, indexes = do_float_quantization(w, config, -1) uncompressed_data = codebook[indexes.data] * scale.data diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 67a9fcef14d..41148cbc2c2 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -28,6 +28,7 @@ from nncf.common.utils.caching import cache_results from nncf.openvino.cpu_info import is_arm_cpu from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor +from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization @@ -35,7 +36,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend From 7c9429ea9cb18f6ae913e416f3e9e964c834298f Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 17:28:19 +0200 Subject: [PATCH 38/68] Added file with constants. 
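The NF4 quantile grid, the default CB4 codebook (stored as an f8e4m3 OpenVINO-backed tensor), and the NF4 bucket centers are collected into a dedicated constants.py module under weight_compression, so other modules can use the tables without pulling in weight_lowering.py.

For orientation, a rough NumPy-only sketch of how such a quantile/codebook table is applied; this is not the NNCF implementation, and the grid below abbreviates the NF4 table:

    import numpy as np

    # norm_weight is assumed to be already scaled into the grid's range.
    # Each value is snapped to the nearest grid entry; depending on the mode,
    # either that value or its index in the table is what gets stored.
    quantiles = np.array(
        [-1.0, -0.6961928, -0.52507305, -0.39491749, 0.0, 0.44070983, 0.7229568, 1.0],
        dtype=np.float32,
    )
    norm_weight = np.array([[-0.8, 0.1], [0.6, -0.3]], dtype=np.float32)
    indexes = np.argmin(np.abs(norm_weight[..., None] - quantiles), axis=-1)
    quantized = quantiles[indexes]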
--- nncf/quantization/advanced_parameters.py | 8 +- .../weight_compression/constants.py | 86 +++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 nncf/quantization/algorithms/weight_compression/constants.py diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index d3a9446ea28..fcf04bf01a5 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -29,6 +29,8 @@ from nncf.quantization.range_estimator import RangeEstimatorParameters from nncf.quantization.range_estimator import StatisticsType +TTensor = Any + @api(canonical_alias="nncf.OverflowFix") class OverflowFix(StrEnum): @@ -366,12 +368,12 @@ class CodebookParameters: Contains parameters for codebook compression algorithm. :param codebook: The codebook (LUT) for the weight compression. - Applicable for vector quantization. - :type codebook: list[Any] + Applicable for vector quantization. Must be a numpy array, ov Tensor, or torch Tensor. + :type codebook: TTensor :param dts_type: The type of the codebook. """ - codebook: Optional[list[Any]] = None + codebook: Optional[TTensor] = None @api() diff --git a/nncf/quantization/algorithms/weight_compression/constants.py b/nncf/quantization/algorithms/weight_compression/constants.py new file mode 100644 index 00000000000..fcbe91bfb53 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/constants.py @@ -0,0 +1,86 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType + +NF4_QUANTILES = np.array( + [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ], + dtype=np.float32, +) + +CB4_QUANTILES = ( + Tensor( + np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, + ) + ) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3) +) + +CENTER_OF_NF4_QUANTILES = np.array( + [ + -0.84809643, + -0.6106329, + -0.45999527, + -0.33967942, + -0.2346074, + -0.13791174, + -0.045525018, + 0.03979015, + 0.120255254, + 0.20352125, + 0.29201376, + 0.38931254, + 0.5016634, + 0.6427869, + 0.8614784, + ], + dtype=np.float32, +) From b90ccf378119dca2d88127ee161667f70ad2c4ee Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 24 Jun 2025 17:45:25 +0200 Subject: [PATCH 39/68] Moved default codebook initialization to function. 
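The default CB4 codebook is no longer built at module import time as an OpenVINO-backed f8e4m3 tensor; constants.py now exposes get_cb4_quantiles(), which constructs that tensor on demand, presumably so that importing the constants module does not immediately require the OpenVINO tensor backend. The call site in _get_primary_config switches from the CB4_QUANTILES constant to the function accordingly.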
--- .../weight_compression/algorithm.py | 4 +- .../weight_compression/constants.py | 53 ++++++++++--------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 339c2612851..376894fde2e 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -39,7 +39,7 @@ from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import get_cb4_quantiles from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -441,7 +441,7 @@ def _get_primary_config(self): return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - codebook_values=CB4_QUANTILES + codebook_values=get_cb4_quantiles() if self._mode == CompressWeightsMode.CB4_F8E4M3 else Tensor(self._advanced_parameters.codebook_params.codebook), ) diff --git a/nncf/quantization/algorithms/weight_compression/constants.py b/nncf/quantization/algorithms/weight_compression/constants.py index fcbe91bfb53..4465549a706 100644 --- a/nncf/quantization/algorithms/weight_compression/constants.py +++ b/nncf/quantization/algorithms/weight_compression/constants.py @@ -36,33 +36,34 @@ dtype=np.float32, ) -CB4_QUANTILES = ( - Tensor( - np.array( - [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.2812, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ], - dtype=np.float32, - ) + +def get_cb4_quantiles() -> Tensor: + """ + Returns the quantiles for the CB4 codebook. + """ + CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.2812, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, ) - .as_openvino_tensor() - .astype(TensorDataType.f8e4m3) -) + return Tensor(CB4_QUANTILES).as_openvino_tensor().astype(TensorDataType.f8e4m3) + CENTER_OF_NF4_QUANTILES = np.array( [ From 8a06f88f002dd811c72cd9dcc3f6efe1f5fccd4e Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 25 Jun 2025 10:24:54 +0200 Subject: [PATCH 40/68] Added test for comparison of compressed weight values for CB4_F8E4M3 type. --- nncf/openvino/graph/node_utils.py | 4 +- .../weight_compression/openvino_backend.py | 2 +- ...erModel_compressed_weights_cb4_f8e4m3.json | 178 ++++++++++++++++++ .../quantization/test_weights_compression.py | 32 ++++ 4 files changed, 213 insertions(+), 3 deletions(-) create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index edccbcfff0e..7622eaf2c52 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -699,7 +699,7 @@ def create_ov_codebook_subgraph( :param name: Optional name of the constant. :return: OpenVINO subgraph. 
""" - codebook_const = opset.constant(codebook.data) + codebook_const = opset.constant(codebook.data, name=name) if codebook.dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) @@ -707,5 +707,5 @@ def create_ov_codebook_subgraph( if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) - const = opset.gather(codebook_const, codebook_indexes, 0, name=name) + const = opset.gather(codebook_const, codebook_indexes, 0, name=name + "_nncf_codebook") return const diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index d464f894209..96e8ffe4f62 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -259,7 +259,7 @@ def _create_compression_subgraph( codebook=compressed_weight.codebook, indexes=compressed_weight.tensor, dtype=compression_dtype, - name=const_node_name + "_nncf_codebook", + name=const_node_name, ) else: compressed_const = create_ov_const_from_tensor( diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json new file mode 100644 index 00000000000..b8712bf3839 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json @@ -0,0 +1,178 @@ +{ + "matmul_2_data": { + "scale": [ + [ + [ + 0.2275390625 + ] + ], + [ + [ + 0.269287109375 + ] + ], + [ + [ + 0.272705078125 + ] + ], + [ + [ + 0.284423828125 + ] + ], + [ + [ + 0.266357421875 + ] + ], + [ + [ + 0.2802734375 + ] + ] + ] + }, + "matmul_1_data": { + "compressed_weight": [ + [ + 119, + 168, + 11, + 49, + 255, + 255 + ], + [ + 255, + 159, + 255, + 255, + 255, + 255 + ], + [ + 255, + 169, + 59, + 255, + 228, + 135 + ], + [ + 202, + 255, + 255, + 149, + 238, + 134 + ], + [ + 229, + 130, + 151, + 255, + 87, + 240 + ], + [ + 26, + 255, + 245, + 75, + 255, + 18 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0025196075439453125 + ], + [ + 0.0024051666259765625 + ], + [ + 0.002300262451171875 + ], + [ + 0.0024013519287109375 + ], + [ + 0.0025997161865234375 + ], + [ + 0.003208160400390625 + ] + ] + }, + "gather_2_data": { + "compressed_weight": [ + [ + 181, + 77, + 12, + 5, + 231, + 255 + ], + [ + 166, + 200, + 149, + 255, + 223, + 1 + ], + [ + 255, + 10, + 224, + 54, + 255, + 166 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0035152435302734375 + ], + [ + 0.0036563873291015625 + ], + [ + 0.003253936767578125 + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index ecc9e92343f..5b9aaf6a525 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -231,6 +231,37 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): } +def check_cb4_f8e4m3_grouped(op: ov.Node, group_size: int = 7): + assert op.get_element_type() == ov.Type.f8e4m3 + + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + + gather_node = get_next_node(convert_node) + assert 
gather_node.get_type_name() == "Gather" + + weight_shape = gather_node.shape + # NOTE: get_const_value_as_numpy_tensor doesn't work for 4-bit types + assert list(weight_shape)[-1] == group_size + reduced_weight_shape = list(weight_shape) + reduced_weight_shape[-1] = 1 + + mul_node = get_next_node(gather_node) + assert mul_node.get_type_name() == "Multiply" + scale_node = mul_node.input_value(1).get_node() + assert list(scale_node.shape) == reduced_weight_shape + + reshape_node = get_next_node(mul_node) + assert reshape_node.get_type_name() == "Reshape" + + convert_node = get_next_node(reshape_node) + assert convert_node.get_type_name() == "Convert" + + return { + "scale": get_const_value_as_numpy_tensor(scale_node), + } + + def check_int4_sym_grouped(op: ov.Node): return check_int4_grouped(op, mode=CompressWeightsMode.INT4_SYM) @@ -258,6 +289,7 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): (CompressWeightsMode.INT4_SYM, 7, get_mixed_mapping(check_int4_sym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.INT4_ASYM, 7, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.NF4, 7, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_cb4_f8e4m3_grouped, TEST_MODELS[IntegerModel])), ), ) def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): From d6e4a76640940562ef8b23b9e71f04a358d78df9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 25 Jun 2025 11:24:11 +0200 Subject: [PATCH 41/68] Fixed test. --- .../openvino/native/quantization/test_weights_compression.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5b9aaf6a525..bc80ac2d3c6 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1082,9 +1082,7 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): sensitivity_metric=mode, ) names_codebook = { - op.get_friendly_name() - for op in compressed_model.get_ordered_ops() - if op.get_element_type() == ov.Type.f8e4m3 and op.get_friendly_name().startswith("Const") + op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.f8e4m3 } assert ref_ids == len(names_codebook) From b231848cc36902e49aa73ce1a1fa3789eb7ec003 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 25 Jun 2025 11:27:39 +0200 Subject: [PATCH 42/68] Fixed fp8 value. --- nncf/quantization/algorithms/weight_compression/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/constants.py b/nncf/quantization/algorithms/weight_compression/constants.py index 4465549a706..726ba841e00 100644 --- a/nncf/quantization/algorithms/weight_compression/constants.py +++ b/nncf/quantization/algorithms/weight_compression/constants.py @@ -51,7 +51,7 @@ def get_cb4_quantiles() -> Tensor: -0.625, -0.3125, 0.0, - 0.2812, + 0.28125, 0.5625, 0.875, 1.125, From de7b709d32173a8eb6ccbf5784dd150ba157a330 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 10:46:09 +0200 Subject: [PATCH 43/68] Test for codebook graph. 
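The new JSON files hold reference scales and codebook indexes for IntegerModel compressed with four codebook/index dtype pairs: f16 codebook with u4 indexes, f8e4m3 with u8, i8 with u8, and u8 with u4. In each file, "matmul_2_data_nncf_codebook_idxs" stores the gathered indexes and "matmul_2_data" the per-group scales; they serve as references for the dtype-parametrized codebook test introduced in the next commit.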
--- .../IntegerModel_codebook_f16_u4.json | 61 ++++++++++ .../IntegerModel_codebook_f8e4m3_u8.json | 106 ++++++++++++++++++ .../IntegerModel_codebook_i8_u8.json | 106 ++++++++++++++++++ .../IntegerModel_codebook_u8_u4.json | 61 ++++++++++ 4 files changed, 334 insertions(+) create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json create mode 100644 tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json new file mode 100644 index 00000000000..578b2cc53d3 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json @@ -0,0 +1,61 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + 171, + 253, + 154, + 172, + 217, + 235, + 250, + 155, + 253, + 252, + 188, + 253, + 207, + 206, + 253, + 236, + 254, + 233, + 255, + 248, + 255 + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.99560546875 + ] + ], + [ + [ + 1.177734375 + ] + ], + [ + [ + 1.193359375 + ] + ], + [ + [ + 1.244140625 + ] + ], + [ + [ + 1.1650390625 + ] + ], + [ + [ + 1.2265625 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json new file mode 100644 index 00000000000..abf99c05ca4 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json @@ -0,0 +1,106 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + [ + [ + 14, + 12, + 16, + 20, + 13, + 11, + 15 + ] + ], + [ + [ + 12, + 11, + 16, + 13, + 17, + 12, + 20 + ] + ], + [ + [ + 14, + 11, + 17, + 20, + 15, + 20, + 15 + ] + ], + [ + [ + 14, + 16, + 20, + 19, + 15, + 18, + 15 + ] + ], + [ + [ + 16, + 18, + 14, + 18, + 18, + 20, + 11 + ] + ], + [ + [ + 17, + 19, + 20, + 10, + 19, + 20, + 20 + ] + ] + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.2275390625 + ] + ], + [ + [ + 0.269287109375 + ] + ], + [ + [ + 0.272705078125 + ] + ], + [ + [ + 0.284423828125 + ] + ], + [ + [ + 0.266357421875 + ] + ], + [ + [ + 0.2802734375 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json new file mode 100644 index 00000000000..acf5ad93048 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json @@ -0,0 +1,106 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + [ + [ + 14, + 12, + 16, + 20, + 13, + 11, + 15 + ] + ], + [ + [ + 12, + 11, + 16, + 13, + 17, + 12, + 20 + ] + ], + [ + [ + 14, + 11, + 17, + 20, + 15, + 20, + 15 + ] + ], + [ + [ + 14, + 16, + 20, + 20, + 15, + 18, + 15 + ] + ], + [ + [ + 16, + 18, + 14, + 18, + 18, + 20, + 11 + ] + ], + [ + [ + 17, + 19, + 20, + 10, + 19, + 20, + 20 + ] + ] + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.07965087890625 + ] + ], + [ + [ + 0.09423828125 + ] + ], + [ + [ + 0.095458984375 + ] + ], + [ + [ + 0.0994873046875 + ] + ], + [ + [ + 0.09320068359375 + ] + 
], + [ + [ + 0.09814453125 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json new file mode 100644 index 00000000000..8642e52a868 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json @@ -0,0 +1,61 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + 54, + 248, + 20, + 56, + 145, + 181, + 243, + 38, + 250, + 247, + 104, + 249, + 126, + 123, + 217, + 199, + 251, + 178, + 254, + 208, + 255 + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.0531005859375 + ] + ], + [ + [ + 0.06280517578125 + ] + ], + [ + [ + 0.06365966796875 + ] + ], + [ + [ + 0.06634521484375 + ] + ], + [ + [ + 0.0621337890625 + ] + ], + [ + [ + 0.0654296875 + ] + ] + ] + } +} \ No newline at end of file From 4737adea5556823425fc9a008463532a21413e95 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 11:47:28 +0200 Subject: [PATCH 44/68] Changed name of file for more appropriate. --- nncf/openvino/graph/node_utils.py | 2 +- .../{common.py => group_quantization.py} | 41 +--------- .../weight_compression/parameters.py | 53 +++++++++++++ .../weight_compression/weight_lowering.py | 4 +- .../quantization/test_weights_compression.py | 74 ++++++++++++++++++- .../test_compression_functions.py | 2 +- 6 files changed, 129 insertions(+), 47 deletions(-) rename nncf/quantization/algorithms/weight_compression/{common.py => group_quantization.py} (64%) create mode 100644 nncf/quantization/algorithms/weight_compression/parameters.py diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 7622eaf2c52..3892876e51d 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -703,7 +703,7 @@ def create_ov_codebook_subgraph( if codebook.dtype != ov.Type.f16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) - codebook_indexes = opset.constant(indexes.data, dtype=dtype) + codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=name + "_nncf_codebook_idxs") if dtype == ov.Type.u4: codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) diff --git a/nncf/quantization/algorithms/weight_compression/common.py b/nncf/quantization/algorithms/weight_compression/group_quantization.py similarity index 64% rename from nncf/quantization/algorithms/weight_compression/common.py rename to nncf/quantization/algorithms/weight_compression/group_quantization.py index 94128b615de..215dd217ea3 100644 --- a/nncf/quantization/algorithms/weight_compression/common.py +++ b/nncf/quantization/algorithms/weight_compression/group_quantization.py @@ -9,8 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Union from nncf.errors import InvalidGroupSizeError from nncf.errors import UnsupportedModelError @@ -19,44 +18,6 @@ ReductionAxes = Union[int, tuple[int, ...]] -@dataclass -class Codebook: - """ - Codebook parameters for weight compression. - :param codebook: The initial codebook for compression. - :param dst_type: The destination type for the codebook. - """ - - codebook: Optional[Tensor] = None - dst_type: Optional[Any] = None - - -@dataclass -class CompressedWeight: - """ - Compressed weight and decompression parameters. 
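This commit splits the former common.py: the group-wise reshaping helper moves to group_quantization.py, while the Codebook and CompressedWeight dataclasses move to parameters.py. It also names the codebook index constant "<weight name>_nncf_codebook_idxs" in the OpenVINO subgraph and adds a test that compresses IntegerModel with custom codebooks of several dtypes against the reference JSON files from the previous commit.

A hedged usage sketch of the public API path the new test exercises; the model path and variable names below are illustrative, and the codebook here is an arbitrary f16 grid rather than an NNCF default:

    import numpy as np
    import openvino as ov
    import nncf

    ov_model = ov.Core().read_model("model.xml")  # illustrative path, loaded elsewhere in practice
    codebook = np.array([0.1 * i for i in range(-8, 8)], dtype=np.float16)
    compressed = nncf.compress_weights(
        ov_model,
        mode=nncf.CompressWeightsMode.CODEBOOK,
        group_size=-1,  # per-channel scales; a positive value must divide the channel size
        advanced_parameters=nncf.AdvancedCompressionParameters(
            codebook_params=nncf.CodebookParameters(codebook)
        ),
    )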
- - :param tensor: The tensor with compressed weight. - :param scale: The decompression scale, in practice it is dequantization scale for the quantization. - :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 - in the non-compression realm. Applicable for INT quantization. - :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization - """ - - tensor: Optional[Tensor] = None - scale: Optional[Tensor] = None - zero_point: Optional[Tensor] = None - codebook: Optional[Codebook] = None - - def is_codebook(self): - """ - Check if the compressed weight is a codebook. - - :return: True if the compressed weight is a codebook, False otherwise. - """ - return self.codebook is not None and self.tensor is not None and self.scale is not None - - def reshape_weight_for_grouped_quantization( weight: Tensor, reduction_axes: ReductionAxes, group_size: int ) -> tuple[Tensor, int]: diff --git a/nncf/quantization/algorithms/weight_compression/parameters.py b/nncf/quantization/algorithms/weight_compression/parameters.py new file mode 100644 index 00000000000..8c1d60fd400 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/parameters.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Optional + +from nncf.tensor import Tensor + + +@dataclass +class Codebook: + """ + Codebook parameters for weight compression. + :param codebook: The initial codebook for compression. + :param dst_type: The destination type for the codebook. + """ + + codebook: Optional[Tensor] = None + dst_type: Optional[Any] = None + + +@dataclass +class CompressedWeight: + """ + Compressed weight and decompression parameters. + + :param tensor: The tensor with compressed weight. + :param scale: The decompression scale, in practice it is dequantization scale for the quantization. + :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 + in the non-compression realm. Applicable for INT quantization. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization + """ + + tensor: Optional[Tensor] = None + scale: Optional[Tensor] = None + zero_point: Optional[Tensor] = None + codebook: Optional[Codebook] = None + + def is_codebook(self): + """ + Check if the compressed weight is a codebook. + + :return: True if the compressed weight is a codebook, False otherwise. 
+ """ + return self.codebook is not None and self.tensor is not None and self.scale is not None diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9421adc8ff2..8a5370b7716 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -18,11 +18,11 @@ from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index bc80ac2d3c6..7b92e71964f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -231,8 +231,8 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): } -def check_cb4_f8e4m3_grouped(op: ov.Node, group_size: int = 7): - assert op.get_element_type() == ov.Type.f8e4m3 +def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m3): + assert op.get_element_type() == dtype convert_node = get_next_node(op) assert convert_node.get_type_name() == "Convert" @@ -262,6 +262,23 @@ def check_cb4_f8e4m3_grouped(op: ov.Node, group_size: int = 7): } +def check_codebook_indexes(op: ov.Node, dtype=ov.Type.u4): + assert op.get_element_type() == dtype + + if dtype == ov.Type.u4: + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + else: + convert_node = op + + gather_node = get_next_node(convert_node) + assert gather_node.get_type_name() == "Gather" + + return { + "indexes": get_const_value_as_numpy_tensor(op), + } + + def check_int4_sym_grouped(op: ov.Node): return check_int4_grouped(op, mode=CompressWeightsMode.INT4_SYM) @@ -289,7 +306,7 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): (CompressWeightsMode.INT4_SYM, 7, get_mixed_mapping(check_int4_sym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.INT4_ASYM, 7, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.NF4, 7, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])), - (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_cb4_f8e4m3_grouped, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_codebook_grouped, TEST_MODELS[IntegerModel])), ), ) def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): @@ -313,6 +330,57 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): 
compare_stats(ref_stats, actual_stats) +@pytest.mark.parametrize( + "codebook, codebook_dtype, index_dtype, name", + [ + (np.array([i for i in range(16)], np.uint8), ov.Type.u8, ov.Type.u4, "u8_u4"), + (np.array([0.1 * i for i in range(-8, 8)], np.float16), ov.Type.f16, ov.Type.u4, "f16_u4"), + ( + Tensor(np.array([0.35 * i for i in range(-10, 11)], np.float16)) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3), + ov.Type.f8e4m3, + ov.Type.u8, + "f8e4m3_u8", + ), + ( + Tensor(np.array([i for i in range(-10, 11)], np.int8)).as_openvino_tensor().astype(TensorDataType.int8), + ov.Type.i8, + ov.Type.u8, + "i8_u8", + ), + ], +) +def test_compression_with_сodebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): + model = IntegerModel().ov_model + codebook_params = nncf.CodebookParameters(codebook) + + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + group_size=7, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + ) + actual_stats = {} + for op in compressed_model.get_ops(): + op_name = op.get_friendly_name() + if op.get_type_name() == "Constant": + if op_name == "matmul_2_data": + actual_stats[op_name] = check_codebook_grouped(op, group_size=7, dtype=codebook_dtype) + elif op_name == "matmul_2_data_nncf_codebook_idxs": + actual_stats[op_name] = check_codebook_indexes(op, dtype=index_dtype) + + ref_stats_path = get_actual_reference_for_current_openvino( + REFERENCE_SCALES_DIR / f"IntegerModel_codebook_{name}.json" + ) + + if os.getenv("NNCF_TEST_REGEN_DOT") is not None: + dump_to_json(ref_stats_path, actual_stats) + + ref_stats = load_json(ref_stats_path) + compare_stats(ref_stats, actual_stats) + + @pytest.mark.parametrize("metric", DATA_BASED_SENSITIVITY_METRICS) def test_gather_in_4_bit_if_all_layers_with_data(metric): dim1 = 2 # sequence length dimension diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 41148cbc2c2..4df1befd37b 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -28,8 +28,8 @@ from nncf.common.utils.caching import cache_results from nncf.openvino.cpu_info import is_arm_cpu from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization From 072a62a3f03cd845594db59fe6effff342e63802 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 11:48:14 +0200 Subject: [PATCH 45/68] Changed name of file for more appropriate. 
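No functional change here: the commit only rewires imports after the split, so CompressedWeight (and Codebook) are taken from parameters.py and reshape_weight_for_grouped_quantization from group_quantization.py across the OpenVINO optimized functions, the backend implementations, GPTQ, scale estimation, and the tests.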
--- nncf/openvino/optimized_functions/functions.py | 2 +- nncf/quantization/algorithms/weight_compression/backend.py | 2 +- nncf/quantization/algorithms/weight_compression/gptq.py | 2 +- .../algorithms/weight_compression/onnx_backend.py | 2 +- .../algorithms/weight_compression/openvino_backend.py | 2 +- .../algorithms/weight_compression/scale_estimation.py | 4 ++-- .../algorithms/weight_compression/torch_backend.py | 2 +- .../algorithms/weight_compression/torch_fx_backend.py | 2 +- .../openvino/native/quantization/test_weights_compression.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index bc34e6a023c..217b4444c8c 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -21,7 +21,7 @@ from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model from nncf.openvino.optimized_functions.models import get_integer_quantization_model from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorBackend diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index cee8763995b..49b4ce487e9 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -26,7 +26,7 @@ from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic from nncf.parameters import CompressionFormat from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.tensor import Tensor diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 767fe2f5127..125e0c84a6f 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -23,7 +23,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py 
b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index c0a2ab73849..c3eeb7e49d6 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -47,7 +47,7 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 96e8ffe4f62..e5d5490b8f5 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -54,7 +54,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 8b9b460df38..85d358d23b2 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,8 +20,8 @@ from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index f4254bfb0c5..dd5c4aec80b 100644 --- 
a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -45,7 +45,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 2172a6a5e37..b76179d1dd6 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -40,7 +40,7 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.common import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 7b92e71964f..b48b0ce15cf 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,7 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import CodebookParameters -from nncf.quantization.algorithms.weight_compression.common import reshape_weight_for_grouped_quantization +from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA From ede93422aa2d0aee22e3489eb30fba8b1036c257 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 12:11:09 +0200 Subject: [PATCH 46/68] Return reshape_weight_for_grouped_quantization to weight_lowering. 
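This partially reverts the earlier split: group_quantization.py is deleted and reshape_weight_for_grouped_quantization lives in weight_lowering.py again, with the OpenVINO optimized functions importing it from there, while the CompressedWeight and Codebook dataclasses remain in parameters.py.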
--- .../openvino/optimized_functions/functions.py | 2 +- .../algorithms/weight_compression/backend.py | 2 +- .../algorithms/weight_compression/gptq.py | 2 +- .../weight_compression/group_quantization.py | 50 ------------------- .../weight_compression/onnx_backend.py | 2 +- .../weight_compression/openvino_backend.py | 2 +- .../weight_compression/scale_estimation.py | 4 +- .../weight_compression/torch_backend.py | 2 +- .../weight_compression/torch_fx_backend.py | 2 +- .../weight_compression/weight_lowering.py | 36 ++++++++++++- .../quantization/test_weights_compression.py | 2 +- .../test_compression_functions.py | 2 +- 12 files changed, 45 insertions(+), 63 deletions(-) delete mode 100644 nncf/quantization/algorithms/weight_compression/group_quantization.py diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 217b4444c8c..282a43f9d2b 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -21,8 +21,8 @@ from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model from nncf.openvino.optimized_functions.models import get_integer_quantization_model from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorBackend from nncf.tensor import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 49b4ce487e9..92c6cb80a5d 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -26,9 +26,9 @@ from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic from nncf.parameters import CompressionFormat from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.tensor import Tensor from nncf.tensor import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 125e0c84a6f..814ec4a2a6b 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -23,9 +23,9 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from 
nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params diff --git a/nncf/quantization/algorithms/weight_compression/group_quantization.py b/nncf/quantization/algorithms/weight_compression/group_quantization.py deleted file mode 100644 index 215dd217ea3..00000000000 --- a/nncf/quantization/algorithms/weight_compression/group_quantization.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union - -from nncf.errors import InvalidGroupSizeError -from nncf.errors import UnsupportedModelError -from nncf.tensor import Tensor - -ReductionAxes = Union[int, tuple[int, ...]] - - -def reshape_weight_for_grouped_quantization( - weight: Tensor, reduction_axes: ReductionAxes, group_size: int -) -> tuple[Tensor, int]: - """ - Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group - dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is - [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). - :return: reshaped weight and new reduction axis. - """ - assert group_size != -1 - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - raise UnsupportedModelError(msg) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." 
- raise InvalidGroupSizeError(msg) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - reshaped_weight = weight.reshape(shape) - reduction_axes += 1 - return reshaped_weight, reduction_axes diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index c3eeb7e49d6..07347cd3abe 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -47,9 +47,9 @@ from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index e5d5490b8f5..2d158daf5e3 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -54,11 +54,11 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 85d358d23b2..d7c63c3d1e8 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -20,15 +20,15 @@ from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from 
nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index dd5c4aec80b..e8efeb302e3 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -45,10 +45,10 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index b76179d1dd6..d00b0ae5b4c 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -40,10 +40,10 @@ from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from 
nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.torch_backend import PTAWQAlgoAlgoBackend from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8a5370b7716..8b00c98a755 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -17,11 +17,12 @@ from nncf.common.logging.logger import nncf_logger from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available +from nncf.errors import InvalidGroupSizeError +from nncf.errors import UnsupportedModelError from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor @@ -31,10 +32,41 @@ ReductionAxes = Union[int, tuple[int, ...]] - MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 +def reshape_weight_for_grouped_quantization( + weight: Tensor, reduction_axes: ReductionAxes, group_size: int +) -> tuple[Tensor, int]: + """ + Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group + dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is + [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. + + :param weight: Weight array to compress. + :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). + :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). + :return: reshaped weight and new reduction axis. + """ + assert group_size != -1 + if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: + reduction_axes = reduction_axes[0] + if not isinstance(reduction_axes, int): + msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." + raise UnsupportedModelError(msg) + channel_size = weight.shape[reduction_axes] + if channel_size % group_size != 0: + msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." 
+ raise InvalidGroupSizeError(msg) + + num_groups_per_channel = channel_size // group_size + shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis + shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) + reshaped_weight = weight.reshape(shape) + reduction_axes += 1 + return reshaped_weight, reduction_axes + + def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 ) -> Tensor: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b48b0ce15cf..182a267876b 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,7 +41,6 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import CodebookParameters -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -52,6 +51,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.scopes import IgnoredScope from nncf.tensor import Tensor from nncf.tensor import TensorDataType diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 4df1befd37b..67a9fcef14d 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -29,13 +29,13 @@ from nncf.openvino.cpu_info import is_arm_cpu from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.group_quantization import reshape_weight_for_grouped_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor 
import TensorDataType from nncf.tensor.definitions import TensorBackend From 44712901f3a2a7cf475a6046c1b9598c7025515b Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 15:05:56 +0200 Subject: [PATCH 47/68] Changed no ascii chracter. --- tests/openvino/native/quantization/test_weights_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 182a267876b..85d14d70a89 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -351,7 +351,7 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): ), ], ) -def test_compression_with_сodebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): +def test_compression_with_codebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): model = IntegerModel().ov_model codebook_params = nncf.CodebookParameters(codebook) From 3f9f833d7be464e6d52e61b074808ba9b4a800ec Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Fri, 27 Jun 2025 15:56:15 +0200 Subject: [PATCH 48/68] Removed extra convert from fp16 to fp16. --- nncf/openvino/graph/node_utils.py | 3 ++- .../native/quantization/test_weights_compression.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 3892876e51d..5faec5e904e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -44,6 +44,7 @@ from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype from nncf.tensor import Tensor from nncf.tensor import TensorBackend +from nncf.tensor import TensorDataType InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node] @@ -700,7 +701,7 @@ def create_ov_codebook_subgraph( :return: OpenVINO subgraph. """ codebook_const = opset.constant(codebook.data, name=name) - if codebook.dtype != ov.Type.f16: + if codebook.dtype != TensorDataType.float16: codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=name + "_nncf_codebook_idxs") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 85d14d70a89..b3bb42f056a 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -234,8 +234,11 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m3): assert op.get_element_type() == dtype - convert_node = get_next_node(op) - assert convert_node.get_type_name() == "Convert" + if dtype == ov.Type.f16: + convert_node = op + else: + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" gather_node = get_next_node(convert_node) assert gather_node.get_type_name() == "Gather" From 67faaa7c77b878143b4abcac029321162f89eb74 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 30 Jun 2025 10:06:05 +0200 Subject: [PATCH 49/68] Added test and exception which checks what codebook is sorted, not empty 1d array. 
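In other words, a user-provided codebook is rejected unless it is a non-empty, strictly increasing 1D array with at least two unique elements. A small sketch of inputs that pass and fail the new check (the failing cases mirror the parametrized test added below, for which nncf.ValidationError is raised):

    import numpy as np

    valid = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)       # 1D, sorted, unique
    assert valid.ndim == 1 and valid.size >= 2 and (valid[:-1] < valid[1:]).all()

    invalid = [
        np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32),                 # duplicate values
        np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32),                 # unsorted
        np.array([[-1, 0, 1, 2, 3], [-1, 0, 1, 2, 3]], dtype=np.float32), # not a 1D array
        np.array([5], dtype=np.float32),                                  # fewer than two elements
    ]
    for codebook in invalid:
        assert not (codebook.ndim == 1 and codebook.size >= 2 and (codebook[:-1] < codebook[1:]).all())
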
--- .../weight_compression/algorithm.py | 16 +++++++++++++ .../quantization/test_weights_compression.py | 23 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 376894fde2e..2c85a3eba34 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -181,6 +181,22 @@ def check_user_compression_configuration( ] ) ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank] + + if advanced_parameters.codebook_params.codebook is not None: + codebook = Tensor(advanced_parameters.codebook_params.codebook).as_numpy_tensor().data + msg = None + if codebook.ndim != 1: + msg = "The codebook must be a 1D array, but a multi-dimensional array is given." + if codebook.size < 2: + msg = ( + "The codebook must contain at least two unique elements," + "but a single-element or empty array is given." + ) + if (codebook[:-1] >= codebook[1:]).any(): + msg = "The codebook must be a sorted 1D array with unique elements, but an unsorted array is given." + if msg: + raise nncf.ValidationError(msg) + for size in values_to_check: if size <= 0: msg = f"The subset_size value should be positive, but subset_size={size} is given." diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b3bb42f056a..5da9944a03e 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1725,6 +1725,29 @@ def test_nf4_quantization_mid_quant(weight, scale): np.testing.assert_allclose(nf4_quant.data, ref_nf4_quant.data, atol=0, rtol=0) +@pytest.mark.parametrize( + "codebook_values", + [ + np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32), + np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32), + np.array([[-1, 0, 1, 2, 3], [-1, 0, 1, 2, 3]], dtype=np.float32), + np.array([5], dtype=np.float32), + ], +) +def test_codebook_is_correct_array(codebook_values): + codebook_params = nncf.CodebookParameters(codebook_values) + model = SequentialMatmulModel().ov_model + + # The codebook should be a non empty 1D numpy array and sorted + with pytest.raises(nncf.ValidationError): + compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + group_size=-1, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + ) + + class TestOVTemplateWeightCompression(TemplateWeightCompression): @staticmethod def get_matmul_model() -> ov.Model: From e5322e378057b3a80b03bcbeb3a148b5f3d99453 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 30 Jun 2025 10:33:10 +0200 Subject: [PATCH 50/68] Fixed fp8 values in test. 
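The likely motivation (stated here as an assumption, not part of the original commit message): float8 e4m3 has only a 3-bit mantissa, so consecutive integers in [-64, 63] do not all survive the cast; neighboring values collapse onto the same representable number and the resulting codebook is no longer strictly sorted, which the validation introduced in the previous commit now rejects. Signed powers of two, in contrast, round-trip exactly. A small sketch using ml_dtypes purely to demonstrate the rounding effect:

    import numpy as np
    from ml_dtypes import float8_e4m3fn

    ints = np.arange(-(2**6), 2**6, dtype=np.float32)                     # the old test values
    roundtrip = ints.astype(float8_e4m3fn).astype(np.float32)
    assert not np.array_equal(roundtrip, ints)                            # e.g. 63 is not representable

    pow2 = np.array([np.sign(i) * 2.0 ** abs(i) for i in range(-6, 6)], dtype=np.float32)  # the new values
    assert np.array_equal(pow2.astype(float8_e4m3fn).astype(np.float32), pow2)             # exact round-trip
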
--- .../native/quantization/test_weights_compression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5da9944a03e..33ba2681b07 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -354,7 +354,7 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): ), ], ) -def test_compression_with_codebook_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): +def test_codebook_compression_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): model = IntegerModel().ov_model codebook_params = nncf.CodebookParameters(codebook) @@ -1165,7 +1165,9 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): (np.array([i for i in range(-8, 8)], np.int8), ov.Type.i8, 5), (np.array([i for i in range(-(2**6), 2**6)], np.int8), ov.Type.i8, 5), ( - Tensor(np.array([i for i in range(-(2**6), 2**6)])).as_openvino_tensor().astype(TensorDataType.f8e4m3), + Tensor(np.array([np.sign(i) * 2 ** np.abs(i) for i in range(-6, 6)])) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3), ov.Type.f8e4m3, 5, ), From 8f18fb8b4aa0d4927e817fc9378b5a7148c0ec7f Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Mon, 30 Jun 2025 20:42:10 +0200 Subject: [PATCH 51/68] Applied suggestions. --- nncf/openvino/optimized_functions/functions.py | 2 +- nncf/quantization/advanced_parameters.py | 1 - .../algorithms/weight_compression/config.py | 5 +++-- .../weight_compression/openvino_backend.py | 2 -- .../algorithms/weight_compression/weight_lowering.py | 12 ++++-------- nncf/quantization/quantize_model.py | 4 +--- 6 files changed, 9 insertions(+), 17 deletions(-) diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 282a43f9d2b..e22ea481abd 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -105,7 +105,7 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, -) -> tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index fcf04bf01a5..78a4cfae2d6 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -370,7 +370,6 @@ class CodebookParameters: :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization. Must be a numpy array, ov Tensor, or torch Tensor. :type codebook: TTensor - :param dts_type: The type of the codebook. """ codebook: Optional[TTensor] = None diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index d8d8c5e879e..b686c84c669 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -10,7 +10,7 @@ # limitations under the License. 
from dataclasses import dataclass from dataclasses import field -from typing import Any, Optional, TypeVar +from typing import Optional, TypeVar import numpy as np @@ -18,6 +18,7 @@ from nncf.parameters import CompressWeightsMode TWeightType = TypeVar("TWeightType") +TTensor = TypeVar("TTensor") @dataclass @@ -32,7 +33,7 @@ class WeightCompressionConfig: mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 - codebook_values: Optional[Any] = None + codebook_values: Optional[TTensor] = None @property def num_bits(self): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 2d158daf5e3..5d5656db350 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -220,7 +220,6 @@ def _create_compression_subgraph( const_dtype, should_add_convert_node: bool, compressed_weight: Optional[CompressedWeight] = None, - advanced_parameters: Optional[AdvancedCompressionParameters] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -333,7 +332,6 @@ def transform_model( const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, compressed_weight=compressed_weight, - advanced_parameters=advanced_parameters, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8b00c98a755..2a89c9acc9f 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -68,7 +68,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ Calculates the scale for nf4 or e2m1 quantization. @@ -76,7 +76,6 @@ def calculate_float_quantization_params( :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). :param config: Weight compression configuration. - :param max_val: Maximal value of e2m1 type. :return: Scale tensor of float32 type for float quantization. """ assert not config.is_integer @@ -86,6 +85,7 @@ def calculate_float_quantization_params( scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: + max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else max(np.abs(config.get_numpy_codebook())) scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero @@ -122,7 +122,6 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, - max_val: float = 6.0, ) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. @@ -134,7 +133,6 @@ def do_float_quantization( :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. 
- :param max_val: Maximal value of destination type. :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. """ @@ -158,9 +156,7 @@ def do_float_quantization( scale = precomputed_scale if scale is None: - if config.is_codebook: - max_val = max(np.abs(config.get_numpy_codebook())) - scale = calculate_float_quantization_params(weight, reduction_axes, config, max_val) + scale = calculate_float_quantization_params(weight, reduction_axes, config) norm_weight = _calculate_normalized_weight(weight, scale) if config.mode == CompressWeightsMode.NF4: if original_weight_backend == TensorBackend.ov: @@ -505,7 +501,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None -) -> Tensor: +) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to "round" or "quantize" to the closest quant. diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 5b69e52359d..c63d698c430 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -607,9 +607,7 @@ def compress_weights( msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) - if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ - CompressWeightsMode.E2M1, - ]: + if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." raise nncf.ParameterNotSupportedError(msg) From c838708c7a369472d8022384e7da19e022ed5300 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 14:38:38 +0200 Subject: [PATCH 52/68] Applied suggestions. --- nncf/quantization/algorithms/weight_compression/config.py | 2 ++ .../algorithms/weight_compression/weight_lowering.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index b686c84c669..78201f96ce5 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -29,6 +29,8 @@ class WeightCompressionConfig: :param mode: Defines a mode for weight compression. Defaults to INT8_ASYM mode. :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. + :param codebook_values: Optional codebook values for CODEBOOK compression mode. + Must be fns.Tensor which wraps numpy array, ov or torch tensor. 
""" mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 2a89c9acc9f..d0abf039f90 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -500,7 +500,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( - norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None + norm_weight: Tensor, quantiles: np.array = None, center_of_quantiles: np.array = None ) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to From 0949a92a29b857bb9a7ef397944751b0ac63a1ec Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 18:52:56 +0200 Subject: [PATCH 53/68] Applied suggestions. --- .../algorithms/weight_compression/algorithm.py | 8 ++++---- .../algorithms/weight_compression/openvino_backend.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 2c85a3eba34..dd3fac4d288 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -686,13 +686,13 @@ def apply( # del is used to prematurely mark non-necessary data as free for garbage collection del self.awq_algo - compressed_weights = None + precomputed_compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" if self._gptq: del statistics - model, compressed_weights = self._gptq_algo.apply( + model, precomputed_compressed_weights = self._gptq_algo.apply( model=model, graph=graph, dataset=dataset, @@ -701,7 +701,7 @@ def apply( ) else: if self._scale_estimation: - compressed_weights = self._scale_estimation_algo.apply( + precomputed_compressed_weights = self._scale_estimation_algo.apply( model=model, graph=graph, all_weight_params=all_weight_params, @@ -724,7 +724,7 @@ def apply( model, graph, track(all_weight_params, description=description, weights=all_weight_sizes), - compressed_weights, + precomputed_compressed_weights, lora_correction_algo, self._compression_format, self._advanced_parameters, diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5d5656db350..20ff7248c7c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -252,7 +252,7 @@ def _create_compression_subgraph( ) if compression_config.is_codebook: - n_quants = compressed_weight.tensor.max() + n_quants = compressed_weight.codebook.size - 1 compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook, From b491012310612d21d7b38d164bccaa55ed8c3e2d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 19:08:11 +0200 Subject: [PATCH 54/68] Fixed data type. 
--- .../algorithms/weight_compression/weight_lowering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d0abf039f90..c1a6406678d 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -500,7 +500,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( - norm_weight: Tensor, quantiles: np.array = None, center_of_quantiles: np.array = None + norm_weight: Tensor, quantiles: np.ndarray = None, center_of_quantiles: np.ndarray = None ) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to From 6bf05fcc65a16cd3bac12b7c292bc6e553e00988 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 1 Jul 2025 19:47:37 +0200 Subject: [PATCH 55/68] Removed torch tensor from codebook docstring. --- nncf/quantization/advanced_parameters.py | 2 +- nncf/quantization/algorithms/weight_compression/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py index 78a4cfae2d6..ad62fef11fc 100644 --- a/nncf/quantization/advanced_parameters.py +++ b/nncf/quantization/advanced_parameters.py @@ -368,7 +368,7 @@ class CodebookParameters: Contains parameters for codebook compression algorithm. :param codebook: The codebook (LUT) for the weight compression. - Applicable for vector quantization. Must be a numpy array, ov Tensor, or torch Tensor. + Applicable for vector quantization. Must be a numpy array or ov Tensor. :type codebook: TTensor """ diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 78201f96ce5..1d4e7a57917 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -30,7 +30,7 @@ class WeightCompressionConfig: :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. :param codebook_values: Optional codebook values for CODEBOOK compression mode. - Must be fns.Tensor which wraps numpy array, ov or torch tensor. + Must be fns.Tensor which wraps numpy array or ov tensor. """ mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM From e44b3d866673a44894e03ab552fe65d5b394bfcb Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 09:58:56 +0200 Subject: [PATCH 56/68] Applied suggestion. 
--- .../algorithms/weight_compression/onnx_backend.py | 6 ++++-- .../weight_compression/openvino_backend.py | 14 +++++++++----- .../algorithms/weight_compression/torch_backend.py | 6 ++++-- .../weight_compression/torch_fx_backend.py | 6 ++++-- .../weight_compression/weight_lowering.py | 6 ++++-- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 07347cd3abe..761647184b4 100644 --- a/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -201,7 +201,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -217,7 +217,9 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization dequantize_axis = ( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 20ff7248c7c..d763ad278de 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -219,7 +219,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - compressed_weight: Optional[CompressedWeight] = None, + precomputed_compressed_weights: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -248,7 +248,7 @@ def _create_compression_subgraph( weight, reduction_axes, compression_config, - compressed_weight, + precomputed_compressed_weights, ) if compression_config.is_codebook: @@ -296,7 +296,7 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -321,7 +321,11 @@ def transform_model( should_add_convert_node = True break - compressed_weight = None if compressed_weights is None else compressed_weights.get(wc_params.weight_name) + precomputed_compressed_weights = ( + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name) + ) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -331,7 +335,7 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - 
compressed_weight=compressed_weight, + precomputed_compressed_weights=precomputed_compressed_weights, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index e8efeb302e3..79f6b315a09 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -456,7 +456,7 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -493,7 +493,9 @@ def transform_model( Tensor(weight), wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index d00b0ae5b4c..396f125ca7b 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -190,7 +190,7 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -218,7 +218,9 @@ def transform_model( weight, wc_params.reduction_axes, compression_config, - None if compressed_weights is None else compressed_weights.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c1a6406678d..c4f697a01d9 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -301,7 +301,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - compressed_weight: CompressedWeight = None, + precomputed_compressed_weights: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight using compression configuration. 
@@ -314,7 +314,9 @@ def compress_weight( :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( - (compressed_weight.scale, compressed_weight.zero_point) if compressed_weight else (None, None) + (precomputed_compressed_weights.scale, precomputed_compressed_weights.zero_point) + if precomputed_compressed_weights + else (None, None) ) if not config.is_integer: From f1c68d6f787ab5a1689b84ed0fa4643e37adee52 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 10:01:08 +0200 Subject: [PATCH 57/68] Applied suggestion. --- nncf/quantization/algorithms/weight_compression/backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/backend.py b/nncf/quantization/algorithms/weight_compression/backend.py index 92c6cb80a5d..e2257168ad3 100644 --- a/nncf/quantization/algorithms/weight_compression/backend.py +++ b/nncf/quantization/algorithms/weight_compression/backend.py @@ -149,7 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - compressed_weights: Optional[dict[str, CompressedWeight]] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -160,8 +160,7 @@ def transform_model( :param model: Model in which the weights will be compressed according to the weight compression description. :param graph: The graph associated with the model. :param weight_compression_parameters: An iterable of weight compression parameters. - :param precomputed_scales: Precomputed scales for weight compression. - :param precomputed_zero_points: Precomputed zero points for weight compression. + :param precomputed_compressed_weights: Precomputed scales, zero points, or codebook for weight compression. :param lora_correction_algo: An optional algorithm to reduce quantization noise after weight compression by using low-rank adapters. This algorithm not only overrides weights with their quantized counterparts but also expands the model's execution graph following the Low-Rank Adaptation (LoRA) concept. From 8159e56aa8548ee249a3c20bdbda14324f6ea951 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 12:40:23 +0200 Subject: [PATCH 58/68] Fixed bug. 
--- .../algorithms/weight_compression/onnx_backend.py | 4 ++-- .../algorithms/weight_compression/openvino_backend.py | 8 ++++---- .../algorithms/weight_compression/weight_lowering.py | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 761647184b4..faf78319b74 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -213,7 +213,7 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) - compressed_weight = compress_weight( + precomputed_compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, @@ -231,7 +231,7 @@ def transform_model( # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md if opset_version < 21 and dequantize_block_size > 0: compressed_weight, scale, zero_point = self._preprocess_compressed_weight( - compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True + precomputed_compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True ) self._replace_matmul_with_matmulnbits( model, diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index d763ad278de..80b6a2d64c4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -219,7 +219,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - precomputed_compressed_weights: Optional[CompressedWeight] = None, + precomputed_compressed_weight: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -248,7 +248,7 @@ def _create_compression_subgraph( weight, reduction_axes, compression_config, - precomputed_compressed_weights, + precomputed_compressed_weight, ) if compression_config.is_codebook: @@ -321,7 +321,7 @@ def transform_model( should_add_convert_node = True break - precomputed_compressed_weights = ( + precomputed_compressed_weight = ( None if precomputed_compressed_weights is None else precomputed_compressed_weights.get(wc_params.weight_name) @@ -335,7 +335,7 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - precomputed_compressed_weights=precomputed_compressed_weights, + precomputed_compressed_weight=precomputed_compressed_weight, ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c4f697a01d9..c572be2fff7 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -301,7 +301,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - precomputed_compressed_weights: CompressedWeight = None, + precomputed_compressed_weight: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight 
using compression configuration. @@ -309,13 +309,12 @@ def compress_weight( :param weight: The weight to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Compression configuration. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. + :param precomputed_compressed_weight: precomputed scale and zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( - (precomputed_compressed_weights.scale, precomputed_compressed_weights.zero_point) - if precomputed_compressed_weights + (precomputed_compressed_weight.scale, precomputed_compressed_weight.zero_point) + if precomputed_compressed_weight else (None, None) ) From b24936b31d41f5ae33e883ebf6aec906b4a0cf89 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 13:48:55 +0200 Subject: [PATCH 59/68] Fixed bug for onnx. --- .../openvino/smollm2_360m_codebook/main.py | 25 +++++++------------ .../weight_compression/onnx_backend.py | 4 +-- .../weight_compression/openvino_backend.py | 9 +++---- .../weight_compression/weight_lowering.py | 2 +- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index eb2eaff0a25..640a9fd1313 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -49,17 +49,18 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): ] -def default_codebook_example(model_id, output_dir): +def load_model_and_tokenizer(model_id, export=True): tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForCausalLM.from_pretrained( model_id, - export=True, + export=export, load_in_8bit=False, - compile=False, - stateful=False, - ov_config={"INFERENCE_PRECISION_HINT": "f32"}, ) + return model, tokenizer + +def default_codebook_example(model_id, output_dir): + model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") @@ -67,7 +68,7 @@ def default_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) + model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") @@ -75,15 +76,7 @@ def default_codebook_example(model_id, output_dir): def custom_codebook_example(model_id, output_dir): - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = OVModelForCausalLM.from_pretrained( - model_id, - export=True, - load_in_8bit=False, - compile=False, - stateful=False, - ov_config={"INFERENCE_PRECISION_HINT": "f32"}, - ) + model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Non-optimized model outputs:\n{answers_by_questions}\n") @@ -102,7 +95,7 @@ def custom_codebook_example(model_id, output_dir): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - model = OVModelForCausalLM.from_pretrained(output_dir, ov_config={"INFERENCE_PRECISION_HINT": "f32"}) + 
model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print(f"Optimized model outputs:\n{answers_by_questions}\n") diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index faf78319b74..761647184b4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -213,7 +213,7 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) - precomputed_compressed_weight = compress_weight( + compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, @@ -231,7 +231,7 @@ def transform_model( # See https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md if opset_version < 21 and dequantize_block_size > 0: compressed_weight, scale, zero_point = self._preprocess_compressed_weight( - precomputed_compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True + compressed_weight, weight.shape, dequantize_block_size=None, apply_transpose=True ) self._replace_matmul_with_matmulnbits( model, diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 80b6a2d64c4..37564cc654b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -321,11 +321,6 @@ def transform_model( should_add_convert_node = True break - precomputed_compressed_weight = ( - None - if precomputed_compressed_weights is None - else precomputed_compressed_weights.get(wc_params.weight_name) - ) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -335,7 +330,9 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - precomputed_compressed_weight=precomputed_compressed_weight, + precomputed_compressed_weight=None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c572be2fff7..c961b45af33 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -309,7 +309,7 @@ def compress_weight( :param weight: The weight to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Compression configuration. - :param precomputed_compressed_weight: precomputed scale and zero point. + :param precomputed_compressed_weight: Contains precomputed scale and zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ precomputed_scale, precomputed_zero_point = ( From 6fdfd3304629e5ae926729e837d5b79023dd7e0d Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 2 Jul 2025 16:24:01 +0200 Subject: [PATCH 60/68] Applied suggestion. 
--- src/nncf/quantization/algorithms/weight_compression/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index 1d4e7a57917..fd72a5cdd8d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -30,7 +30,8 @@ class WeightCompressionConfig: :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. :param codebook_values: Optional codebook values for CODEBOOK compression mode. - Must be fns.Tensor which wraps numpy array or ov tensor. + Must be fns.Tensor which wraps numpy array or ov tensor. Storing ov tensor is useful for having + destination data type information available. """ mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM From 17d6d2d61b44560fa3bed9868c24722e3d5a61be Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 10:15:08 +0200 Subject: [PATCH 61/68] Applied suggestions. --- .../weight_compression/algorithm.py | 4 +- .../weight_compression/constants.py | 50 ++++++++----------- .../weight_compression/openvino_backend.py | 4 +- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index dd3fac4d288..9077cf7a81c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -39,7 +39,7 @@ from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.awq import AWQ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.constants import get_cb4_quantiles +from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES from nncf.quantization.algorithms.weight_compression.gptq import GPTQ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -457,7 +457,7 @@ def _get_primary_config(self): return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - codebook_values=get_cb4_quantiles() + codebook_values=Tensor(CB4_QUANTILES) if self._mode == CompressWeightsMode.CB4_F8E4M3 else Tensor(self._advanced_parameters.codebook_params.codebook), ) diff --git a/src/nncf/quantization/algorithms/weight_compression/constants.py b/src/nncf/quantization/algorithms/weight_compression/constants.py index 726ba841e00..6119fd8f83c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/constants.py +++ b/src/nncf/quantization/algorithms/weight_compression/constants.py @@ -11,9 +11,6 @@ import numpy as np -from nncf.tensor import Tensor -from nncf.tensor import TensorDataType - NF4_QUANTILES = np.array( [ -1.0, @@ -37,32 +34,27 @@ ) -def get_cb4_quantiles() -> Tensor: - """ - Returns the quantiles for the CB4 codebook. 
- """ - CB4_QUANTILES = np.array( - [ - -3.5, - -2.5, - -1.875, - -1.375, - -1.0, - -0.625, - -0.3125, - 0.0, - 0.28125, - 0.5625, - 0.875, - 1.125, - 1.5, - 2.0, - 2.5, - 3.5, - ], - dtype=np.float32, - ) - return Tensor(CB4_QUANTILES).as_openvino_tensor().astype(TensorDataType.f8e4m3) +CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.28125, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, +) CENTER_OF_NF4_QUANTILES = np.array( diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 37564cc654b..6215fb4b1ee 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -255,7 +255,9 @@ def _create_compression_subgraph( n_quants = compressed_weight.codebook.size - 1 compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) converted_const = create_ov_codebook_subgraph( - codebook=compressed_weight.codebook, + codebook=compressed_weight.codebook + if compression_config.mode == CompressWeightsMode.CODEBOOK + else compressed_weight.codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3), indexes=compressed_weight.tensor, dtype=compression_dtype, name=const_node_name, From 61abc6a7a3944414844cd889f4d701b19b5bd5a2 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 12:26:03 +0200 Subject: [PATCH 62/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 640a9fd1313..7b091a25157 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -12,18 +12,31 @@ import numpy as np from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer +from transformers import logging import nncf +logging.set_verbosity_error() + def generate_answers(questions, model, tokenizer, max_new_tokens=50): + """Generate answers for a list of questions using the provided model and tokenizer. + + Args: + questions : List of questions to be answered. + model : The model to use for generating answers. + tokenizer : The tokenizer to use for processing the input and output. + max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + + Returns: + dict: A dictionary mapping each question to its corresponding answer. 
+ """ messages = [ {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, {"role": "user", "content": "What is the capital of Spain?"}, {"role": "assistant", "content": "Madrid."}, ] answers_by_questions = {} - model.request = None for question in questions: messages.append({"role": "user", "content": question}) @@ -37,10 +50,15 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): answers_by_questions[question] = answer messages.append({"role": "assistant", "content": answer}) - model.request = None return answers_by_questions +def print_answers(header, answers_by_questions): + print(header) + for question, answer in answers_by_questions.items(): + print(f"Q: {question}\nA: {answer}\n") + + QUESTIONS = [ "What is the capital of France?", "What is the highest peak in the Alps?", @@ -50,7 +68,7 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): def load_model_and_tokenizer(model_id, export=True): - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) model = OVModelForCausalLM.from_pretrained( model_id, export=export, @@ -62,7 +80,7 @@ def load_model_and_tokenizer(model_id, export=True): def default_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + print_answers("Non-optimized model outputs:\n", answers_by_questions) model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64) model.save_pretrained(output_dir) @@ -70,7 +88,7 @@ def default_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Optimized model outputs:\n{answers_by_questions}\n") + print_answers("Optimized model outputs:\n", answers_by_questions) return list(answers_by_questions.values()) @@ -79,7 +97,7 @@ def custom_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Non-optimized model outputs:\n{answers_by_questions}\n") + print_answers("Non-optimized model outputs:\n", answers_by_questions) codebook_params = nncf.CodebookParameters( np.array([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], dtype=np.int8) @@ -97,7 +115,7 @@ def custom_codebook_example(model_id, output_dir): model, tokenizer = load_model_and_tokenizer(output_dir, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) - print(f"Optimized model outputs:\n{answers_by_questions}\n") + print_answers("Optimized model outputs:\n", answers_by_questions) return list(answers_by_questions.values()) From d1d82329bafc6ed8762bf426aed88f8e0e4f2f98 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 14:01:17 +0200 Subject: [PATCH 63/68] 1) Added docstrings for codebook example. 2) Changed custom codebook to smaller in codebook example. 
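For reference, the number of codebook entries decides the index width via the
selection added to the OpenVINO backend earlier in this series; a standalone
sketch (the helper name is illustrative, the thresholds mirror the backend code):

    import openvino as ov

    def codebook_index_type(codebook_size: int) -> ov.Type:
        # Same rule as in _create_compression_subgraph: u16 above 256 entries,
        # u8 above 16 entries, otherwise u4.
        n_quants = codebook_size - 1
        return ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4)

    assert codebook_index_type(9) == ov.Type.u4    # new custom codebook in this example
    assert codebook_index_type(15) == ov.Type.u4   # previous custom codebook also used u4
    assert codebook_index_type(16) == ov.Type.u4   # CB4 codebook (16 fixed values)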
--- .../openvino/smollm2_360m_codebook/main.py | 60 +++++++++++++------ tests/cross_fw/examples/example_scope.json | 2 +- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 7b091a25157..37feef9d207 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -19,7 +19,9 @@ logging.set_verbosity_error() -def generate_answers(questions, model, tokenizer, max_new_tokens=50): +def generate_answers( + questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50 +) -> dict[str, str]: """Generate answers for a list of questions using the provided model and tokenizer. Args: @@ -53,7 +55,12 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): return answers_by_questions -def print_answers(header, answers_by_questions): +def print_answers(header: str, answers_by_questions: list[str]) -> None: + """Print the answers to the console. + Args: + header (str): Header to print before the answers. + answers_by_questions (dict): Dictionary mapping questions to their answers. + """ print(header) for question, answer in answers_by_questions.items(): print(f"Q: {question}\nA: {answer}\n") @@ -67,7 +74,14 @@ def print_answers(header, answers_by_questions): ] -def load_model_and_tokenizer(model_id, export=True): +def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCausalLM, AutoTokenizer]: + """Load the model and tokenizer from the specified model ID. + Args: + model_id (str): The identifier of the model to load. + export (bool): Whether to export the model for OpenVINO. Defaults to True. + Returns: + tuple: A tuple containing the loaded model and tokenizer. + """ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) model = OVModelForCausalLM.from_pretrained( model_id, @@ -77,31 +91,43 @@ def load_model_and_tokenizer(model_id, export=True): return model, tokenizer -def default_codebook_example(model_id, output_dir): +def default_codebook_example(model_id: str, compressed_model_id: str) -> None: + """Example of using the default codebook compression. + Args: + model_id (str): The identifier of the model to load. + compressed_model_id (str): The identifier for the compressed model to save. + Returns: + list: A list of answers generated by the model after compression. + """ model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Non-optimized model outputs:\n", answers_by_questions) model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64) - model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) + model.save_pretrained(compressed_model_id) + tokenizer.save_pretrained(compressed_model_id) - model, tokenizer = load_model_and_tokenizer(output_dir, False) + model, tokenizer = load_model_and_tokenizer(compressed_model_id, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Optimized model outputs:\n", answers_by_questions) return list(answers_by_questions.values()) -def custom_codebook_example(model_id, output_dir): +def custom_codebook_example(model_id: str, compressed_model_id: str) -> None: + """Example of using the custom codebook compression. 
+ Args: + model_id (str): The identifier of the model to load. + compressed_model_id (str): The identifier for the compressed model to save. + Returns: + list: A list of answers generated by the model after compression. + """ model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Non-optimized model outputs:\n", answers_by_questions) - codebook_params = nncf.CodebookParameters( - np.array([-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64], dtype=np.int8) - ) + codebook_params = nncf.CodebookParameters(np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8)) model.model = nncf.compress_weights( model.model, @@ -110,10 +136,10 @@ def custom_codebook_example(model_id, output_dir): group_size=-1, advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), ) - model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) + model.save_pretrained(compressed_model_id) + tokenizer.save_pretrained(compressed_model_id) - model, tokenizer = load_model_and_tokenizer(output_dir, False) + model, tokenizer = load_model_and_tokenizer(compressed_model_id, False) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Optimized model outputs:\n", answers_by_questions) @@ -122,10 +148,10 @@ def custom_codebook_example(model_id, output_dir): def main(): model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - output_dir = "smollm2_360m_compressed_codebook" + compressed_model_id = "smollm2_360m_compressed_codebook" - res = default_codebook_example(model_id, output_dir) - res += custom_codebook_example(model_id, output_dir + "_custom") + res = default_codebook_example(model_id, compressed_model_id) + res += custom_codebook_example(model_id, compressed_model_id + "_custom") return res diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index c45dfc898b9..e027db051c2 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -296,7 +296,7 @@ "Paris.", "Mont Blanc.", "Toronto.", - "Tokyo." + "Fukuoka." ] } }, From b8f25269ec560b1f41c351d09db9c6d296641355 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 14:53:48 +0200 Subject: [PATCH 64/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 4 ++++ .../algorithms/weight_compression/algorithm.py | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 37feef9d207..a8d3b539f23 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -9,14 +9,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings + import numpy as np from optimum.intel.openvino import OVModelForCausalLM +from torch.jit import TracerWarning from transformers import AutoTokenizer from transformers import logging import nncf logging.set_verbosity_error() +warnings.filterwarnings("ignore", category=TracerWarning) def generate_answers( diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 9077cf7a81c..a276f281345 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -454,12 +454,15 @@ def _get_ratio_defining_params( return ratio_defining_params def _get_primary_config(self): + codebook_values = ( + Tensor(CB4_QUANTILES) + if self._mode == CompressWeightsMode.CB4_F8E4M3 + else Tensor(self._advanced_parameters.codebook_params.codebook) + ) return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, - codebook_values=Tensor(CB4_QUANTILES) - if self._mode == CompressWeightsMode.CB4_F8E4M3 - else Tensor(self._advanced_parameters.codebook_params.codebook), + codebook_values=codebook_values, ) def _set_weight_compression_config( From ca342ab9e40496d9f4702dacffbca72806dc3dd9 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 16:12:22 +0200 Subject: [PATCH 65/68] Applied suggestion. --- .../openvino/smollm2_360m_codebook/main.py | 11 ++++++----- .../algorithms/weight_compression/config.py | 2 +- .../algorithms/weight_compression/weight_lowering.py | 7 ++----- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index a8d3b539f23..6262639bd1e 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -23,6 +23,10 @@ warnings.filterwarnings("ignore", category=TracerWarning) +MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct" +COMPRESSED_MODEL_ID = "smollm2_360m_compressed_codebook" + + def generate_answers( questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50 ) -> dict[str, str]: @@ -151,11 +155,8 @@ def custom_codebook_example(model_id: str, compressed_model_id: str) -> None: def main(): - model_id = "HuggingFaceTB/SmolLM2-360M-Instruct" - compressed_model_id = "smollm2_360m_compressed_codebook" - - res = default_codebook_example(model_id, compressed_model_id) - res += custom_codebook_example(model_id, compressed_model_id + "_custom") + res = default_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID) + res += custom_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID + "_custom") return res diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index fd72a5cdd8d..1d5376b3454 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -69,7 +69,7 @@ def is_codebook(self): return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] def get_numpy_codebook(self): - return self.codebook_values.as_numpy_tensor().data + return self.codebook_values.as_numpy_tensor() def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py 
b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c961b45af33..8a1ee8f9b40 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -11,8 +11,6 @@ import os from typing import Optional, Union -import numpy as np - import nncf from nncf.common.logging.logger import nncf_logger from nncf.common.utils.backend import is_openvino_at_least @@ -85,7 +83,7 @@ def calculate_float_quantization_params( scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: - max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else max(np.abs(config.get_numpy_codebook())) + max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else fns.max(fns.abs(config.get_numpy_codebook())) scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero @@ -501,7 +499,7 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: def _calculate_codebook_quantized_weight( - norm_weight: Tensor, quantiles: np.ndarray = None, center_of_quantiles: np.ndarray = None + norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None ) -> tuple[Tensor, Tensor]: """ Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to @@ -518,7 +516,6 @@ def _calculate_codebook_quantized_weight( ) if center_of_quantiles is None: - quantiles = np.array(quantiles) center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) center_of_quantiles = fns.from_numpy(center_of_quantiles, backend=norm_weight.backend) indexes = fns.searchsorted(center_of_quantiles, norm_weight) From 635ef2327d563c349f17be743795902a21617ab6 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Tue, 8 Jul 2025 16:19:43 +0200 Subject: [PATCH 66/68] Changed docstring formatting. --- .../openvino/smollm2_360m_codebook/main.py | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 6262639bd1e..c80fceca0e6 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -30,16 +30,14 @@ def generate_answers( questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50 ) -> dict[str, str]: - """Generate answers for a list of questions using the provided model and tokenizer. - - Args: - questions : List of questions to be answered. - model : The model to use for generating answers. - tokenizer : The tokenizer to use for processing the input and output. - max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + """ + Generate answers for a list of questions using the provided model and tokenizer. - Returns: - dict: A dictionary mapping each question to its corresponding answer. + :param questions : List of questions to be answered. + :param model : The model to use for generating answers. + :param tokenizer : The tokenizer to use for processing the input and output. + :param max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + :return: A dictionary mapping each question to its corresponding answer. 
""" messages = [ {"role": "system", "content": "You are a chatbot who always responds as short as possible."}, @@ -64,10 +62,11 @@ def generate_answers( def print_answers(header: str, answers_by_questions: list[str]) -> None: - """Print the answers to the console. - Args: - header (str): Header to print before the answers. - answers_by_questions (dict): Dictionary mapping questions to their answers. + """ + Print the answers to the console. + + :param header (str): Header to print before the answers. + :param answers_by_questions (dict): Dictionary mapping questions to their answers. """ print(header) for question, answer in answers_by_questions.items(): @@ -83,12 +82,12 @@ def print_answers(header: str, answers_by_questions: list[str]) -> None: def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCausalLM, AutoTokenizer]: - """Load the model and tokenizer from the specified model ID. - Args: - model_id (str): The identifier of the model to load. - export (bool): Whether to export the model for OpenVINO. Defaults to True. - Returns: - tuple: A tuple containing the loaded model and tokenizer. + """ + Load the model and tokenizer from the specified model ID. + + :param model_id (str): The identifier of the model to load. + :param export (bool): Whether to export the model for OpenVINO. Defaults to True. + :return: A tuple containing the loaded model and tokenizer. """ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) model = OVModelForCausalLM.from_pretrained( @@ -99,13 +98,13 @@ def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCaus return model, tokenizer -def default_codebook_example(model_id: str, compressed_model_id: str) -> None: - """Example of using the default codebook compression. - Args: - model_id (str): The identifier of the model to load. - compressed_model_id (str): The identifier for the compressed model to save. - Returns: - list: A list of answers generated by the model after compression. +def default_codebook_example(model_id: str, compressed_model_id: str) -> list[str]: + """ + Example of using the default codebook compression. + + :param model_id (str): The identifier of the model to load. + :param compressed_model_id (str): The identifier for the compressed model to save. + :return: A list of answers generated by the model after compression. """ model, tokenizer = load_model_and_tokenizer(model_id) answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) @@ -122,13 +121,13 @@ def default_codebook_example(model_id: str, compressed_model_id: str) -> None: return list(answers_by_questions.values()) -def custom_codebook_example(model_id: str, compressed_model_id: str) -> None: - """Example of using the custom codebook compression. - Args: - model_id (str): The identifier of the model to load. - compressed_model_id (str): The identifier for the compressed model to save. - Returns: - list: A list of answers generated by the model after compression. +def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str]: + """ + Example of using the custom codebook compression. + + :param model_id (str): The identifier of the model to load. + :param compressed_model_id (str): The identifier for the compressed model to save. + :return: A list of answers generated by the model after compression. 
""" model, tokenizer = load_model_and_tokenizer(model_id) From 50a94aa1fc5cb220047009a80171c6b323423080 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 9 Jul 2025 14:04:14 +0200 Subject: [PATCH 67/68] Applied suggestions. --- .../openvino/smollm2_360m_codebook/main.py | 24 +++++++++---------- .../graph/metatypes/openvino_metatypes.py | 1 + .../weight_compression/algorithm.py | 13 ++++++---- .../weight_compression/onnx_backend.py | 5 ++-- .../weight_compression/torch_backend.py | 5 ++-- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index c80fceca0e6..7df2572c148 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -33,10 +33,10 @@ def generate_answers( """ Generate answers for a list of questions using the provided model and tokenizer. - :param questions : List of questions to be answered. - :param model : The model to use for generating answers. - :param tokenizer : The tokenizer to use for processing the input and output. - :param max_new_tokens (int, optional): Maximum number of new tokens to generate for each answer. Defaults to 50. + :param questions: List of questions to be answered. + :param model: The model to use for generating answers. + :param tokenizer: The tokenizer to use for processing the input and output. + :param max_new_tokens: Maximum number of new tokens to generate for each answer. Defaults to 50. :return: A dictionary mapping each question to its corresponding answer. """ messages = [ @@ -65,8 +65,8 @@ def print_answers(header: str, answers_by_questions: list[str]) -> None: """ Print the answers to the console. - :param header (str): Header to print before the answers. - :param answers_by_questions (dict): Dictionary mapping questions to their answers. + :param header: Header to print before the answers. + :param answers_by_questions: Dictionary mapping questions to their answers. """ print(header) for question, answer in answers_by_questions.items(): @@ -85,8 +85,8 @@ def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCaus """ Load the model and tokenizer from the specified model ID. - :param model_id (str): The identifier of the model to load. - :param export (bool): Whether to export the model for OpenVINO. Defaults to True. + :param model_id: The identifier of the model to load. + :param export: Whether to export the model for OpenVINO. Defaults to True. :return: A tuple containing the loaded model and tokenizer. """ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) @@ -102,8 +102,8 @@ def default_codebook_example(model_id: str, compressed_model_id: str) -> list[st """ Example of using the default codebook compression. - :param model_id (str): The identifier of the model to load. - :param compressed_model_id (str): The identifier for the compressed model to save. + :param model_id: The identifier of the model to load. + :param compressed_model_id: The identifier for the compressed model to save. :return: A list of answers generated by the model after compression. """ model, tokenizer = load_model_and_tokenizer(model_id) @@ -125,8 +125,8 @@ def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str """ Example of using the custom codebook compression. - :param model_id (str): The identifier of the model to load. 
- :param compressed_model_id (str): The identifier for the compressed model to save. + :param model_id: The identifier of the model to load. + :param compressed_model_id: The identifier for the compressed model to save. :return: A list of answers generated by the model after compression. """ model, tokenizer = load_model_and_tokenizer(model_id) diff --git a/src/nncf/openvino/graph/metatypes/openvino_metatypes.py b/src/nncf/openvino/graph/metatypes/openvino_metatypes.py index e2b95afc241..214bce563f1 100644 --- a/src/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/src/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -819,6 +819,7 @@ def _is_embedding(node: ov.Node) -> bool: input_tensor = node.input_value(const_port_id) input_type = input_tensor.get_element_type().get_type_name() + # TODO(aanuf): Implement a pattern based check for embedding. if node.friendly_name.endswith("nncf_codebook"): return False diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index a276f281345..9d7ca909ea3 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -48,6 +48,7 @@ from nncf.scopes import IgnoredScope from nncf.scopes import get_ignored_node_names_from_ignored_scope from nncf.tensor import Tensor +from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorDataType TModel = TypeVar("TModel") @@ -182,17 +183,19 @@ def check_user_compression_configuration( ) ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank] - if advanced_parameters.codebook_params.codebook is not None: - codebook = Tensor(advanced_parameters.codebook_params.codebook).as_numpy_tensor().data + codebook = advanced_parameters.codebook_params.codebook + if codebook is not None: + # OpenVINO Tensor is not support functions to validate codebook + np_codebook = Tensor(codebook).as_numpy_tensor() msg = None - if codebook.ndim != 1: + if np_codebook.ndim != 1: msg = "The codebook must be a 1D array, but a multi-dimensional array is given." - if codebook.size < 2: + elif np_codebook.size < 2: msg = ( "The codebook must contain at least two unique elements," "but a single-element or empty array is given." ) - if (codebook[:-1] >= codebook[1:]).any(): + elif fns.any(np_codebook[:-1] >= np_codebook[1:]): msg = "The codebook must be a sorted 1D array with unique elements, but an unsorted array is given." 
if msg: raise nncf.ValidationError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 761647184b4..0e7e1897813 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -213,13 +213,12 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) + precomputed_compressed_weights = precomputed_compressed_weights or {} compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, - None - if precomputed_compressed_weights is None - else precomputed_compressed_weights.get(wc_params.weight_name), + precomputed_compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization dequantize_axis = ( diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index 79f6b315a09..7e5c348f3a9 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -488,14 +488,13 @@ def transform_model( raise nncf.InternalError(msg) try: + precomputed_compressed_weights = precomputed_compressed_weights or {} # calculates compressed weights and decompression parameters compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, - None - if precomputed_compressed_weights is None - else precomputed_compressed_weights.get(wc_params.weight_name), + precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error From 82d9e5cacd92c7c18557d096416fc0546e73e205 Mon Sep 17 00:00:00 2001 From: Andrei Anufriev Date: Wed, 9 Jul 2025 17:27:24 +0200 Subject: [PATCH 68/68] Applied suggestions. 
--- .../openvino/smollm2_360m_codebook/main.py | 4 ++-- src/nncf/__init__.py | 1 - src/nncf/parameters.py | 2 +- src/nncf/quantization/advanced_parameters.py | 21 ++++--------------- .../weight_compression/algorithm.py | 18 ++++++++-------- .../weight_compression/parameters.py | 16 ++------------ src/nncf/quantization/quantize_model.py | 2 +- .../quantization/test_weights_compression.py | 13 +++++------- 8 files changed, 24 insertions(+), 53 deletions(-) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 7df2572c148..a5b27104218 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -134,14 +134,14 @@ def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) print_answers("Non-optimized model outputs:\n", answers_by_questions) - codebook_params = nncf.CodebookParameters(np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8)) + codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8) model.model = nncf.compress_weights( model.model, mode=nncf.CompressWeightsMode.CODEBOOK, ratio=1.0, group_size=-1, - advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), ) model.save_pretrained(compressed_model_id) tokenizer.save_pretrained(compressed_model_id) diff --git a/src/nncf/__init__.py b/src/nncf/__init__.py index 14e1c38740f..77cd6fbb09a 100644 --- a/src/nncf/__init__.py +++ b/src/nncf/__init__.py @@ -58,7 +58,6 @@ from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters as AdvancedQuantizationParameters from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters as AdvancedScaleEstimationParameters from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters as AdvancedSmoothQuantParameters -from nncf.quantization.advanced_parameters import CodebookParameters as CodebookParameters from nncf.quantization.advanced_parameters import OverflowFix as OverflowFix from nncf.scopes import IgnoredScope as IgnoredScope from nncf.scopes import Subgraph as Subgraph diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 55ef80046de..e1269ea78e1 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -86,7 +86,7 @@ class CompressWeightsMode(StrEnum): :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. :param CODEBOOK: Codebook (LUT) quantization format. - :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values. + :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. """ INT8_SYM = "int8_sym" diff --git a/src/nncf/quantization/advanced_parameters.py b/src/nncf/quantization/advanced_parameters.py index ad62fef11fc..4de0152188f 100644 --- a/src/nncf/quantization/advanced_parameters.py +++ b/src/nncf/quantization/advanced_parameters.py @@ -361,20 +361,6 @@ class AdvancedLoraCorrectionParameters: use_int8_adapters: bool = True -@api() -@dataclass -class CodebookParameters: - """ - Contains parameters for codebook compression algorithm. - - :param codebook: The codebook (LUT) for the weight compression. - Applicable for vector quantization. 
Must be a numpy array or ov Tensor. - :type codebook: TTensor - """ - - codebook: Optional[TTensor] = None - - @api() @dataclass class AdvancedCompressionParameters: @@ -395,8 +381,9 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] - :param codebook_params: Parameters for codebook compression. - :type codebook_params: CodebookParameters + :param codebook: The codebook (LUT) for the weight compression. + Applicable for vector quantization. Must be a numpy array or ov Tensor. + :type codebook: TTensor """ statistics_path: Optional[str] = None @@ -408,7 +395,7 @@ class AdvancedCompressionParameters: lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters) lora_adapter_rank: int = 256 backend_params: dict[str, Any] = field(default_factory=dict) - codebook_params: CodebookParameters = field(default_factory=CodebookParameters) + codebook: Optional[TTensor] = None @api() diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 9d7ca909ea3..7ab4d2d1813 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -183,7 +183,7 @@ def check_user_compression_configuration( ) ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank] - codebook = advanced_parameters.codebook_params.codebook + codebook = advanced_parameters.codebook if codebook is not None: # OpenVINO Tensor is not support functions to validate codebook np_codebook = Tensor(codebook).as_numpy_tensor() @@ -228,9 +228,7 @@ def check_user_compression_configuration( msg = "LoRA Correction algorithm is not compatible with FQ, FQ_LORA and FQ_LORA_NLS compression formats." raise nncf.ValidationError(msg) - if mode == CompressWeightsMode.CODEBOOK and ( - advanced_parameters is None or advanced_parameters.codebook_params.codebook is None - ): + if mode == CompressWeightsMode.CODEBOOK and (advanced_parameters is None or advanced_parameters.codebook is None): msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters." raise nncf.ValidationError(msg) @@ -457,11 +455,13 @@ def _get_ratio_defining_params( return ratio_defining_params def _get_primary_config(self): - codebook_values = ( - Tensor(CB4_QUANTILES) - if self._mode == CompressWeightsMode.CB4_F8E4M3 - else Tensor(self._advanced_parameters.codebook_params.codebook) - ) + codebook_values = None + + if self._mode == CompressWeightsMode.CB4_F8E4M3: + codebook_values = Tensor(CB4_QUANTILES) + elif self._mode == CompressWeightsMode.CODEBOOK: + codebook_values = Tensor(self._advanced_parameters.codebook) + return WeightCompressionConfig( mode=self._mode, group_size=self._group_size, diff --git a/src/nncf/quantization/algorithms/weight_compression/parameters.py b/src/nncf/quantization/algorithms/weight_compression/parameters.py index 8c1d60fd400..fb27775997d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/parameters.py +++ b/src/nncf/quantization/algorithms/weight_compression/parameters.py @@ -10,23 +10,11 @@ # limitations under the License. from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional from nncf.tensor import Tensor -@dataclass -class Codebook: - """ - Codebook parameters for weight compression. 
- :param codebook: The initial codebook for compression. - :param dst_type: The destination type for the codebook. - """ - - codebook: Optional[Tensor] = None - dst_type: Optional[Any] = None - - @dataclass class CompressedWeight: """ @@ -42,7 +30,7 @@ class CompressedWeight: tensor: Optional[Tensor] = None scale: Optional[Tensor] = None zero_point: Optional[Tensor] = None - codebook: Optional[Codebook] = None + codebook: Optional[Tensor] = None def is_codebook(self): """ diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index c63d698c430..340f5983f2b 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -608,7 +608,7 @@ def compress_weights( raise nncf.ParameterNotSupportedError(msg) if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: - msg = f"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is {mode}." + msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is E2M1." raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 33ba2681b07..936d5d53329 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -40,7 +40,6 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams -from nncf.quantization.advanced_parameters import CodebookParameters from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -356,13 +355,12 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): ) def test_codebook_compression_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): model = IntegerModel().ov_model - codebook_params = nncf.CodebookParameters(codebook) compressed_model = compress_weights( model, mode=CompressWeightsMode.CODEBOOK, group_size=7, - advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), ) actual_stats = {} for op in compressed_model.get_ops(): @@ -1182,7 +1180,7 @@ def test_codebook(codebook, n_layers, dst_type, group_size): ratio=1.0, group_size=group_size, all_layers=True, - advanced_parameters=AdvancedCompressionParameters(codebook_params=CodebookParameters(codebook=codebook)), + advanced_parameters=AdvancedCompressionParameters(codebook=codebook), ) names_codebook = [ op.get_friendly_name() @@ -1728,7 +1726,7 @@ def test_nf4_quantization_mid_quant(weight, scale): @pytest.mark.parametrize( - "codebook_values", + "codebook", [ np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32), np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32), @@ -1736,8 +1734,7 @@ def test_nf4_quantization_mid_quant(weight, scale): np.array([5], dtype=np.float32), ], ) -def test_codebook_is_correct_array(codebook_values): - codebook_params = nncf.CodebookParameters(codebook_values) 
+def test_codebook_is_correct_array(codebook): model = SequentialMatmulModel().ov_model # The codebook should be a non empty 1D numpy array and sorted @@ -1746,7 +1743,7 @@ def test_codebook_is_correct_array(codebook_values): model, mode=CompressWeightsMode.CODEBOOK, group_size=-1, - advanced_parameters=nncf.AdvancedCompressionParameters(codebook_params=codebook_params), + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), )