diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt
index ce79be5034d..44977d86464 100644
--- a/.ci/cspell_dict.txt
+++ b/.ci/cspell_dict.txt
@@ -505,4 +505,4 @@ yolov
 yscale
 yujie
 yury
-zfnet
\ No newline at end of file
+zfnet
diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/README.md b/examples/llm_compression/openvino/smollm2_360m_codebook/README.md
new file mode 100644
index 00000000000..c82045d6261
--- /dev/null
+++ b/examples/llm_compression/openvino/smollm2_360m_codebook/README.md
@@ -0,0 +1,26 @@
+# Large Language Models Codebook Compression Example
+
+This example demonstrates how to apply codebook compression to the [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes.
+
+## Prerequisites
+
+To use this example:
+
+- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
+- Install dependencies:
+
+```bash
+pip install -U pip
+pip install -r requirements.txt
+pip install ../../../../
+```
+
+## Run Example
+
+To run the example:
+
+```bash
+python main.py
+```
+
+It will automatically download the baseline model, compress it, and save the resulting compressed model.
diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py
new file mode 100644
index 00000000000..a5b27104218
--- /dev/null
+++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+import numpy as np
+from optimum.intel.openvino import OVModelForCausalLM
+from torch.jit import TracerWarning
+from transformers import AutoTokenizer
+from transformers import logging
+
+import nncf
+
+logging.set_verbosity_error()
+warnings.filterwarnings("ignore", category=TracerWarning)
+
+
+MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
+COMPRESSED_MODEL_ID = "smollm2_360m_compressed_codebook"
+
+
+def generate_answers(
+    questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50
+) -> dict[str, str]:
+    """
+    Generate answers for a list of questions using the provided model and tokenizer.
+
+    :param questions: List of questions to be answered.
+    :param model: The model to use for generating answers.
+    :param tokenizer: The tokenizer to use for processing the input and output.
+    :param max_new_tokens: Maximum number of new tokens to generate for each answer. Defaults to 50.
+    :return: A dictionary mapping each question to its corresponding answer.
+    """
+    messages = [
+        {"role": "system", "content": "You are a chatbot who always responds as short as possible."},
+        {"role": "user", "content": "What is the capital of Spain?"},
+        {"role": "assistant", "content": "Madrid."},
+    ]
+    answers_by_questions = {}
+
+    for question in questions:
+        messages.append({"role": "user", "content": question})
+        input_ids = tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+        ).to(device=model.device)
+        input_len = len(input_ids[0])
+
+        output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
+        answer = tokenizer.decode(output[input_len:], skip_special_tokens=True)
+        answers_by_questions[question] = answer
+        messages.append({"role": "assistant", "content": answer})
+
+    return answers_by_questions
+
+
+def print_answers(header: str, answers_by_questions: dict[str, str]) -> None:
+    """
+    Print the answers to the console.
+
+    :param header: Header to print before the answers.
+    :param answers_by_questions: Dictionary mapping questions to their answers.
+    """
+    print(header)
+    for question, answer in answers_by_questions.items():
+        print(f"Q: {question}\nA: {answer}\n")
+
+
+QUESTIONS = [
+    "What is the capital of France?",
+    "What is the highest peak in the Alps?",
+    "What is the largest city in Canada?",
+    "What is the most visited city in Japan?",
+]
+
+
+def load_model_and_tokenizer(model_id: str, export: bool = True) -> tuple[OVModelForCausalLM, AutoTokenizer]:
+    """
+    Load the model and tokenizer from the specified model ID.
+
+    :param model_id: The identifier of the model to load.
+    :param export: Whether to export the model for OpenVINO. Defaults to True.
+    :return: A tuple containing the loaded model and tokenizer.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+    model = OVModelForCausalLM.from_pretrained(
+        model_id,
+        export=export,
+        load_in_8bit=False,
+    )
+    return model, tokenizer
+
+
+def default_codebook_example(model_id: str, compressed_model_id: str) -> list[str]:
+    """
+    Example of using the default codebook compression.
+
+    :param model_id: The identifier of the model to load.
+    :param compressed_model_id: The identifier for the compressed model to save.
+    :return: A list of answers generated by the model after compression.
+    """
+    model, tokenizer = load_model_and_tokenizer(model_id)
+    answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
+    print_answers("Non-optimized model outputs:\n", answers_by_questions)
+
+    model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64)
+    model.save_pretrained(compressed_model_id)
+    tokenizer.save_pretrained(compressed_model_id)
+
+    model, tokenizer = load_model_and_tokenizer(compressed_model_id, False)
+    answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
+    print_answers("Optimized model outputs:\n", answers_by_questions)
+
+    return list(answers_by_questions.values())
+
+
+def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str]:
+    """
+    Example of using a custom codebook for compression.
+
+    :param model_id: The identifier of the model to load.
+    :param compressed_model_id: The identifier for the compressed model to save.
+    :return: A list of answers generated by the model after compression.
+ """ + model, tokenizer = load_model_and_tokenizer(model_id) + + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) + print_answers("Non-optimized model outputs:\n", answers_by_questions) + + codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8) + + model.model = nncf.compress_weights( + model.model, + mode=nncf.CompressWeightsMode.CODEBOOK, + ratio=1.0, + group_size=-1, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), + ) + model.save_pretrained(compressed_model_id) + tokenizer.save_pretrained(compressed_model_id) + + model, tokenizer = load_model_and_tokenizer(compressed_model_id, False) + answers_by_questions = generate_answers(QUESTIONS, model, tokenizer) + print_answers("Optimized model outputs:\n", answers_by_questions) + + return list(answers_by_questions.values()) + + +def main(): + res = default_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID) + res += custom_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID + "_custom") + return res + + +if __name__ == "__main__": + main() diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt new file mode 100644 index 00000000000..feab3bfd695 --- /dev/null +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt @@ -0,0 +1,4 @@ +openvino==2025.1 +optimum-intel[openvino]>=1.22.0 +transformers>=4.48.0 +onnx==1.17.0 diff --git a/src/nncf/openvino/graph/metatypes/openvino_metatypes.py b/src/nncf/openvino/graph/metatypes/openvino_metatypes.py index c7726276e00..214bce563f1 100644 --- a/src/nncf/openvino/graph/metatypes/openvino_metatypes.py +++ b/src/nncf/openvino/graph/metatypes/openvino_metatypes.py @@ -817,7 +817,13 @@ def _is_embedding(node: ov.Node) -> bool: allowed_types_list = ["f16", "f32", "f64"] const_port_id = 0 input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: + input_type = input_tensor.get_element_type().get_type_name() + + # TODO(aanuf): Implement a pattern based check for embedding. + if node.friendly_name.endswith("nncf_codebook"): + return False + + if input_type in allowed_types_list: const_node = get_operation_const_op(node, const_port_id) if const_node is not None: return True diff --git a/src/nncf/openvino/graph/node_utils.py b/src/nncf/openvino/graph/node_utils.py index 32ed821b7d1..5faec5e904e 100644 --- a/src/nncf/openvino/graph/node_utils.py +++ b/src/nncf/openvino/graph/node_utils.py @@ -44,6 +44,7 @@ from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype from nncf.tensor import Tensor from nncf.tensor import TensorBackend +from nncf.tensor import TensorDataType InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node] @@ -685,3 +686,27 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = return opset.constant(x.data, name=name, shared_memory=True) const = opset.constant(x.data, dtype=dtype, name=name) return const + + +def create_ov_codebook_subgraph( + codebook: Tensor, indexes: Tensor, dtype: ov.Type, name: Optional[str] = None +) -> op.Constant: + """ + Create an OpenVINO subgraph with gather from the given codebook and indexes tensors. + + :param codebook: Codebook tensor. + :param indexes: Indexes tensor. + :param dtype: Data type of the indexes. + :param name: Optional name of the constant. + :return: OpenVINO subgraph. 
+ """ + codebook_const = opset.constant(codebook.data, name=name) + if codebook.dtype != TensorDataType.float16: + codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16) + + codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=name + "_nncf_codebook_idxs") + if dtype == ov.Type.u4: + codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8) + + const = opset.gather(codebook_const, codebook_indexes, 0, name=name + "_nncf_codebook") + return const diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index 2a11e4c3608..e22ea481abd 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -105,7 +105,7 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, -) -> tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. @@ -151,7 +151,7 @@ def do_float_quantization( compressed_weight = model([weight, precomputed_scale])[0] scale = precomputed_scale - return compressed_weight, scale + return compressed_weight, scale, None def integer_quantize_dequantize_weight( diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 0c8753f5530..e1269ea78e1 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -85,6 +85,8 @@ class CompressWeightsMode(StrEnum): :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + :param CODEBOOK: Codebook (LUT) quantization format. + :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. """ INT8_SYM = "int8_sym" @@ -92,8 +94,10 @@ class CompressWeightsMode(StrEnum): INT4_SYM = "int4_sym" INT4_ASYM = "int4_asym" NF4 = "nf4" + CB4_F8E4M3 = "cb4_f8e4m3" INT8 = "int8" # Deprecated mode E2M1 = "e2m1" + CODEBOOK = "codebook" @api(canonical_alias="nncf.CompressionFormat") diff --git a/src/nncf/quantization/advanced_parameters.py b/src/nncf/quantization/advanced_parameters.py index 10f18b34eae..4de0152188f 100644 --- a/src/nncf/quantization/advanced_parameters.py +++ b/src/nncf/quantization/advanced_parameters.py @@ -29,6 +29,8 @@ from nncf.quantization.range_estimator import RangeEstimatorParameters from nncf.quantization.range_estimator import StatisticsType +TTensor = Any + @api(canonical_alias="nncf.OverflowFix") class OverflowFix(StrEnum): @@ -379,6 +381,9 @@ class AdvancedCompressionParameters: :type lora_adapter_rank: int :param backend_params: Backend-specific parameters. :type backend_params: dict[str, Any] + :param codebook: The codebook (LUT) for the weight compression. + Applicable for vector quantization. Must be a numpy array or ov Tensor. 
+    :type codebook: TTensor
     """

     statistics_path: Optional[str] = None
@@ -390,6 +395,7 @@
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
     lora_adapter_rank: int = 256
     backend_params: dict[str, Any] = field(default_factory=dict)
+    codebook: Optional[TTensor] = None


 @api()
diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py
index 8f75dbc4013..7ab4d2d1813 100644
--- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -39,6 +39,7 @@
 from nncf.quantization.algorithms.algorithm import Algorithm
 from nncf.quantization.algorithms.weight_compression.awq import AWQ
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.constants import CB4_QUANTILES
 from nncf.quantization.algorithms.weight_compression.gptq import GPTQ
 from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
 from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
@@ -46,6 +47,8 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
 from nncf.scopes import IgnoredScope
 from nncf.scopes import get_ignored_node_names_from_ignored_scope
+from nncf.tensor import Tensor
+from nncf.tensor import functions as fns
 from nncf.tensor.definitions import TensorDataType

 TModel = TypeVar("TModel")
@@ -179,6 +182,24 @@ def check_user_compression_configuration(
         ]
     )
     ranks = [advanced_parameters.lora_adapter_rank, advanced_parameters.lora_correction_params.adapter_rank]
+
+    codebook = advanced_parameters.codebook
+    if codebook is not None:
+        # OpenVINO Tensor does not support the functions needed to validate the codebook, so convert it to numpy
+        np_codebook = Tensor(codebook).as_numpy_tensor()
+        msg = None
+        if np_codebook.ndim != 1:
+            msg = "The codebook must be a 1D array, but a multi-dimensional array is given."
+        elif np_codebook.size < 2:
+            msg = (
+                "The codebook must contain at least two unique elements, "
+                "but a single-element or empty array is given."
+            )
+        elif fns.any(np_codebook[:-1] >= np_codebook[1:]):
+            msg = "The codebook must be a sorted 1D array with unique elements, but an unsorted array is given."
+        if msg:
+            raise nncf.ValidationError(msg)
+
     for size in values_to_check:
         if size <= 0:
             msg = f"The subset_size value should be positive, but subset_size={size} is given."
@@ -207,6 +228,10 @@
         msg = "LoRA Correction algorithm is not compatible with FQ, FQ_LORA and FQ_LORA_NLS compression formats."
         raise nncf.ValidationError(msg)

+    if mode == CompressWeightsMode.CODEBOOK and (advanced_parameters is None or advanced_parameters.codebook is None):
+        msg = "Codebook compression mode requires codebook parameters to be specified in advanced_parameters."
+ raise nncf.ValidationError(msg) + class WeightCompression(Algorithm): """ @@ -293,7 +318,7 @@ def __init__( advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters() ) - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) + primary_config = self._get_primary_config() criterion_cls = MIXED_PRECISION_CRITERIA.get(self._sensitivity_metric) self._mixed_precision_algo = criterion_cls(primary_config, self._ratio, self._subset_size) self._statistics_path = self._advanced_parameters.statistics_path @@ -429,6 +454,20 @@ def _get_ratio_defining_params( return ratio_defining_params + def _get_primary_config(self): + codebook_values = None + + if self._mode == CompressWeightsMode.CB4_F8E4M3: + codebook_values = Tensor(CB4_QUANTILES) + elif self._mode == CompressWeightsMode.CODEBOOK: + codebook_values = Tensor(self._advanced_parameters.codebook) + + return WeightCompressionConfig( + mode=self._mode, + group_size=self._group_size, + codebook_values=codebook_values, + ) + def _set_weight_compression_config( self, ratio_defining_params: list[WeightCompressionParameters], @@ -445,7 +484,7 @@ def _set_weight_compression_config( :param graph: The model graph associated with the model. :param statistics_points: Statistics points. """ - primary_config = WeightCompressionConfig(mode=self._mode, group_size=self._group_size) + primary_config = self._get_primary_config() if self._ratio == 1: for weight_param in ratio_defining_params: weight_param.compression_config = primary_config @@ -653,13 +692,13 @@ def apply( # del is used to prematurely mark non-necessary data as free for garbage collection del self.awq_algo - scales = {} - zero_points = {} + precomputed_compressed_weights = None lora_correction_algo = None description = "Applying Weight Compression" + if self._gptq: del statistics - model, scales, zero_points = self._gptq_algo.apply( + model, precomputed_compressed_weights = self._gptq_algo.apply( model=model, graph=graph, dataset=dataset, @@ -668,7 +707,7 @@ def apply( ) else: if self._scale_estimation: - scales, zero_points = self._scale_estimation_algo.apply( + precomputed_compressed_weights = self._scale_estimation_algo.apply( model=model, graph=graph, all_weight_params=all_weight_params, @@ -691,8 +730,7 @@ def apply( model, graph, track(all_weight_params, description=description, weights=all_weight_sizes), - scales, - zero_points, + precomputed_compressed_weights, lora_correction_algo, self._compression_format, self._advanced_parameters, diff --git a/src/nncf/quantization/algorithms/weight_compression/awq.py b/src/nncf/quantization/algorithms/weight_compression/awq.py index fbab09a1fdf..fa423828fc1 100644 --- a/src/nncf/quantization/algorithms/weight_compression/awq.py +++ b/src/nncf/quantization/algorithms/weight_compression/awq.py @@ -25,7 +25,6 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.algorithm import Algorithm from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend @@ -250,7 +249,7 @@ def _data_aware_step(self, wp, weight, statistics): for _ in range(self._steps): cur_scale = gscale**alpha weights_to_fake_quantize = gweight * cur_scale - if config.mode == 
CompressWeightsMode.NF4: + if not config.is_integer: g_decompressed_weighs = float_quantize_dequantize_weight( weights_to_fake_quantize, awq_config, reduction_axis ) diff --git a/src/nncf/quantization/algorithms/weight_compression/backend.py b/src/nncf/quantization/algorithms/weight_compression/backend.py index 62d0745a0f4..e2257168ad3 100644 --- a/src/nncf/quantization/algorithms/weight_compression/backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/backend.py @@ -28,6 +28,7 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -148,8 +149,7 @@ def transform_model( model: TModel, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -160,8 +160,7 @@ def transform_model( :param model: Model in which the weights will be compressed according to the weight compression description. :param graph: The graph associated with the model. :param weight_compression_parameters: An iterable of weight compression parameters. - :param precomputed_scales: Precomputed scales for weight compression. - :param precomputed_zero_points: Precomputed zero points for weight compression. + :param precomputed_compressed_weights: Precomputed scales, zero points, or codebook for weight compression. :param lora_correction_algo: An optional algorithm to reduce quantization noise after weight compression by using low-rank adapters. This algorithm not only overrides weights with their quantized counterparts but also expands the model's execution graph following the Low-Rank Adaptation (LoRA) concept. diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index 63ed892c472..1d5376b3454 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -18,6 +18,7 @@ from nncf.parameters import CompressWeightsMode TWeightType = TypeVar("TWeightType") +TTensor = TypeVar("TTensor") @dataclass @@ -28,10 +29,14 @@ class WeightCompressionConfig: :param mode: Defines a mode for weight compression. Defaults to INT8_ASYM mode. :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). The value -1 means no grouping. Defaults to -1. + :param codebook_values: Optional codebook values for CODEBOOK compression mode. + Must be fns.Tensor which wraps numpy array or ov tensor. Storing ov tensor is useful for having + destination data type information available. 
""" mode: Optional[CompressWeightsMode] = CompressWeightsMode.INT8_ASYM group_size: Optional[int] = -1 + codebook_values: Optional[TTensor] = None @property def num_bits(self): @@ -49,7 +54,22 @@ def is_integer(self): """ :return: True if compression type in integer, else False. """ - return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + return self.mode not in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ] + + @property + def is_codebook(self): + """ + :return: True if compression type is codebook, else False. + """ + return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + + def get_numpy_codebook(self): + return self.codebook_values.as_numpy_tensor() def __hash__(self): return hash((self.mode.value, self.group_size)) diff --git a/src/nncf/quantization/algorithms/weight_compression/constants.py b/src/nncf/quantization/algorithms/weight_compression/constants.py new file mode 100644 index 00000000000..6119fd8f83c --- /dev/null +++ b/src/nncf/quantization/algorithms/weight_compression/constants.py @@ -0,0 +1,79 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +NF4_QUANTILES = np.array( + [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ], + dtype=np.float32, +) + + +CB4_QUANTILES = np.array( + [ + -3.5, + -2.5, + -1.875, + -1.375, + -1.0, + -0.625, + -0.3125, + 0.0, + 0.28125, + 0.5625, + 0.875, + 1.125, + 1.5, + 2.0, + 2.5, + 3.5, + ], + dtype=np.float32, +) + + +CENTER_OF_NF4_QUANTILES = np.array( + [ + -0.84809643, + -0.6106329, + -0.45999527, + -0.33967942, + -0.2346074, + -0.13791174, + -0.045525018, + 0.03979015, + 0.120255254, + 0.20352125, + 0.29201376, + 0.38931254, + 0.5016634, + 0.6427869, + 0.8614784, + ], + dtype=np.float32, +) diff --git a/src/nncf/quantization/algorithms/weight_compression/gptq.py b/src/nncf/quantization/algorithms/weight_compression/gptq.py index 1de6f549851..814ec4a2a6b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/src/nncf/quantization/algorithms/weight_compression/gptq.py @@ -25,6 +25,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_float_quantization_params from 
nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_integer_quantization_params @@ -83,7 +84,7 @@ def apply( weight_compression_parameters: list[WeightCompressionParameters], statistic_points: Optional[StatisticPointsContainer] = None, backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> tuple[TModel, dict[str, Tensor], dict[str, Tensor]]: + ) -> tuple[TModel, dict[str, CompressedWeight]]: """ Applies the GPTQ algorithm to quantize the weights of the given model. @@ -99,8 +100,7 @@ def apply( if self._backend_entity is None: self._set_backend_entity(model) - scales = {} - zero_points = {} + res = {} target_nodes = [] target_nodes_wc_params_map = {} @@ -123,10 +123,9 @@ def apply( _, input_tensors = next(iter(inputs.items())) hessian = self._calculate_hessian(node, input_tensors) scale, zero_point = self._quantize_weights(model, graph, wc_params, hessian, input_tensors) - scales[wc_params.weight_name] = scale - zero_points[wc_params.weight_name] = zero_point + res[wc_params.weight_name] = CompressedWeight(None, scale, zero_point, None) - return model, scales, zero_points + return model, res def get_statistic_points( self, @@ -235,7 +234,9 @@ def _quantize_weights( else weight_tensor.shape[1] ) reduction_axes = wc_params.reduction_axes - block_compression_config = WeightCompressionConfig(mode=wc_params.compression_config.mode) + block_compression_config = WeightCompressionConfig( + mode=wc_params.compression_config.mode, codebook_values=wc_params.compression_config.codebook_values + ) damp = self._damp_percent * fns.mean(fns.diag(hessian)) diag_indices = fns.arange(columns, backend=hessian.backend, device=hessian.device) @@ -260,7 +261,7 @@ def _quantize_weights( hessian_diag_val = hessian_inv_block[i, i] if (i1 + i) % group_size == 0: - if block_compression_config.mode == CompressWeightsMode.NF4: + if not block_compression_config.is_integer: scale = calculate_float_quantization_params( weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, block_compression_config ) @@ -284,7 +285,7 @@ def _quantize_weights( scales.append(scale) zero_points.append(zero_point) - if block_compression_config.mode == CompressWeightsMode.NF4: + if not block_compression_config.is_integer: quantized_col = float_quantize_dequantize_weight( fns.unsqueeze(weight_col, 1), block_compression_config, diff --git a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py index 2bac7048639..0e7e1897813 100644 --- a/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/onnx_backend.py @@ -49,7 +49,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType @@ -201,8 +201,7 @@ def transform_model( model: onnx.ModelProto, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: 
dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), @@ -214,12 +213,12 @@ def transform_model( compression_config = wc_params.compression_config node = wc_params.node_with_weight weight = self.get_weight(node, wc_params.weight_port_id, model, graph) + precomputed_compressed_weights = precomputed_compressed_weights or {} compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + precomputed_compressed_weights.get(wc_params.weight_name), ) dequantize_block_size = max(compression_config.group_size, 0) # 0 - is no block wise quantization dequantize_axis = ( diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 7c1838eb8d2..6215fb4b1ee 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -32,6 +32,7 @@ from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer from nncf.openvino.graph.node_utils import convert_op +from nncf.openvino.graph.node_utils import create_ov_codebook_subgraph from nncf.openvino.graph.node_utils import create_ov_const_from_tensor from nncf.openvino.graph.node_utils import get_const_value_as_numpy_tensor from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor @@ -57,6 +58,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType @@ -217,8 +219,7 @@ def _create_compression_subgraph( weight_port_id: int, const_dtype, should_add_convert_node: bool, - layer_scales: Optional[Tensor] = None, - layer_zero_points: Optional[Tensor] = None, + precomputed_compressed_weight: Optional[CompressedWeight] = None, ): scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: @@ -234,32 +235,47 @@ def _create_compression_subgraph( compression_dtype = ov.Type.i8 elif compression_config.mode == CompressWeightsMode.INT8_ASYM: compression_dtype = ov.Type.u8 + elif compression_config.is_codebook: + compression_dtype = None else: msg = f"{compression_config.mode.value} is not supported." 
raise nncf.ParameterNotSupportedError(msg) original_shape = weight.shape + with disable_results_caching(OV_MODEL_CACHE): compressed_weight = compress_weight( weight, reduction_axes, compression_config, - layer_scales, - layer_zero_points, + precomputed_compressed_weight, ) - compressed_const = create_ov_const_from_tensor( - compressed_weight.tensor, compression_dtype, name=const_node_name - ) - converted_const = opset.convert(compressed_const, ov.Type.f16) - if compressed_weight.zero_point is not None: - zero_point_const = create_ov_const_from_tensor( - compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" + if compression_config.is_codebook: + n_quants = compressed_weight.codebook.size - 1 + compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) + converted_const = create_ov_codebook_subgraph( + codebook=compressed_weight.codebook + if compression_config.mode == CompressWeightsMode.CODEBOOK + else compressed_weight.codebook.as_openvino_tensor().astype(TensorDataType.f8e4m3), + indexes=compressed_weight.tensor, + dtype=compression_dtype, + name=const_node_name, ) - zero_point_const = opset.convert(zero_point_const, ov.Type.f16) - converted_const = opset.subtract( - converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" + else: + compressed_const = create_ov_const_from_tensor( + compressed_weight.tensor, compression_dtype, name=const_node_name ) + converted_const = opset.convert(compressed_const, ov.Type.f16) + + if compressed_weight.zero_point is not None: + zero_point_const = create_ov_const_from_tensor( + compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" + ) + zero_point_const = opset.convert(zero_point_const, ov.Type.f16) + converted_const = opset.subtract( + converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" + ) scale_const = create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") scale_const = convert_op(scale_const, ov.Type.f16) @@ -282,9 +298,8 @@ def transform_model( model: ov.Model, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> ov.Model: @@ -308,10 +323,6 @@ def transform_model( should_add_convert_node = True break - layer_scales = None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name) - layer_zero_points = ( - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) - ) try: mul, compressed_weight = self._create_compression_subgraph( weight=weight, @@ -321,8 +332,9 @@ def transform_model( weight_port_id=wc_params.weight_port_id, const_dtype=const_dtype, should_add_convert_node=should_add_convert_node, - layer_scales=layer_scales, - layer_zero_points=layer_zero_points, + precomputed_compressed_weight=None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git 
a/src/nncf/quantization/algorithms/weight_compression/parameters.py b/src/nncf/quantization/algorithms/weight_compression/parameters.py new file mode 100644 index 00000000000..fb27775997d --- /dev/null +++ b/src/nncf/quantization/algorithms/weight_compression/parameters.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional + +from nncf.tensor import Tensor + + +@dataclass +class CompressedWeight: + """ + Compressed weight and decompression parameters. + + :param tensor: The tensor with compressed weight. + :param scale: The decompression scale, in practice it is dequantization scale for the quantization. + :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 + in the non-compression realm. Applicable for INT quantization. + :param codebook: The codebook (LUT) for the weight compression. Applicable for vector quantization + """ + + tensor: Optional[Tensor] = None + scale: Optional[Tensor] = None + zero_point: Optional[Tensor] = None + codebook: Optional[Tensor] = None + + def is_codebook(self): + """ + Check if the compressed weight is a codebook. + + :return: True if the compressed weight is a codebook, False otherwise. 
+ """ + return self.codebook is not None and self.tensor is not None and self.scale is not None diff --git a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 34921ad1563..d7c63c3d1e8 100644 --- a/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/src/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -18,12 +18,12 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight @@ -98,7 +98,7 @@ def apply( all_weight_params: list[WeightCompressionParameters], statistics: dict[str, WCTensorStatistic], backend_entity: Optional[WeightCompressionAlgoBackend] = None, - ) -> tuple[dict[str, Tensor], dict[str, Tensor]]: + ) -> dict[str, CompressedWeight]: """ Estimates better scale for the int4 nodes in the model. 
Minimizes per-group difference between floating point MatMul and @@ -118,7 +118,7 @@ def apply( self._backend_entity = backend_entity if self._backend_entity is None: self._set_backend_entity(model) - scales, zero_points = dict(), dict() + res = dict() invalid_node_names = [] first_caught_error = None @@ -128,7 +128,7 @@ def apply( config = wp.compression_config if config.num_bits != 4 or node_name not in statistics: - scales[weight_name] = None + res[weight_name] = CompressedWeight() continue stats = statistics[node_name] @@ -141,7 +141,7 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) try: - scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( + scale, zero_point = self.calculate_quantization_params( stats, weight, wp.reduction_axes, @@ -151,6 +151,7 @@ def apply( self._scale_steps, self._weight_penalty, ) + res[weight_name] = CompressedWeight(None, scale, zero_point, None) except nncf.InvalidGroupSizeError as error: first_caught_error = error invalid_node_names.append(wp.node_with_weight.node_name) @@ -158,7 +159,7 @@ def apply( if first_caught_error: handle_invalid_group_size_error(first_caught_error, invalid_node_names) - return scales, zero_points + return res @staticmethod def calculate_quantization_params( @@ -211,7 +212,7 @@ def calculate_quantization_params( cur_config.group_size = group_size original_weight = fns.zeros_like(weight) + weight - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: q_weights, compressed_weights, scale = float_quantize_dequantize_weight( original_weight, cur_config, reduction_axis, return_compressed_weight=True ) @@ -260,7 +261,7 @@ def calculate_quantization_params( near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out = float_quantize_dequantize_weight( original_weight, config, @@ -298,8 +299,8 @@ def calculate_quantization_params( result_scale = near_to_ideal_scale if i < initial_steps - 1: - if config.mode == CompressWeightsMode.NF4: - out, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) + if not config.is_integer: + out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out, _, _ = do_integer_quantization( original_weight, @@ -316,8 +317,8 @@ def calculate_quantization_params( factor = 1.0 - 0.05 * scale_step scaled_scale = factor * scale - if config.mode == CompressWeightsMode.NF4: - out, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) + if not config.is_integer: + out, _, _ = do_float_quantization(original_weight, config, precomputed_scale=scaled_scale) else: out, _, _ = do_integer_quantization( original_weight, @@ -332,7 +333,7 @@ def calculate_quantization_params( near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - if config.mode == CompressWeightsMode.NF4: + if not config.is_integer: out = float_quantize_dequantize_weight(original_weight, config, precomputed_scale=near_to_ideal_scale) else: out = integer_quantize_dequantize_weight( diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index 13e6abc751a..7e5c348f3a9 100644 --- 
a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -48,7 +48,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType @@ -456,9 +456,8 @@ def transform_model( model: Union[GraphModelWrapper, torch.nn.Module], graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> NNCFNetwork: @@ -489,13 +488,13 @@ def transform_model( raise nncf.InternalError(msg) try: + precomputed_compressed_weights = precomputed_compressed_weights or {} # calculates compressed weights and decompression parameters compressed_weight = compress_weight( Tensor(weight), wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 2650f16600c..396f125ca7b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -43,6 +43,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.handle_errors import handle_invalid_group_size_error from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.algorithms.weight_compression.torch_backend import PTAWQAlgoAlgoBackend from nncf.quantization.algorithms.weight_compression.torch_backend import PTMixedPrecisionAlgoBackend from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend @@ -189,9 +190,8 @@ def transform_model( model: torch.fx.GraphModule, graph: NNCFGraph, weight_compression_parameters: Iterable[WeightCompressionParameters], - precomputed_scales: dict[str, Tensor] = None, - precomputed_zero_points: dict[str, Tensor] = None, - lora_correction_algo: LoraCorrectionAlgorithm = None, + precomputed_compressed_weights: Optional[dict[str, CompressedWeight]] = 
None, + lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None, compression_format: CompressionFormat = CompressionFormat.DQ, advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(), ) -> torch.fx.GraphModule: @@ -218,8 +218,9 @@ def transform_model( weight, wc_params.reduction_axes, compression_config, - None if precomputed_scales is None else precomputed_scales.get(wc_params.weight_name), - None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name), + None + if precomputed_compressed_weights is None + else precomputed_compressed_weights.get(wc_params.weight_name), ) except nncf.InvalidGroupSizeError as error: first_caught_error = error diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 12522e0fa60..8a1ee8f9b40 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -9,17 +9,19 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from dataclasses import dataclass from typing import Optional, Union -import numpy as np - import nncf from nncf.common.logging.logger import nncf_logger from nncf.common.utils.backend import is_openvino_at_least from nncf.common.utils.backend import is_openvino_available +from nncf.errors import InvalidGroupSizeError +from nncf.errors import UnsupportedModelError from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns @@ -28,69 +30,9 @@ ReductionAxes = Union[int, tuple[int, ...]] -NF4_QUANTILES = np.array( - [ - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 0.7229568362236023, - 1.0, - ], - dtype=np.float32, -) - -CENTER_OF_NF4_QUANTILES = np.array( - [ - -0.84809643, - -0.6106329, - -0.45999527, - -0.33967942, - -0.2346074, - -0.13791174, - -0.045525018, - 0.03979015, - 0.120255254, - 0.20352125, - 0.29201376, - 0.38931254, - 0.5016634, - 0.6427869, - 0.8614784, - ], - dtype=np.float32, -) - - MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 -@dataclass -class CompressedWeight: - """ - Compressed weight and decompression parameters. - - :param tensor: The tensor with compressed weight. - :param scale: The decompression scale, in practice it is dequantization scale for the INT quantization. - :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 - in the non-compression realm. Applicable for INT quantization. 
- """ - - tensor: Tensor - scale: Tensor - zero_point: Optional[Tensor] = None - - def reshape_weight_for_grouped_quantization( weight: Tensor, reduction_axes: ReductionAxes, group_size: int ) -> tuple[Tensor, int]: @@ -109,11 +51,11 @@ def reshape_weight_for_grouped_quantization( reduction_axes = reduction_axes[0] if not isinstance(reduction_axes, int): msg = f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - raise nncf.UnsupportedModelError(msg) + raise UnsupportedModelError(msg) channel_size = weight.shape[reduction_axes] if channel_size % group_size != 0: msg = f"Channel size {channel_size} should be divisible by size of group {group_size}." - raise nncf.InvalidGroupSizeError(msg) + raise InvalidGroupSizeError(msg) num_groups_per_channel = channel_size // group_size shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis @@ -124,7 +66,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ Calculates the scale for nf4 or e2m1 quantization. @@ -132,22 +74,23 @@ def calculate_float_quantization_params( :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). :param config: Weight compression configuration. - :param max_val: Maximal value of e2m1 type. :return: Scale tensor of float32 type for float quantization. """ - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + assert not config.is_integer if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) + if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: + max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else fns.max(fns.abs(config.get_numpy_codebook())) + scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero eps = fns.finfo(weight).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) if config.mode == CompressWeightsMode.E2M1: - scale = scale / max_val scale = fns.log2(scale) scale = fns.ceil(scale) scale = fns.clip(scale, -127, 127) @@ -177,20 +120,21 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, -) -> tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For E2M1 currently returns normalized weight without quantization. + For E2M1 and CODEBOOK currently returns normalized weight without quantization. TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and + optional indexes for codebook. 
""" - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + assert not config.is_integer if config.group_size != -1 and reduction_axes is not None: # weights are reshaped: [a1, r, a2] -> [a1, r//gs, gs, a2] @@ -218,10 +162,15 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) else: compressed_weight = _calculate_nf4_quantized_weight(norm_weight) + elif config.is_codebook: + compressed_weight, indexes = _calculate_codebook_quantized_weight( + norm_weight, quantiles=config.get_numpy_codebook() + ) + return compressed_weight, scale, indexes else: # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved compressed_weight = norm_weight - return compressed_weight, scale + return compressed_weight, scale, None def float_quantize_dequantize_weight( @@ -242,11 +191,11 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. """ - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved # Optimized implementation - if _can_run_optimized(weight): + if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): from nncf.openvino.optimized_functions import ( float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov, ) @@ -260,7 +209,7 @@ def float_quantize_dequantize_weight( ) # Reference implementation - compressed_weight, scale = do_float_quantization(weight, config, reduction_axes, precomputed_scale) + compressed_weight, scale, _ = do_float_quantization(weight, config, reduction_axes, precomputed_scale) decompressed_weight = do_float_dequantization(compressed_weight, scale) if return_compressed_weight: return decompressed_weight, compressed_weight, scale @@ -350,8 +299,7 @@ def compress_weight( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, + precomputed_compressed_weight: CompressedWeight = None, ) -> CompressedWeight: """ Compress weight using compression configuration. @@ -359,13 +307,26 @@ def compress_weight( :param weight: The weight to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Compression configuration. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. + :param precomputed_compressed_weight: Contains precomputed scale and zero point. 
:return: The compressed weight and decompression parameters as instance of CompressedWeight """ + precomputed_scale, precomputed_zero_point = ( + (precomputed_compressed_weight.scale, precomputed_compressed_weight.zero_point) + if precomputed_compressed_weight + else (None, None) + ) + if not config.is_integer: - compressed_weight, scale = do_float_quantization(weight, config, reduction_axes, precomputed_scale) - return CompressedWeight(compressed_weight, scale) + compressed_weight, scale, indexes = do_float_quantization(weight, config, reduction_axes, precomputed_scale) + if indexes is not None: + return CompressedWeight( + indexes, + scale, + None, + config.codebook_values, + ) + else: + return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_integer_quantization( weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) @@ -537,6 +498,32 @@ def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: return quantized_weight +def _calculate_codebook_quantized_weight( + norm_weight: Tensor, quantiles: Tensor = None, center_of_quantiles: Tensor = None +) -> tuple[Tensor, Tensor]: + """ + Performs quantization by quantiles (if center_of_quantiles is None). Look-up table is used to + "round" or "quantize" to the closest quant. + + :param norm_weight: Weight tensor to quantize already normalized to quantiles range. + :param quantiles: Quantiles to use for quantization. If None, the center_of_quantiles must be provided. + :param center_of_quantiles: Center of quantiles to use for quantization. If None, it is calculated as the average + of adjacent quantiles. + :return: Tensor with floating-point values, where each of them corresponds to elements from quantiles. + """ + assert quantiles is not None or center_of_quantiles is not None, ( + "Either quantiles or center_of_quantiles should be provided" + ) + + if center_of_quantiles is None: + center_of_quantiles = 0.5 * (quantiles[1:] + quantiles[:-1]) + center_of_quantiles = fns.from_numpy(center_of_quantiles, backend=norm_weight.backend) + indexes = fns.searchsorted(center_of_quantiles, norm_weight) + quantiles = fns.from_numpy(quantiles, backend=indexes.backend) + quantized_weight = quantiles[indexes] + return quantized_weight, indexes + + def _calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: """ Normalizes the weight tensor using the provided scale. diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index f5435197e3c..340f5983f2b 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -516,8 +516,13 @@ def compress_weights( from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "Torch backend does not support NF4 and E2M1 modes for weight compression." + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: + msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." 
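+            # NOTE: codebook-based modes (CODEBOOK, CB4_F8E4M3) are currently produced only for the OpenVINO backend.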
raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -560,8 +565,13 @@ def compress_weights( compress_weights_impl as fx_compression_weights_impl, ) - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "Torch backend does not support NF4 and E2M1 modes for weight compression." + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: + msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { @@ -614,8 +624,13 @@ def compress_weights( elif backend == BackendType.ONNX: from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl - if mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]: - msg = "ONNX backend does not support NF4 and E2M1 modes for weight compression." + if mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ]: + msg = "ONNX backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json index fa2080ba064..e027db051c2 100644 --- a/tests/cross_fw/examples/example_scope.json +++ b/tests/cross_fw/examples/example_scope.json @@ -283,6 +283,23 @@ ] } }, + "codebook_llm_compression": { + "backend": "openvino", + "requirements": "examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt", + "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz", + "accuracy_metrics": { + "answers": [ + "Paris.", + "Mont Blanc.", + "Toronto.", + "Tokyo.", + "Paris.", + "Mont Blanc.", + "Toronto.", + "Fukuoka." 
+ ] + } + }, "llm_compression_distillation_qat_with_lora": { "backend": "torch", "device": "cuda", diff --git a/tests/cross_fw/examples/run_example.py b/tests/cross_fw/examples/run_example.py index 2931fba1186..c9ead09f7bf 100644 --- a/tests/cross_fw/examples/run_example.py +++ b/tests/cross_fw/examples/run_example.py @@ -200,6 +200,12 @@ def fp8_llm_quantization() -> dict[str, float]: return {"answers": list(result.values())} +def codebook_llm_compression() -> list[str]: + from examples.llm_compression.openvino.smollm2_360m_codebook.main import main as codebook_llm_compression_main + + return codebook_llm_compression_main() + + def llm_compression_distillation_qat_with_lora() -> float: from examples.llm_compression.torch.distillation_qat_with_lora.main import main as distillation_qat_with_lora_main diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json new file mode 100644 index 00000000000..578b2cc53d3 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f16_u4.json @@ -0,0 +1,61 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + 171, + 253, + 154, + 172, + 217, + 235, + 250, + 155, + 253, + 252, + 188, + 253, + 207, + 206, + 253, + 236, + 254, + 233, + 255, + 248, + 255 + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.99560546875 + ] + ], + [ + [ + 1.177734375 + ] + ], + [ + [ + 1.193359375 + ] + ], + [ + [ + 1.244140625 + ] + ], + [ + [ + 1.1650390625 + ] + ], + [ + [ + 1.2265625 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json new file mode 100644 index 00000000000..abf99c05ca4 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_f8e4m3_u8.json @@ -0,0 +1,106 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + [ + [ + 14, + 12, + 16, + 20, + 13, + 11, + 15 + ] + ], + [ + [ + 12, + 11, + 16, + 13, + 17, + 12, + 20 + ] + ], + [ + [ + 14, + 11, + 17, + 20, + 15, + 20, + 15 + ] + ], + [ + [ + 14, + 16, + 20, + 19, + 15, + 18, + 15 + ] + ], + [ + [ + 16, + 18, + 14, + 18, + 18, + 20, + 11 + ] + ], + [ + [ + 17, + 19, + 20, + 10, + 19, + 20, + 20 + ] + ] + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.2275390625 + ] + ], + [ + [ + 0.269287109375 + ] + ], + [ + [ + 0.272705078125 + ] + ], + [ + [ + 0.284423828125 + ] + ], + [ + [ + 0.266357421875 + ] + ], + [ + [ + 0.2802734375 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json new file mode 100644 index 00000000000..acf5ad93048 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_i8_u8.json @@ -0,0 +1,106 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + [ + [ + 14, + 12, + 16, + 20, + 13, + 11, + 15 + ] + ], + [ + [ + 12, + 11, + 16, + 13, + 17, + 12, + 20 + ] + ], + [ + [ + 14, + 11, + 17, + 20, + 15, + 20, + 15 + ] + ], + [ + [ + 14, + 16, + 20, + 20, + 15, + 18, + 15 + ] + ], + [ + [ + 16, + 18, + 14, + 18, + 18, + 20, + 11 + ] + ], + [ + [ + 17, + 19, + 20, + 10, + 19, + 20, + 20 + ] + ] + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.07965087890625 + ] + ], + [ + [ + 
0.09423828125 + ] + ], + [ + [ + 0.095458984375 + ] + ], + [ + [ + 0.0994873046875 + ] + ], + [ + [ + 0.09320068359375 + ] + ], + [ + [ + 0.09814453125 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json new file mode 100644 index 00000000000..8642e52a868 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_codebook_u8_u4.json @@ -0,0 +1,61 @@ +{ + "matmul_2_data_nncf_codebook_idxs": { + "indexes": [ + 54, + 248, + 20, + 56, + 145, + 181, + 243, + 38, + 250, + 247, + 104, + 249, + 126, + 123, + 217, + 199, + 251, + 178, + 254, + 208, + 255 + ] + }, + "matmul_2_data": { + "scale": [ + [ + [ + 0.0531005859375 + ] + ], + [ + [ + 0.06280517578125 + ] + ], + [ + [ + 0.06365966796875 + ] + ], + [ + [ + 0.06634521484375 + ] + ], + [ + [ + 0.0621337890625 + ] + ], + [ + [ + 0.0654296875 + ] + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json new file mode 100644 index 00000000000..b8712bf3839 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json @@ -0,0 +1,178 @@ +{ + "matmul_2_data": { + "scale": [ + [ + [ + 0.2275390625 + ] + ], + [ + [ + 0.269287109375 + ] + ], + [ + [ + 0.272705078125 + ] + ], + [ + [ + 0.284423828125 + ] + ], + [ + [ + 0.266357421875 + ] + ], + [ + [ + 0.2802734375 + ] + ] + ] + }, + "matmul_1_data": { + "compressed_weight": [ + [ + 119, + 168, + 11, + 49, + 255, + 255 + ], + [ + 255, + 159, + 255, + 255, + 255, + 255 + ], + [ + 255, + 169, + 59, + 255, + 228, + 135 + ], + [ + 202, + 255, + 255, + 149, + 238, + 134 + ], + [ + 229, + 130, + 151, + 255, + 87, + 240 + ], + [ + 26, + 255, + 245, + 75, + 255, + 18 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0025196075439453125 + ], + [ + 0.0024051666259765625 + ], + [ + 0.002300262451171875 + ], + [ + 0.0024013519287109375 + ], + [ + 0.0025997161865234375 + ], + [ + 0.003208160400390625 + ] + ] + }, + "gather_2_data": { + "compressed_weight": [ + [ + 181, + 77, + 12, + 5, + 231, + 255 + ], + [ + 166, + 200, + 149, + 255, + 223, + 1 + ], + [ + 255, + 10, + 224, + 54, + 255, + 166 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0035152435302734375 + ], + [ + 0.0036563873291015625 + ], + [ + 0.003253936767578125 + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 2e15dc9a0c5..936d5d53329 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -47,6 +47,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization from 
nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization @@ -229,6 +230,57 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): } +def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m3): + assert op.get_element_type() == dtype + + if dtype == ov.Type.f16: + convert_node = op + else: + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + + gather_node = get_next_node(convert_node) + assert gather_node.get_type_name() == "Gather" + + weight_shape = gather_node.shape + # NOTE: get_const_value_as_numpy_tensor doesn't work for 4-bit types + assert list(weight_shape)[-1] == group_size + reduced_weight_shape = list(weight_shape) + reduced_weight_shape[-1] = 1 + + mul_node = get_next_node(gather_node) + assert mul_node.get_type_name() == "Multiply" + scale_node = mul_node.input_value(1).get_node() + assert list(scale_node.shape) == reduced_weight_shape + + reshape_node = get_next_node(mul_node) + assert reshape_node.get_type_name() == "Reshape" + + convert_node = get_next_node(reshape_node) + assert convert_node.get_type_name() == "Convert" + + return { + "scale": get_const_value_as_numpy_tensor(scale_node), + } + + +def check_codebook_indexes(op: ov.Node, dtype=ov.Type.u4): + assert op.get_element_type() == dtype + + if dtype == ov.Type.u4: + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + else: + convert_node = op + + gather_node = get_next_node(convert_node) + assert gather_node.get_type_name() == "Gather" + + return { + "indexes": get_const_value_as_numpy_tensor(op), + } + + def check_int4_sym_grouped(op: ov.Node): return check_int4_grouped(op, mode=CompressWeightsMode.INT4_SYM) @@ -256,6 +308,7 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): (CompressWeightsMode.INT4_SYM, 7, get_mixed_mapping(check_int4_sym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.INT4_ASYM, 7, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.NF4, 7, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_codebook_grouped, TEST_MODELS[IntegerModel])), ), ) def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): @@ -279,6 +332,56 @@ def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): compare_stats(ref_stats, actual_stats) +@pytest.mark.parametrize( + "codebook, codebook_dtype, index_dtype, name", + [ + (np.array([i for i in range(16)], np.uint8), ov.Type.u8, ov.Type.u4, "u8_u4"), + (np.array([0.1 * i for i in range(-8, 8)], np.float16), ov.Type.f16, ov.Type.u4, "f16_u4"), + ( + Tensor(np.array([0.35 * i for i in range(-10, 11)], np.float16)) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3), + ov.Type.f8e4m3, + ov.Type.u8, + "f8e4m3_u8", + ), + ( + Tensor(np.array([i for i in range(-10, 11)], np.int8)).as_openvino_tensor().astype(TensorDataType.int8), + ov.Type.i8, + ov.Type.u8, + "i8_u8", + ), + ], +) +def test_codebook_compression_for_different_dtypes(codebook, codebook_dtype, index_dtype, name): + model = IntegerModel().ov_model + + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + group_size=7, + 
advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), + ) + actual_stats = {} + for op in compressed_model.get_ops(): + op_name = op.get_friendly_name() + if op.get_type_name() == "Constant": + if op_name == "matmul_2_data": + actual_stats[op_name] = check_codebook_grouped(op, group_size=7, dtype=codebook_dtype) + elif op_name == "matmul_2_data_nncf_codebook_idxs": + actual_stats[op_name] = check_codebook_indexes(op, dtype=index_dtype) + + ref_stats_path = get_actual_reference_for_current_openvino( + REFERENCE_SCALES_DIR / f"IntegerModel_codebook_{name}.json" + ) + + if os.getenv("NNCF_TEST_REGEN_DOT") is not None: + dump_to_json(ref_stats_path, actual_stats) + + ref_stats = load_json(ref_stats_path) + compare_stats(ref_stats, actual_stats) + + @pytest.mark.parametrize("metric", DATA_BASED_SENSITIVITY_METRICS) def test_gather_in_4_bit_if_all_layers_with_data(metric): dim1 = 2 # sequence length dimension @@ -1024,6 +1127,76 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): assert ref_e8m0_nodes == names_e8m0 +@pytest.mark.parametrize( + ("mode", "all_layers", "ratio", "ref_ids"), + ( + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, 5), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, 3), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, 1), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, 0), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, 4), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, 3), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, 1), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, 0), + ), +) +def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids): + model = SequentialMatmulModel().ov_model + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CB4_F8E4M3, + ratio=ratio, + group_size=1, + all_layers=all_layers, + sensitivity_metric=mode, + ) + names_codebook = { + op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.f8e4m3 + } + + assert ref_ids == len(names_codebook) + + +@pytest.mark.parametrize( + ("codebook", "dst_type", "n_layers"), + ( + (np.array([i for i in range(-8, 8)], np.int8), ov.Type.i8, 5), + (np.array([i for i in range(-(2**6), 2**6)], np.int8), ov.Type.i8, 5), + ( + Tensor(np.array([np.sign(i) * 2 ** np.abs(i) for i in range(-6, 6)])) + .as_openvino_tensor() + .astype(TensorDataType.f8e4m3), + ov.Type.f8e4m3, + 5, + ), + ), +) +@pytest.mark.parametrize("group_size", (1, -1)) +def test_codebook(codebook, n_layers, dst_type, group_size): + model = SequentialMatmulModel().ov_model + compressed_model = compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + ratio=1.0, + group_size=group_size, + all_layers=True, + advanced_parameters=AdvancedCompressionParameters(codebook=codebook), + ) + names_codebook = [ + op.get_friendly_name() + for op in compressed_model.get_ordered_ops() + if op.get_friendly_name().endswith("nncf_codebook") + ] + + assert len(names_codebook) == n_layers + + names_codebook = [ + op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == dst_type + ] + + assert len(names_codebook) == n_layers + + @pytest.mark.parametrize( ("mode", "data"), ( @@ -1045,6 +1218,30 @@ def test_compressed_weighs_range(mode, data): assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data)) +@pytest.mark.parametrize( + ("data"), + ( + ([-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0]), + ([0.0, 1.0, 2.0, 
3.0, 4.0, 5.0, 6.0, 7.0, 8.0]), + ([-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]), + ([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5]), + ), +) +def test_codebook_weighs_range(data): + data = np.array(data).astype(np.float32) + codebook = data + max_diff = 0.1 + w = Tensor(data + (np.random.rand(*data.shape) - 0.5) * max_diff) + config = WeightCompressionConfig(mode=CompressWeightsMode.CODEBOOK, codebook_values=Tensor(data)) + _, scale, indexes = do_float_quantization(w, config, -1) + uncompressed_data = codebook[indexes.data] * scale.data + + indexes = indexes.flatten() + target = np.arange(indexes.shape[0]) + assert np.allclose(indexes.data, target) + assert np.all(np.abs(uncompressed_data.data - data) <= max_diff) + + @pytest.mark.parametrize( ("config", "precompute_scale", "precompute_zero_point", "raises"), [ @@ -1528,6 +1725,28 @@ def test_nf4_quantization_mid_quant(weight, scale): np.testing.assert_allclose(nf4_quant.data, ref_nf4_quant.data, atol=0, rtol=0) +@pytest.mark.parametrize( + "codebook", + [ + np.array([0.2, 0.2, 0.3, 0.4], dtype=np.float32), + np.array([0.5, 0.2, 0.3, 0.4], dtype=np.float32), + np.array([[-1, 0, 1, 2, 3], [-1, 0, 1, 2, 3]], dtype=np.float32), + np.array([5], dtype=np.float32), + ], +) +def test_codebook_is_correct_array(codebook): + model = SequentialMatmulModel().ov_model + + # The codebook should be a non empty 1D numpy array and sorted + with pytest.raises(nncf.ValidationError): + compress_weights( + model, + mode=CompressWeightsMode.CODEBOOK, + group_size=-1, + advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook), + ) + + class TestOVTemplateWeightCompression(TemplateWeightCompression): @staticmethod def get_matmul_model() -> ov.Model: diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index fcbb127c8d3..67a9fcef14d 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -219,7 +219,7 @@ def test_quantization_alignment(weight_shape, config, quantization_task, tensor_ if config.is_integer: compressed_weight, scale, zero_point = outputs else: - compressed_weight, scale = outputs + compressed_weight, scale, _ = outputs elif quantization_task == QuantizationTask.Q_DQ: decompressed_weight = outputs else:
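Reviewer note: below is a minimal NumPy sketch of the nearest-entry lookup performed by the new `_calculate_codebook_quantized_weight` helper (midpoints of a sorted codebook as decision boundaries, then `searchsorted`). The codebook and weight values are illustrative only, not taken from the tests, and the actual implementation operates on NNCF tensors via `fns.searchsorted` / `fns.from_numpy` rather than raw NumPy arrays.

```python
import numpy as np

# Illustrative sorted 1D codebook and already-normalized weights (hypothetical values,
# standing in for config.get_numpy_codebook() and the scaled weight tensor).
codebook = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
norm_weight = np.array([[-0.9, -0.1], [0.3, 0.8]], dtype=np.float32)

# Midpoints between adjacent codebook entries act as decision boundaries
# (the "center_of_quantiles" in the helper).
centers = 0.5 * (codebook[1:] + codebook[:-1])   # [-0.75, -0.25, 0.25, 0.75]

# Index of the nearest codebook entry for every weight element.
indexes = np.searchsorted(centers, norm_weight)  # [[0, 2], [3, 4]]

# "Quantized" weight: every element is replaced by its codebook entry;
# the indexes are what gets stored alongside the codebook in the compressed model.
quantized_weight = codebook[indexes]             # [[-1.0, 0.0], [0.5, 1.0]]

print(indexes)
print(quantized_weight)
```

This midpoint construction only yields well-defined decision boundaries for a sorted, one-dimensional codebook with more than one distinct value, which is consistent with `test_codebook_is_correct_array` above, where unsorted, duplicated, multi-dimensional, and single-element codebooks are expected to raise `nncf.ValidationError`.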