Merged
96 commits
488cacc
Support scale estimation inside GPTQ
alexsu52 Jun 10, 2024
ee64877
fix for INT4_ASYM
alexsu52 Sep 4, 2024
f22e411
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 23, 2024
51b4d7b
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 26, 2024
f66cd1e
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 30, 2024
7ce5a53
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Oct 2, 2024
f74d156
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 11, 2024
5288c79
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 11, 2024
1becf15
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 14, 2024
047d7d9
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 10, 2024
c0c7e57
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 16, 2024
b74dea1
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 27, 2024
26a9a77
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Jan 7, 2025
25fcc2c
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Feb 25, 2025
26d4887
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Mar 12, 2025
7748233
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 1, 2025
df251b3
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 8, 2025
4c134c4
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 9, 2025
6147097
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 14, 2025
2b94d28
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr May 7, 2025
5e312a5
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr May 9, 2025
2fc8f9c
Draft.
andreyanufr May 9, 2025
7c6795e
Draft.
andreyanufr May 9, 2025
1dcdd75
Draft for codebook.
andreyanufr May 12, 2025
b870d8d
Compression for default codebook.
andreyanufr May 13, 2025
ac26b8a
Reverted change in spell check.
andreyanufr May 13, 2025
16d7a9e
Fixed compression to 4bit for codebook indexes.
andreyanufr May 13, 2025
87280cc
Added tests and example.
andreyanufr May 15, 2025
a132ed2
Merge remote-tracking branch 'upstream/develop' into aanuf/LUT
andreyanufr May 15, 2025
4ab1470
Added file with compression data structures.
andreyanufr May 15, 2025
6ccd252
Removed debug information.
andreyanufr May 15, 2025
22308e9
Added custom codebook to example.
andreyanufr May 16, 2025
fb259fc
Fixed bug with group_size=-1.
andreyanufr May 16, 2025
86acc8e
Moved convert before gather.
andreyanufr May 16, 2025
b54606c
Removed backend specific parameter from advanced parameters.
andreyanufr May 16, 2025
72b803e
Fixed tests.
andreyanufr May 16, 2025
79f34a7
Fix to prevent Gather from low-precision types being recognized as inpu…
andreyanufr May 20, 2025
1c64e7c
Merge remote-tracking branch 'upstream/develop' into aanuf/LUT
andreyanufr May 20, 2025
9323381
Extend test for codebook.
andreyanufr May 21, 2025
464c097
Refactoring.
andreyanufr May 23, 2025
b964c0c
Delete codebook algo.
andreyanufr May 23, 2025
fb834cf
Merge remote-tracking branch 'upstream/develop' into aanuf/LUT_like_nf4
andreyanufr May 23, 2025
145fbf3
Refactoring.
andreyanufr May 26, 2025
d4e8578
Data aware codebook.
andreyanufr May 26, 2025
97d7ecd
Fixed merge conflict.
andreyanufr May 26, 2025
ac0346d
Fixed test.
andreyanufr May 26, 2025
5fb55e4
Fixed tests.
andreyanufr May 26, 2025
bf94228
Added CB4_F8E4M3 type.
andreyanufr May 28, 2025
e0d163e
Merge remote-tracking branch 'upstream/develop' into aanuf/LUT
andreyanufr May 28, 2025
37a7c59
Fixed pre-commit.
andreyanufr May 28, 2025
6006be6
Applied suggestions.
andreyanufr Jun 2, 2025
caed8a8
Fixed tests.
andreyanufr Jun 2, 2025
0a36b51
Added codebook parameters validation.
andreyanufr Jun 2, 2025
68d633b
Fixed bug.
andreyanufr Jun 2, 2025
508aec4
Applied suggestions.
andreyanufr Jun 6, 2025
88e645d
Resolved merge conflict.
andreyanufr Jun 6, 2025
79f9368
Added description for codebook parameter.
andreyanufr Jun 10, 2025
8c9b7b5
Renamed global parameter for codebook.
andreyanufr Jun 10, 2025
c62c315
Merge remote-tracking branch 'upstream/develop' into aanuf/LUT
andreyanufr Jun 10, 2025
9bd8c4b
Removed tensor type.
andreyanufr Jun 11, 2025
8f6eb33
1) Applied suggestions.
andreyanufr Jun 12, 2025
b5f2bc3
Fixed merge conflict.
andreyanufr Jun 24, 2025
8c7f428
Removed data type from codebook parameters.
andreyanufr Jun 24, 2025
db43991
Removed circular imports.
andreyanufr Jun 24, 2025
7c9429e
Added file with constants.
andreyanufr Jun 24, 2025
b90ccf3
Moved default codebook initialization to function.
andreyanufr Jun 24, 2025
8a06f88
Added test for comparison of compressed weight values for CB4_F8E4M3 …
andreyanufr Jun 25, 2025
d6e4a76
Fixed test.
andreyanufr Jun 25, 2025
b231848
Fixed fp8 value.
andreyanufr Jun 25, 2025
de7b709
Test for codebook graph.
andreyanufr Jun 27, 2025
4737ade
Changed file name to a more appropriate one.
andreyanufr Jun 27, 2025
072a62a
Changed file name to a more appropriate one.
andreyanufr Jun 27, 2025
ede9342
Return reshape_weight_for_grouped_quantization to weight_lowering.
andreyanufr Jun 27, 2025
4471290
Changed non-ASCII character.
andreyanufr Jun 27, 2025
3f9f833
Removed extra convert from fp16 to fp16.
andreyanufr Jun 27, 2025
67faaa7
Added test and exception which checks that codebook is sorted, not em…
andreyanufr Jun 30, 2025
e5322e3
Fixed fp8 values in test.
andreyanufr Jun 30, 2025
8f18fb8
Applied suggestions.
andreyanufr Jun 30, 2025
c838708
Applied suggestions.
andreyanufr Jul 1, 2025
0949a92
Applied suggestions.
andreyanufr Jul 1, 2025
b491012
Fixed data type.
andreyanufr Jul 1, 2025
6bf05fc
Removed torch tensor from codebook docstring.
andreyanufr Jul 1, 2025
e44b3d8
Applied suggestion.
andreyanufr Jul 2, 2025
f1c68d6
Applied suggestion.
andreyanufr Jul 2, 2025
7673381
Merge remote-tracking branch 'upstream/develop' into aanuf/LUT
andreyanufr Jul 2, 2025
8159e56
Fixed bug.
andreyanufr Jul 2, 2025
b24936b
Fixed bug for onnx.
andreyanufr Jul 2, 2025
6fdfd33
Applied suggestion.
andreyanufr Jul 2, 2025
17d6d2d
Applied suggestions.
andreyanufr Jul 8, 2025
61abc6a
Applied suggestions.
andreyanufr Jul 8, 2025
d1d8232
1) Added docstrings for codebook example.
andreyanufr Jul 8, 2025
b8f2526
Applied suggestions.
andreyanufr Jul 8, 2025
ca342ab
Applied suggestion.
andreyanufr Jul 8, 2025
635ef23
Changed docstring formatting.
andreyanufr Jul 8, 2025
50a94aa
Applied suggestions.
andreyanufr Jul 9, 2025
82d9e5c
Applied suggestions.
andreyanufr Jul 9, 2025
2 changes: 1 addition & 1 deletion .ci/cspell_dict.txt
@@ -505,4 +505,4 @@ yolov
yscale
yujie
yury
zfnet
zfnet
@@ -0,0 +1,26 @@
# Large Language Models FP8 Compression Example

This example demonstrates how to apply codebook compression to the [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes.
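
Conceptually, codebook (look-up table, LUT) compression stores each weight as a small integer index into a shared table of representative values; dequantization is a simple gather. A minimal numpy sketch of the idea (toy table and shapes, not the values NNCF uses internally — `CB4_F8E4M3` uses a fixed 16-entry table of FP8 E4M3 values):

```python
import numpy as np

# Toy 8-entry codebook; real runs use NNCF's built-in table or a user-supplied one.
codebook = np.array([-1.0, -0.5, -0.25, -0.1, 0.1, 0.25, 0.5, 1.0], dtype=np.float32)
weights = np.random.randn(4, 8).astype(np.float32)

# Quantize: each weight is replaced by the index of its nearest codebook entry.
indexes = np.abs(weights[..., None] - codebook).argmin(axis=-1).astype(np.uint8)

# Dequantize: a gather (table lookup) restores the approximated weights.
restored = codebook[indexes]
```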

## Prerequisites

To use this example:

- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
- Install dependencies:

```bash
pip install -U pip
pip install -r requirements.txt
pip install ../../../../
```

## Run Example

To run the example:

```bash
python main.py
```

The script automatically downloads the dataset and the baseline model, and saves the resulting compressed model.
163 changes: 163 additions & 0 deletions examples/llm_compression/openvino/smollm2_360m_codebook/main.py
@@ -0,0 +1,163 @@
# Copyright (c) 2025 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

import numpy as np
from optimum.intel.openvino import OVModelForCausalLM
from torch.jit import TracerWarning
from transformers import AutoTokenizer
from transformers import logging

import nncf

logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=TracerWarning)


MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
COMPRESSED_MODEL_ID = "smollm2_360m_compressed_codebook"


def generate_answers(
questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50
) -> dict[str, str]:
"""
Generate answers for a list of questions using the provided model and tokenizer.

:param questions: List of questions to be answered.
:param model: The model to use for generating answers.
:param tokenizer: The tokenizer to use for processing the input and output.
:param max_new_tokens: Maximum number of new tokens to generate for each answer. Defaults to 50.
:return: A dictionary mapping each question to its corresponding answer.
"""
messages = [
{"role": "system", "content": "You are a chatbot who always responds as short as possible."},
{"role": "user", "content": "What is the capital of Spain?"},
{"role": "assistant", "content": "Madrid."},
]
answers_by_questions = {}

for question in questions:
messages.append({"role": "user", "content": question})
input_ids = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
input_len = len(input_ids[0])

output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
answer = tokenizer.decode(output[input_len:], skip_special_tokens=True)
answers_by_questions[question] = answer
messages.append({"role": "assistant", "content": answer})

return answers_by_questions


def print_answers(header: str, answers_by_questions: dict[str, str]) -> None:
"""
Print the answers to the console.

:param header: Header to print before the answers.
:param answers_by_questions: Dictionary mapping questions to their answers.
"""
print(header)
for question, answer in answers_by_questions.items():
print(f"Q: {question}\nA: {answer}\n")


QUESTIONS = [
"What is the capital of France?",
"What is the highest peak in the Alps?",
"What is the largest city in Canada?",
"What is the most visited city in Japan?",
]


def load_model_and_tokenizer(model_id: str, export: bool = True) -> tuple[OVModelForCausalLM, AutoTokenizer]:
"""
Load the model and tokenizer from the specified model ID.

:param model_id: The identifier of the model to load.
:param export: Whether to export the model for OpenVINO. Defaults to True.
:return: A tuple containing the loaded model and tokenizer.
"""
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
model = OVModelForCausalLM.from_pretrained(
model_id,
export=export,
load_in_8bit=False,
)
return model, tokenizer


def default_codebook_example(model_id: str, compressed_model_id: str) -> list[str]:
"""
Example of using the default codebook compression.

:param model_id: The identifier of the model to load.
:param compressed_model_id: The identifier for the compressed model to save.
:return: A list of answers generated by the model after compression.
"""
model, tokenizer = load_model_and_tokenizer(model_id)
answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
print_answers("Non-optimized model outputs:\n", answers_by_questions)

model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64)
model.save_pretrained(compressed_model_id)
tokenizer.save_pretrained(compressed_model_id)

model, tokenizer = load_model_and_tokenizer(compressed_model_id, False)
answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
print_answers("Optimized model outputs:\n", answers_by_questions)

return list(answers_by_questions.values())


def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str]:
"""
Example of using the custom codebook compression.

:param model_id: The identifier of the model to load.
:param compressed_model_id: The identifier for the compressed model to save.
:return: A list of answers generated by the model after compression.
"""
model, tokenizer = load_model_and_tokenizer(model_id)

answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
print_answers("Non-optimized model outputs:\n", answers_by_questions)

codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8)

model.model = nncf.compress_weights(
model.model,
mode=nncf.CompressWeightsMode.CODEBOOK,
ratio=1.0,
group_size=-1,
advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook),
)
model.save_pretrained(compressed_model_id)
tokenizer.save_pretrained(compressed_model_id)

model, tokenizer = load_model_and_tokenizer(compressed_model_id, False)
answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
print_answers("Optimized model outputs:\n", answers_by_questions)

return list(answers_by_questions.values())


def main():
res = default_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID)
res += custom_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID + "_custom")
return res


if __name__ == "__main__":
main()
@@ -0,0 +1,4 @@
openvino==2025.1
optimum-intel[openvino]>=1.22.0
transformers>=4.48.0
onnx==1.17.0
8 changes: 7 additions & 1 deletion src/nncf/openvino/graph/metatypes/openvino_metatypes.py
@@ -817,7 +817,13 @@ def _is_embedding(node: ov.Node) -> bool:
allowed_types_list = ["f16", "f32", "f64"]
const_port_id = 0
input_tensor = node.input_value(const_port_id)
if input_tensor.get_element_type().get_type_name() in allowed_types_list:
input_type = input_tensor.get_element_type().get_type_name()

# TODO(aanuf): Implement a pattern based check for embedding.
if node.friendly_name.endswith("nncf_codebook"):
return False

if input_type in allowed_types_list:
const_node = get_operation_const_op(node, const_port_id)
if const_node is not None:
return True
25 changes: 25 additions & 0 deletions src/nncf/openvino/graph/node_utils.py
@@ -44,6 +44,7 @@
from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype
from nncf.tensor import Tensor
from nncf.tensor import TensorBackend
from nncf.tensor import TensorDataType

InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node]

@@ -685,3 +686,27 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] =
return opset.constant(x.data, name=name, shared_memory=True)
const = opset.constant(x.data, dtype=dtype, name=name)
return const


def create_ov_codebook_subgraph(
codebook: Tensor, indexes: Tensor, dtype: ov.Type, name: Optional[str] = None
) -> ov.Node:
"""
Create an OpenVINO subgraph that gathers values from the given codebook tensor at the given indexes.

:param codebook: Codebook tensor.
:param indexes: Indexes tensor.
:param dtype: Data type of the indexes.
:param name: Optional name of the constant.
:return: OpenVINO subgraph.
"""
codebook_const = opset.constant(codebook.data, name=name)
if codebook.dtype != TensorDataType.float16:
codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16)

codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=name + "_nncf_codebook_idxs")
if dtype == ov.Type.u4:
codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8)

const = opset.gather(codebook_const, codebook_indexes, 0, name=name + "_nncf_codebook")
return const
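
For reference, the subgraph built by `create_ov_codebook_subgraph` is functionally a table lookup. A hedged numpy sketch of the equivalent computation (toy values; the real codebook and index tensors come from the compression pass):

```python
import numpy as np

codebook = np.array([-1.0, -0.33, 0.33, 1.0], dtype=np.float16)  # 1-D table of levels
indexes = np.array([[3, 0, 1], [2, 2, 0]], dtype=np.uint8)       # per-weight positions

# opset.gather(codebook_const, codebook_indexes, 0) performs exactly this lookup:
decompressed = np.take(codebook, indexes, axis=0)  # shape == indexes.shape
```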
4 changes: 2 additions & 2 deletions src/nncf/openvino/optimized_functions/functions.py
@@ -105,7 +105,7 @@ def do_float_quantization(
config: WeightCompressionConfig,
reduction_axes: Optional[ReductionAxes] = None,
precomputed_scale: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor]:
) -> tuple[Tensor, Tensor, Tensor]:
"""
Computes the quantization scale if not provided and performs the corresponding NF4 weight quantization.
For NF4, the weights are quantized to 16 levels on the [-1, 1] interval.
@@ -151,7 +151,7 @@
compressed_weight = model([weight, precomputed_scale])[0]
scale = precomputed_scale

return compressed_weight, scale
return compressed_weight, scale, None


def integer_quantize_dequantize_weight(
4 changes: 4 additions & 0 deletions src/nncf/parameters.py
@@ -85,15 +85,19 @@ class CompressWeightsMode(StrEnum):
:param NF4: The same as INT4_SYM mode, but the primary precision is the NF4 data type without a zero point.
:param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead.
:param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0.
:param CODEBOOK: Codebook (LUT) quantization format.
:param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
"""

INT8_SYM = "int8_sym"
INT8_ASYM = "int8_asym"
INT4_SYM = "int4_sym"
INT4_ASYM = "int4_asym"
NF4 = "nf4"
CB4_F8E4M3 = "cb4_f8e4m3"
INT8 = "int8" # Deprecated mode
E2M1 = "e2m1"
CODEBOOK = "codebook"


@api(canonical_alias="nncf.CompressionFormat")
6 changes: 6 additions & 0 deletions src/nncf/quantization/advanced_parameters.py
@@ -29,6 +29,8 @@
from nncf.quantization.range_estimator import RangeEstimatorParameters
from nncf.quantization.range_estimator import StatisticsType

TTensor = Any


@api(canonical_alias="nncf.OverflowFix")
class OverflowFix(StrEnum):
@@ -379,6 +381,9 @@ class AdvancedCompressionParameters:
:type lora_adapter_rank: int
:param backend_params: Backend-specific parameters.
:type backend_params: dict[str, Any]
:param codebook: The codebook (LUT) for weight compression.
Applicable for vector quantization. Must be a numpy array or an OpenVINO tensor.
:type codebook: TTensor
"""

statistics_path: Optional[str] = None
@@ -390,6 +395,7 @@
lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
lora_adapter_rank: int = 256
backend_params: dict[str, Any] = field(default_factory=dict)
codebook: Optional[TTensor] = None


@api()