Skip to content

Commit 71ae2c1

Browse files
LUT-based compressed data type (#3496)
### Changes Implementation of compression to fixed codebook (LUT) values. ### Reason for changes CVS-167084 ### Related tickets CVS-167084 ### Tests tests/openvino/native/quantization/test_weights_compression.py https://github.com/openvinotoolkit/nncf/actions/runs/16024264575 --------- Co-authored-by: Aleksandr Suslov <alexander.suslov@intel.com>
1 parent 6b78fe9 commit 71ae2c1

32 files changed

+1349
-170
lines changed

.ci/cspell_dict.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,4 +505,4 @@ yolov
505505
yscale
506506
yujie
507507
yury
508-
zfnet
508+
zfnet
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Large Language Models Codebook (LUT) Compression Example
2+
3+
This example demonstrates how to apply codebook compression to [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes.
4+
5+
## Prerequisites
6+
7+
To use this example:
8+
9+
- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
10+
- Install dependencies:
11+
12+
```bash
13+
pip install -U pip
14+
pip install -r requirements.txt
15+
pip install ../../../../
16+
```
17+
18+
## Run Example
19+
20+
To run example:
21+
22+
```bash
23+
python main.py
24+
```
25+
26+
It will automatically download the dataset and baseline model and save the resulting model.
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# Copyright (c) 2025 Intel Corporation
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import warnings
13+
14+
import numpy as np
15+
from optimum.intel.openvino import OVModelForCausalLM
16+
from torch.jit import TracerWarning
17+
from transformers import AutoTokenizer
18+
from transformers import logging
19+
20+
import nncf
21+
22+
logging.set_verbosity_error()
23+
warnings.filterwarnings("ignore", category=TracerWarning)
24+
25+
26+
MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
27+
COMPRESSED_MODEL_ID = "smollm2_360m_compressed_codebook"
28+
29+
30+
def generate_answers(
    questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 50
) -> dict[str, str]:
    """
    Answer each question in turn, carrying the whole chat history forward.

    :param questions: Questions to ask the model, in order.
    :param model: The model to use for generating answers.
    :param tokenizer: The tokenizer providing the chat template and decoding.
    :param max_new_tokens: Maximum number of new tokens to generate for each answer. Defaults to 50.
    :return: A dictionary mapping each question to its corresponding answer.
    """
    # Seed the conversation with a short system prompt and one example exchange.
    chat_history = [
        {"role": "system", "content": "You are a chatbot who always responds as short as possible."},
        {"role": "user", "content": "What is the capital of Spain?"},
        {"role": "assistant", "content": "Madrid."},
    ]
    answers_by_questions: dict[str, str] = {}

    for question in questions:
        chat_history.append({"role": "user", "content": question})
        prompt_ids = tokenizer.apply_chat_template(
            chat_history, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to(device=model.device)
        prompt_len = len(prompt_ids[0])

        generated = model.generate(prompt_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
        # Drop the prompt tokens so only the newly generated answer is decoded.
        answer = tokenizer.decode(generated[prompt_len:], skip_special_tokens=True)
        answers_by_questions[question] = answer
        # Keep the model's reply in the history so later questions see the context.
        chat_history.append({"role": "assistant", "content": answer})

    return answers_by_questions
62+
63+
64+
def print_answers(header: str, answers_by_questions: dict[str, str]) -> None:
    """
    Print the answers to the console.

    :param header: Header to print before the answers.
    :param answers_by_questions: Dictionary mapping questions to their answers.
    """
    # NOTE: the parameter was previously annotated as list[str], but the body
    # iterates .items() — it is a question -> answer mapping.
    print(header)
    for question, answer in answers_by_questions.items():
        print(f"Q: {question}\nA: {answer}\n")
74+
75+
76+
QUESTIONS = [
77+
"What is the capital of France?",
78+
"What is the highest peak in the Alps?",
79+
"What is the largest city in Canada?",
80+
"What is the most visited city in Japan?",
81+
]
82+
83+
84+
def load_model_and_tokenizer(model_id: str, export: bool = True) -> tuple[OVModelForCausalLM, AutoTokenizer]:
    """
    Load the model and tokenizer from the specified model ID.

    :param model_id: The identifier of the model to load.
    :param export: Whether to export the model for OpenVINO. Defaults to True.
    :return: A tuple containing the loaded model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
    model = OVModelForCausalLM.from_pretrained(
        model_id,
        export=export,
        # Keep weights uncompressed here; NNCF applies codebook compression explicitly later.
        load_in_8bit=False,
    )
    return model, tokenizer
99+
100+
101+
def default_codebook_example(model_id: str, compressed_model_id: str) -> list[str]:
    """
    Example of using the default codebook compression.

    :param model_id: The identifier of the model to load.
    :param compressed_model_id: The identifier for the compressed model to save.
    :return: A list of answers generated by the model after compression.
    """
    model, tokenizer = load_model_and_tokenizer(model_id)
    baseline_answers = generate_answers(QUESTIONS, model, tokenizer)
    print_answers("Non-optimized model outputs:\n", baseline_answers)

    # Compress weights to the built-in fixed fp8 (E4M3) codebook mode.
    model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64)
    model.save_pretrained(compressed_model_id)
    tokenizer.save_pretrained(compressed_model_id)

    # Reload the saved compressed model (export=False: it is already an OV model).
    model, tokenizer = load_model_and_tokenizer(compressed_model_id, False)
    optimized_answers = generate_answers(QUESTIONS, model, tokenizer)
    print_answers("Optimized model outputs:\n", optimized_answers)

    return list(optimized_answers.values())
122+
123+
124+
def custom_codebook_example(model_id: str, compressed_model_id: str) -> list[str]:
    """
    Example of using the custom codebook compression.

    :param model_id: The identifier of the model to load.
    :param compressed_model_id: The identifier for the compressed model to save.
    :return: A list of answers generated by the model after compression.
    """
    model, tokenizer = load_model_and_tokenizer(model_id)

    baseline_answers = generate_answers(QUESTIONS, model, tokenizer)
    print_answers("Non-optimized model outputs:\n", baseline_answers)

    # A user-defined lookup table: weights are mapped onto these nine int8 levels.
    codebook = np.array([-8, -4, -2, -1, 0, 1, 2, 4, 8], dtype=np.int8)

    model.model = nncf.compress_weights(
        model.model,
        mode=nncf.CompressWeightsMode.CODEBOOK,
        ratio=1.0,
        group_size=-1,
        advanced_parameters=nncf.AdvancedCompressionParameters(codebook=codebook),
    )
    model.save_pretrained(compressed_model_id)
    tokenizer.save_pretrained(compressed_model_id)

    # Reload the saved compressed model (export=False: it is already an OV model).
    model, tokenizer = load_model_and_tokenizer(compressed_model_id, False)
    optimized_answers = generate_answers(QUESTIONS, model, tokenizer)
    print_answers("Optimized model outputs:\n", optimized_answers)

    return list(optimized_answers.values())
154+
155+
156+
def main():
    """Run both codebook examples and collect all generated answers."""
    answers = default_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID)
    answers.extend(custom_codebook_example(MODEL_ID, COMPRESSED_MODEL_ID + "_custom"))
    return answers
160+
161+
162+
if __name__ == "__main__":
163+
main()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
openvino==2025.1
2+
optimum-intel[openvino]>=1.22.0
3+
transformers>=4.48.0
4+
onnx==1.17.0

src/nncf/openvino/graph/metatypes/openvino_metatypes.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -817,7 +817,13 @@ def _is_embedding(node: ov.Node) -> bool:
817817
allowed_types_list = ["f16", "f32", "f64"]
818818
const_port_id = 0
819819
input_tensor = node.input_value(const_port_id)
820-
if input_tensor.get_element_type().get_type_name() in allowed_types_list:
820+
input_type = input_tensor.get_element_type().get_type_name()
821+
822+
# TODO(aanuf): Implement a pattern based check for embedding.
823+
if node.friendly_name.endswith("nncf_codebook"):
824+
return False
825+
826+
if input_type in allowed_types_list:
821827
const_node = get_operation_const_op(node, const_port_id)
822828
if const_node is not None:
823829
return True

src/nncf/openvino/graph/node_utils.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype
4545
from nncf.tensor import Tensor
4646
from nncf.tensor import TensorBackend
47+
from nncf.tensor import TensorDataType
4748

4849
InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node]
4950

@@ -685,3 +686,27 @@ def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] =
685686
return opset.constant(x.data, name=name, shared_memory=True)
686687
const = opset.constant(x.data, dtype=dtype, name=name)
687688
return const
689+
690+
691+
def create_ov_codebook_subgraph(
    codebook: Tensor, indexes: Tensor, dtype: ov.Type, name: Optional[str] = None
) -> op.Constant:
    """
    Create an OpenVINO subgraph with gather from the given codebook and indexes tensors.

    :param codebook: Codebook tensor.
    :param indexes: Indexes tensor.
    :param dtype: Data type of the indexes.
    :param name: Optional base name for the created nodes.
    :return: OpenVINO subgraph.
    """
    # The annotation allows name=None, but the derived node names below need a
    # string base; previously `name + "_nncf_codebook_idxs"` raised TypeError.
    base_name = name if name is not None else "codebook"

    codebook_const = opset.constant(codebook.data, name=name)
    # Normalize the codebook to f16 so the gather output dtype is uniform.
    if codebook.dtype != TensorDataType.float16:
        codebook_const = opset.convert(codebook_const, destination_type=ov.Type.f16)

    codebook_indexes = opset.constant(indexes.data, dtype=dtype, name=base_name + "_nncf_codebook_idxs")
    # Widen u4 indexes to u8 — presumably gather does not accept u4 indices; confirm.
    if dtype == ov.Type.u4:
        codebook_indexes = opset.convert(codebook_indexes, destination_type=ov.Type.u8)

    # Keep the "nncf_codebook" suffix: _is_embedding() relies on it to skip these nodes.
    const = opset.gather(codebook_const, codebook_indexes, 0, name=base_name + "_nncf_codebook")
    return const

src/nncf/openvino/optimized_functions/functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def do_float_quantization(
105105
config: WeightCompressionConfig,
106106
reduction_axes: Optional[ReductionAxes] = None,
107107
precomputed_scale: Optional[Tensor] = None,
108-
) -> tuple[Tensor, Tensor]:
108+
) -> tuple[Tensor, Tensor, Tensor]:
109109
"""
110110
Computes quantization scale if not provided, and performs corresponding nf4 weight quantization.
111111
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
@@ -151,7 +151,7 @@ def do_float_quantization(
151151
compressed_weight = model([weight, precomputed_scale])[0]
152152
scale = precomputed_scale
153153

154-
return compressed_weight, scale
154+
return compressed_weight, scale, None
155155

156156

157157
def integer_quantize_dequantize_weight(

src/nncf/parameters.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,19 @@ class CompressWeightsMode(StrEnum):
8585
:param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point.
8686
:param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead.
8787
:param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0.
88+
:param CODEBOOK: Codebook (LUT) quantization format.
89+
:param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
8890
"""
8991

9092
INT8_SYM = "int8_sym"
9193
INT8_ASYM = "int8_asym"
9294
INT4_SYM = "int4_sym"
9395
INT4_ASYM = "int4_asym"
9496
NF4 = "nf4"
97+
CB4_F8E4M3 = "cb4_f8e4m3"
9598
INT8 = "int8" # Deprecated mode
9699
E2M1 = "e2m1"
100+
CODEBOOK = "codebook"
97101

98102

99103
@api(canonical_alias="nncf.CompressionFormat")

src/nncf/quantization/advanced_parameters.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
from nncf.quantization.range_estimator import RangeEstimatorParameters
3030
from nncf.quantization.range_estimator import StatisticsType
3131

32+
TTensor = Any
33+
3234

3335
@api(canonical_alias="nncf.OverflowFix")
3436
class OverflowFix(StrEnum):
@@ -379,6 +381,9 @@ class AdvancedCompressionParameters:
379381
:type lora_adapter_rank: int
380382
:param backend_params: Backend-specific parameters.
381383
:type backend_params: dict[str, Any]
384+
:param codebook: The codebook (LUT) for the weight compression.
385+
Applicable for vector quantization. Must be a numpy array or ov Tensor.
386+
:type codebook: TTensor
382387
"""
383388

384389
statistics_path: Optional[str] = None
@@ -390,6 +395,7 @@ class AdvancedCompressionParameters:
390395
lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
391396
lora_adapter_rank: int = 256
392397
backend_params: dict[str, Any] = field(default_factory=dict)
398+
codebook: Optional[TTensor] = None
393399

394400

395401
@api()

0 commit comments

Comments
 (0)