
Commit b7d8a7b

Initial commit
1 parent e9e7cd0 commit b7d8a7b

10 files changed: +270 -97 lines changed

src/nncf/openvino/optimized_functions/functions.py

Lines changed: 3 additions & 4 deletions
@@ -116,7 +116,7 @@ def do_float_quantization(
     :param precomputed_scale: Optional precomputed scale.
     :return: Returns quantized weight tensor and corresponding scale tensor.
     """
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode not in [CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK]

     weight_shape = weight.shape
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
@@ -128,8 +128,7 @@ def do_float_quantization(
     if weight.backend == TensorBackend.ov:
         # Return ov tensors in target precision to seamlessly insert them into openvino model later
         ov_model_params.return_ov_tensors = True
-        weight_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
-        ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype})
+        ov_model_params.output_dtypes.update({"compressed_weight": config.compression_dtype})

     model = get_float_quantization_model(
         ov_model_params,
@@ -234,7 +233,7 @@ def float_quantize_dequantize_weight(
     :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
     :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
     """
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode not in [CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK]

     # When reduction axes are not provided, assuming that the weights are already reshaped
     if config.group_size != -1 and reduction_axes is not None:
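
Note: the guard flips from an allow-list to a deny-list, so the two FP8 modes now pass through do_float_quantization and float_quantize_dequantize_weight without further changes to these entry points. A minimal sketch of the mode sets involved (mode names are taken from this diff; the snippet only illustrates the guard and does not call the functions):

from nncf import CompressWeightsMode

old_allowed = {CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4}
denied = {CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK}

# FP8 modes were rejected by the old allow-list but are accepted by the new deny-list
for mode in (CompressWeightsMode.FP8_E4M3, CompressWeightsMode.MXFP8_E4M3):
    assert mode not in old_allowed
    assert mode not in denied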

src/nncf/openvino/optimized_functions/models.py

Lines changed: 12 additions & 12 deletions
@@ -31,6 +31,7 @@
 from nncf.openvino.graph.node_utils import convert_op
 from nncf.openvino.graph.node_utils import non_convertable_divide_op
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.constants import FP_MAX_VALUES
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
 from nncf.tensor.functions.openvino_numeric import DTYPE_MAP as DTYPE_MAP_OV
@@ -571,7 +572,7 @@ def _build_float_quantization_model(
     reduction_axes: Optional[ReductionAxes] = None,
     return_nodes: bool = False,
 ) -> Union[ModelCallable, ModelAsNodes]:
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode not in [CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK]

     default_input_dtypes = {"scale": TensorDataType.float32}
     default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
@@ -597,7 +598,12 @@ def _build_float_quantization_model(
     )

     # Validate output dtypes
-    valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1]
+    valid_compressed_weight_dtypes = [
+        TensorDataType.float32,
+        TensorDataType.nf4,
+        TensorDataType.f4e2m1,
+        TensorDataType.f8e4m3,
+    ]
     if compressed_weight_dtype not in valid_compressed_weight_dtypes:
         msg = (
             f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
@@ -625,23 +631,17 @@ def _build_float_quantization_model(
     eps = np.finfo(np.float32).eps
     scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)

-    # Equals 1.0 for NF4
-    FP_MAX_VALS = {
-        CompressWeightsMode.MXFP4: 6.0,
-        CompressWeightsMode.FP4: 6.0,
-    }
-    if config.mode in FP_MAX_VALS:
-        scale = divide_op(scale, opset.constant(FP_MAX_VALS[config.mode], ov.Type.f32))
+    if config.compression_dtype != TensorDataType.nf4:
+        scale = divide_op(scale, opset.constant(FP_MAX_VALUES[config.compression_dtype], ov.Type.f32))

-    if config.mode == CompressWeightsMode.MXFP4:
+    if config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
         scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32))
         scale = opset.ceil(scale)
         scale = opset.clamp(scale, -127.0, 127.0)
         scale = opset.power(opset.constant(2.0, ov.Type.f32), scale)

     compressed_weight = divide_op(weight, scale)
-    target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1
-    compressed_weight = convert_op(compressed_weight, target_dtype)
+    compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[config.compression_dtype])
     compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])

     ov_results = [compressed_weight]
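
The MX branch above rounds the per-group scale up to a power of two so it can be stored as an e8m0 scale. A minimal numpy sketch of the same arithmetic outside the OpenVINO graph (the group-wise absmax scale and the 448.0 maximum for f8e4m3 are assumptions on my part; in the model the scale arrives already computed):

import numpy as np

def mx_e8m0_scale(weight_group: np.ndarray, fp_max: float = 448.0) -> np.ndarray:
    # absmax scale per group, clamped away from zero as in the graph above
    scale = np.abs(weight_group).max(axis=-1, keepdims=True)
    eps = np.finfo(np.float32).eps
    scale = np.where(np.abs(scale) < eps, eps, scale)
    scale = scale / fp_max                 # normalize by the target format's maximum value
    scale = np.ceil(np.log2(scale))        # round the scale up to a power of two
    scale = np.clip(scale, -127.0, 127.0)  # keep the exponent within e8m0 range
    return (2.0 ** scale).astype(np.float32)

w = np.random.rand(4, 32).astype(np.float32) - 0.5
print(mx_e8m0_scale(w))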

src/nncf/quantization/algorithms/weight_compression/config.py

Lines changed: 36 additions & 1 deletion
@@ -46,7 +46,17 @@ def num_bits(self):
         """
         :return: number of bits that is used for storing a single quantized value in the given mode.
         """
-        return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4
+        return (
+            8
+            if self.mode
+            in [
+                CompressWeightsMode.INT8_SYM,
+                CompressWeightsMode.INT8_ASYM,
+                CompressWeightsMode.FP8_E4M3,
+                CompressWeightsMode.MXFP8_E4M3,
+            ]
+            else 4
+        )

     @property
     def is_asym_mode(self):
@@ -74,6 +84,31 @@ def is_codebook(self):
         """
         return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]

+    @property
+    def compression_dtype(self) -> TensorDataType:
+        """
+        :return: data type that is used to store compressed weights.
+        """
+        if self.is_codebook:
+            n_quants = self.codebook_values.size
+            if n_quants <= 16:
+                return TensorDataType.uint4
+            if n_quants <= 256:
+                return TensorDataType.uint8
+            return TensorDataType.uint16
+        dtype_per_mode = {
+            CompressWeightsMode.INT4_SYM: TensorDataType.int4,
+            CompressWeightsMode.INT4_ASYM: TensorDataType.uint4,
+            CompressWeightsMode.INT8_ASYM: TensorDataType.uint8,
+            CompressWeightsMode.INT8_SYM: TensorDataType.int8,
+            CompressWeightsMode.NF4: TensorDataType.nf4,
+            CompressWeightsMode.FP4: TensorDataType.f4e2m1,
+            CompressWeightsMode.MXFP4: TensorDataType.f4e2m1,
+            CompressWeightsMode.FP8_E4M3: TensorDataType.f8e4m3,
+            CompressWeightsMode.MXFP8_E4M3: TensorDataType.f8e4m3,
+        }
+        return dtype_per_mode[self.mode]
+
     def get_numpy_codebook(self):
         return self.codebook_values.as_numpy_tensor()
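
A short usage sketch for the new property (a minimal example, assuming the config dataclass can be constructed directly with a mode and group size):

from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import TensorDataType

cfg = WeightCompressionConfig(mode=CompressWeightsMode.MXFP8_E4M3, group_size=32)
assert cfg.compression_dtype == TensorDataType.f8e4m3  # stored as f8e4m3 values
assert cfg.num_bits == 8                               # FP8 modes now report 8 bits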

src/nncf/quantization/algorithms/weight_compression/constants.py

Lines changed: 9 additions & 0 deletions
@@ -11,6 +11,8 @@

 import numpy as np

+from nncf.tensor import TensorDataType
+
 NF4_QUANTILES = np.array(
     [
         -1.0,
@@ -101,3 +103,10 @@


 CENTER_OF_F4E2M1_QUANTILES = (F4E2M1_QUANTILES[1:] + F4E2M1_QUANTILES[:-1]) / 2
+
+
+FP_MAX_VALUES = {
+    TensorDataType.nf4: 1.0,
+    TensorDataType.f4e2m1: 6.0,
+    TensorDataType.f8e4m3: 448.0,
+}
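
These constants are the largest finite magnitudes of the target formats (the NF4 quantiles are already normalized to ±1.0). A quick check of the two float maxima from the formats' exponent/mantissa layouts:

# f4e2m1: max exponent 2 (bias 1), 1 mantissa bit  -> 1.1b  * 2^2
assert 1.5 * 2**2 == 6.0
# f8e4m3: max exponent 8 (bias 7), 3 mantissa bits -> 1.110b * 2^8 (S.1111.111 is reserved for NaN)
assert 1.75 * 2**8 == 448.0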
Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+F8E4M3_LUT = np.array(
+    [
+        0.0, 0.001953125, 0.00390625, 0.005859375, 0.0078125, 0.009765625, 0.01171875, 0.013671875,
+        0.015625, 0.017578125, 0.01953125, 0.021484375, 0.0234375, 0.025390625, 0.02734375, 0.029296875,
+        0.03125, 0.03515625, 0.0390625, 0.04296875, 0.046875, 0.05078125, 0.0546875, 0.05859375,
+        0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875,
+        0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375,
+        0.25, 0.28125, 0.3125, 0.34375, 0.375, 0.40625, 0.4375, 0.46875,
+        0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375,
+        1.0, 1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875,
+        2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75,
+        4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5,
+        8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+        16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0,
+        32.0, 36.0, 40.0, 44.0, 48.0, 52.0, 56.0, 60.0,
+        64.0, 72.0, 80.0, 88.0, 96.0, 104.0, 112.0, 120.0,
+        128.0, 144.0, 160.0, 176.0, 192.0, 208.0, 224.0, 240.0,
+        256.0, 288.0, 320.0, 352.0, 384.0, 416.0, 448.0, np.nan,
+    ],
+    dtype=np.float32,
+)
+
+
+def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int:
+    """Exact port of ov::f16_to_f8e4m3_bits for a single float16 bit-pattern."""
+    # f16 layout
+    f16_s_mask = 0x8000
+    f16_e_mask = 0x7C00
+    f16_e_bias = 15
+    f16_e_size = 5
+    f16_m_mask = 0x03FF
+    f16_m_size = 10
+
+    # f8 e4m3 layout
+    f8e4m3_s_mask = 0x80
+    f8e4m3_e_size = 4
+    f8e4m3_e_mask = 0x78
+    f8e4m3_e_bias = 7
+    f8e4m3_e_max = 0x0F
+    f8e4m3_m_size = 3
+    f8e4m3_m_mask = 0x07
+
+    byte_shift = 8
+
+    # f8 masks in uint16 domain
+    f8_e_mask = f8e4m3_e_mask << byte_shift  # 0x7800
+    f8_m_mask = f8e4m3_m_mask << byte_shift  # 0x0700
+    f8_m_hidden_one_mask = 0x0800  # hidden 1 for subnormals
+
+    # rounding constants (same as C++)
+    round_half = 0x01FF
+    round_norm = 0x007F
+    round_even = 0x0080
+    round_odd = 0x0180
+
+    # min exponent for which subnormals are representable
+    f8_e_subnormal_min = -10
+
+    inp = int(h_bits) & 0xFFFF
+
+    # sign bit: f16 sign -> f8 sign position (bit 15 -> bit 7)
+    f8_bits = (inp & f16_s_mask) >> byte_shift
+
+    f16_e_field = inp & f16_e_mask
+
+    if f16_e_field == f16_e_mask:
+        # f16 NaN / Inf -> f8 NaN (no Inf)
+        f8_bits |= (f8e4m3_e_mask | f8e4m3_m_mask)
+    elif f16_e_field != 0:
+        # normalized f16
+        f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias)
+        # *** IMPORTANT FIX: shift by (f16_e_size - f8e4m3_e_size) = 5 - 4 = 1 ***
+        fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)
+
+        # normalized f8 part (exp >= 0)
+        if f8_biased_exp >= 0:
+            if (fractional & round_half) == round_odd or (fractional & round_norm) != 0:
+                fractional += round_even
+            if (fractional & f8_e_mask) != 0:
+                f8_biased_exp += 1
+            fractional &= f8_m_mask
+
+        # now set exponent & mantissa
+        if f8_biased_exp > f8e4m3_e_max:
+            # overflow -> NaN (no Inf)
+            f8_bits |= (f8e4m3_e_mask | f8e4m3_m_mask)
+        elif f8_biased_exp > 0:
+            # normalized f8
+            exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size
+            f8_bits |= exp_field
+            f8_bits |= (fractional >> byte_shift)
+        else:
+            # subnormal f8
+            fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size))
+            f8_exp = f8_biased_exp - f8e4m3_e_bias
+            shift = 1 - f8_exp
+            sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1)
+            sticky = 1 if (fractional & sticky_mask) != 0 else 0
+
+            fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp))
+
+            if (((fractional & round_half) == round_odd and sticky == 0) or
+                    (fractional & round_norm) != 0 or sticky != 0):
+                fractional += round_even
+
+            f8_bits |= (fractional >> byte_shift)
+    else:
+        # f16 zero / subnormal -> sign + zero exponent/mantissa
+        # (f8_bits already contains the sign)
+        pass
+
+    return f8_bits & 0xFF
+
+
+_f16_to_f8e4m3_bits_vec = np.vectorize(_f16_to_f8e4m3_bits_scalar, otypes=[np.uint8])
+
+
+def fp32_to_fp8e4m3_values(x: np.ndarray) -> np.ndarray:
+    """
+    Bit-exact to ov::float8_e4m3(float):
+    float32 -> float16 -> f8e4m3 bits -> float via LUT
+    """
+    x = np.asarray(x, dtype=np.float32)
+    x_f16 = x.astype(np.float16)
+    h_bits = x_f16.view(np.uint16)
+
+    f8_bits = _f16_to_f8e4m3_bits_vec(h_bits)
+
+    # Decode exactly like C++: LUT for magnitude + sign bit
+    idx = f8_bits & 0x7F
+    mag = F8E4M3_LUT[idx.astype(np.int32)]
+
+    sign = np.where((f8_bits & 0x80) != 0, -1.0, 1.0)
+    out = sign * mag
+    return out.astype(np.float32)
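
A small usage sketch for the helper above, run alongside the module's definitions (the commented results are what I would expect from the non-saturating OpenVINO conversion; treat them as illustrative, not a golden reference):

import numpy as np

x = np.array([0.1, 1.3, 300.0, 500.0, -0.07], dtype=np.float32)
y = fp32_to_fp8e4m3_values(x)
# 0.1 snaps to the nearest representable value (around 0.1015625);
# 500.0 exceeds the 448.0 maximum and becomes NaN, since f8e4m3 has no Inf.
print(y)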

src/nncf/quantization/algorithms/weight_compression/openvino_backend.py

Lines changed: 5 additions & 26 deletions
@@ -64,6 +64,7 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
 from nncf.tensor import Tensor
 from nncf.tensor.definitions import TensorDataType
+from nncf.tensor.functions.openvino_numeric import DTYPE_MAP
 from nncf.tensor.functions.openvino_numeric import DTYPE_MAP_REV


@@ -223,32 +224,11 @@ def _create_compression_subgraph(
         should_add_convert_node: bool,
         precomputed_compressed_weight: Optional[CompressedWeight] = None,
     ):
-        scale_dtype = ov.Type.f16
-        if compression_config.mode == CompressWeightsMode.NF4:
-            compression_dtype = ov.Type.nf4
-        elif compression_config.mode == CompressWeightsMode.MXFP4:
-            compression_dtype = ov.Type.f4e2m1
+        compression_dtype = DTYPE_MAP[compression_config.compression_dtype]
+        if compression_config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
             scale_dtype = ov.Type.f8e8m0
-        elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3:
-            compression_dtype = ov.Type.f8e4m3
-            scale_dtype = ov.Type.f8e8m0
-        elif compression_config.mode == CompressWeightsMode.FP8_E4M3:
-            compression_dtype = ov.Type.f8e4m3
-        elif compression_config.mode == CompressWeightsMode.FP4:
-            compression_dtype = ov.Type.f4e2m1
-        elif compression_config.mode == CompressWeightsMode.INT4_SYM:
-            compression_dtype = ov.Type.i4
-        elif compression_config.mode == CompressWeightsMode.INT4_ASYM:
-            compression_dtype = ov.Type.u4
-        elif compression_config.mode == CompressWeightsMode.INT8_SYM:
-            compression_dtype = ov.Type.i8
-        elif compression_config.mode == CompressWeightsMode.INT8_ASYM:
-            compression_dtype = ov.Type.u8
-        elif compression_config.is_codebook:
-            compression_dtype = None
         else:
-            msg = f"{compression_config.mode.value} is not supported."
-            raise nncf.ParameterNotSupportedError(msg)
+            scale_dtype = ov.Type.f16

         original_shape = weight.shape

@@ -261,8 +241,7 @@ def _create_compression_subgraph(
         )

         if compression_config.is_codebook:
-            n_quants = compressed_weight.codebook.size - 1
-            compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4)
+            compression_dtype = DTYPE_MAP[compression_config.compression_dtype]
             converted_const = create_ov_codebook_subgraph(
                 codebook=compressed_weight.codebook
                 if compression_config.mode == CompressWeightsMode.CODEBOOK
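
The codebook branch now takes its container dtype from compression_config.compression_dtype. A quick equivalence check I wrote (not part of the commit) showing that the size-based thresholds in the new property pick the same container type as the removed n_quants = size - 1 logic:

def old_rule(codebook_size: int) -> str:
    n_quants = codebook_size - 1
    return "u16" if n_quants > 255 else ("u8" if n_quants > 15 else "u4")

def new_rule(codebook_size: int) -> str:
    return "u4" if codebook_size <= 16 else ("u8" if codebook_size <= 256 else "u16")

assert all(old_rule(s) == new_rule(s) for s in range(1, 2049))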
