Skip to content

Commit 7c22e95

Browse files
[WC][OV] NVFP4 support
1 parent 27204ad commit 7c22e95

File tree

18 files changed

+1169
-158
lines changed

18 files changed

+1169
-158
lines changed

.ci/cspell_dict.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ nsamples
310310
nsga
311311
numels
312312
nvcc
313+
nvfp
313314
objwalk
314315
odict
315316
oimp

docs/Algorithms.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
- Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4)
1515
- MX-compliant types - MXFP4 and MXFP8_E4M3
1616
- FP types - FP8_E4M3 and FP4
17+
- NVFP4 type
1718
- Mixed precision weights compression
1819
- Grouped weights compression
1920

docs/usage/post_training_compression/weights_compression/Usage.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ NNCF can automatically distribute precision assignments based on quantization se
5555
| MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
5656
| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
5757
| FP4 | E2M1 | FP16 | Per-channel / Group-wise | [FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
58+
| NVFP4 | E2M1 | E4M3 per group / FP32 per weight | Group-wise (16) | [NVFP4](https://www.arxiv.org/pdf/2602.14582) |
5859

5960
**Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters.
6061

@@ -706,7 +707,7 @@ Accuracy/footprint trade-off for `microsoft/Phi-3-mini-4k-instruct`:
706707
- The compression applies in-place.
707708
- The compressed model is not trainable.
708709
- INT4_SYM, INT4_ASYM, NF4 and E2M1 modes, grouped quantization and mixed precision selection is available for OpenVINO backend only.
709-
- NF4, MXFP4, MXFP8_E4M3 support is experimental on GPU and NPU - models quantized to nf4/mxfp4/mxfp8_e4m3 should not be faster models quantized to 8-bit integer.
710+
- NF4, MXFP4, MXFP8_E4M3, NVFP4 support is experimental on GPU and NPU - models quantized to nf4/mxfp4/mxfp8_e4m3/nvfp4 should not be faster than models quantized to 8-bit integer.
710711

711712
### Additional resources
712713

src/nncf/openvino/optimized_functions/functions.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
2424
from nncf.quantization.algorithms.weight_compression.constants import OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
2525
from nncf.quantization.algorithms.weight_compression.constants import OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
26+
from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight
2627
from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
2728
from nncf.tensor import Tensor
2829
from nncf.tensor import TensorBackend
@@ -37,7 +38,7 @@ def do_integer_quantization(
3738
reduction_axes: ReductionAxes | None = None,
3839
precomputed_scale: Tensor = None,
3940
precomputed_zero_point: Tensor = None,
40-
) -> tuple[Tensor, Tensor, Tensor]:
41+
) -> CompressedWeight:
4142
"""
4243
Quantizes the given weight tensor to an integer data type.
4344
@@ -47,7 +48,7 @@ def do_integer_quantization(
4748
precomputed scale (and zero point) are provided.
4849
:param precomputed_scale: Optional precomputed scale tensor.
4950
:param precomputed_zero_point: Optional precomputed zero point tensor.
50-
:return: A tuple containing the compressed weights, scale, and zero point tensors.
51+
:return: A CompressedWeight object containing the compressed weights, scale, and zero point tensors.
5152
"""
5253
assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
5354

@@ -100,15 +101,15 @@ def do_integer_quantization(
100101
compressed_weight = model(inputs)[0]
101102
scale, zero_point = precomputed_scale, precomputed_zero_point
102103

103-
return compressed_weight, scale, zero_point
104+
return CompressedWeight(compressed_weight, scale, zero_point=zero_point)
104105

105106

106107
def do_float_quantization(
107108
weight: Tensor,
108109
config: WeightCompressionConfig,
109110
reduction_axes: ReductionAxes | None = None,
110111
precomputed_scale: Tensor | None = None,
111-
) -> tuple[Tensor, Tensor, Tensor]:
112+
) -> CompressedWeight:
112113
"""
113114
Computes quantization scale if not provided, and performs corresponding float weight quantization.
114115
NF4 format uses 16 levels in [-1, 1] range, while FP4/MXFP4 uses 16 levels in [-6, 6].
@@ -117,7 +118,7 @@ def do_float_quantization(
117118
:param config: Weight compression configuration.
118119
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
119120
:param precomputed_scale: Optional precomputed scale.
120-
:return: Returns quantized weight tensor and corresponding scale tensor.
121+
:return: A CompressedWeight object containing the quantized weights and scale.
121122
"""
122123
assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
123124

@@ -153,7 +154,7 @@ def do_float_quantization(
153154
compressed_weight = model([weight, precomputed_scale])[0]
154155
scale = precomputed_scale
155156

156-
return compressed_weight, scale, None
157+
return CompressedWeight(compressed_weight, scale)
157158

158159

159160
def integer_quantize_dequantize_weight(
@@ -163,7 +164,7 @@ def integer_quantize_dequantize_weight(
163164
precomputed_scale: Tensor | None = None,
164165
precomputed_zero_point: Tensor | None = None,
165166
return_compressed_weight: bool | None = False,
166-
) -> Tensor | tuple[Tensor, Tensor, Tensor, Tensor]:
167+
) -> Tensor | tuple[Tensor, CompressedWeight]:
167168
"""
168169
Quantizes the given weight tensor to an integer data type and then dequantizes it back to obtain float32 values.
169170
@@ -175,8 +176,7 @@ def integer_quantize_dequantize_weight(
175176
:param precomputed_zero_point: Optional precomputed zero point tensor.
176177
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight, scale,
177178
(and zero point).
178-
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale,
179-
(and zero point).
179+
:return: Dequantized weight tensor or a tuple containing the decompressed weight and a CompressedWeight object.
180180
"""
181181
assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
182182

@@ -217,7 +217,7 @@ def integer_quantize_dequantize_weight(
217217
else:
218218
decompressed_weight, compressed_weight, scale, zero_point = results
219219
if return_compressed_weight:
220-
return decompressed_weight, compressed_weight, scale, zero_point
220+
return decompressed_weight, CompressedWeight(compressed_weight, scale, zero_point=zero_point)
221221
return decompressed_weight
222222

223223

@@ -227,7 +227,7 @@ def float_quantize_dequantize_weight(
227227
reduction_axes: ReductionAxes | None = None,
228228
precomputed_scale: Tensor | None = None,
229229
return_compressed_weight: bool | None = False,
230-
) -> Tensor | tuple[Tensor, Tensor, Tensor]:
230+
) -> Tensor | tuple[Tensor, CompressedWeight]:
231231
"""
232232
First quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
233233
@@ -236,7 +236,7 @@ def float_quantize_dequantize_weight(
236236
:param reduction_axes: Axes along which to reduce statistics. Not required if precomputed scale are provided.
237237
:param precomputed_scale: Optional precomputed scale tensor.
238238
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
239-
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
239+
:return: Dequantized weight tensor or a tuple containing the decompressed weight and a CompressedWeight object.
240240
"""
241241
assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
242242

@@ -270,7 +270,7 @@ def float_quantize_dequantize_weight(
270270
else:
271271
decompressed_weight, compressed_weight, scale = results
272272
if return_compressed_weight:
273-
return decompressed_weight, compressed_weight, scale
273+
return decompressed_weight, CompressedWeight(compressed_weight, scale)
274274
return decompressed_weight
275275

276276

src/nncf/parameters.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ class CompressWeightsMode(StrEnum):
9494
:param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
9595
:param FP8_E4M3: A FP8 format with E4M3 values sharing group-level fp16 scale.
9696
:param FP4: A FP4 format with E2M1 values sharing group-level fp16 scale.
97+
:param NVFP4: A FP4 format with E2M1 values sharing group-level E4M3 scale and FP32 per weight scale.
98+
The size of group is 16.
9799
:param CODEBOOK: Codebook (LUT) quantization format.
98100
:param ADAPTIVE_CODEBOOK: Adaptive codebook (LUT) quantization format.
99101
:param CB4: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
@@ -110,6 +112,7 @@ class CompressWeightsMode(StrEnum):
110112
MXFP8_E4M3 = "mxfp8_e4m3"
111113
FP8_E4M3 = "fp8_e4m3"
112114
FP4 = "fp4"
115+
NVFP4 = "nvfp4"
113116
CODEBOOK = "codebook"
114117
ADAPTIVE_CODEBOOK = "adaptive_codebook"
115118

src/nncf/quantization/algorithms/weight_compression/algorithm.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ def get_weight_compression_configuration(
9494
elif group_size is None and mode in NON_INT8_MODES:
9595
if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
9696
group_size = 32
97+
elif mode == CompressWeightsMode.NVFP4:
98+
group_size = 16
9799
elif mode in [
98100
CompressWeightsMode.CODEBOOK,
99101
CompressWeightsMode.CB4,
@@ -275,14 +277,18 @@ def check_user_compression_configuration(
275277
f"Supported modes are: {[e.value for e in GroupSizeFallbackMode]}."
276278
)
277279
raise nncf.ValidationError(msg)
278-
if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
279-
if group_size not in [None, 32]:
280+
if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, CompressWeightsMode.NVFP4]:
281+
if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] and group_size not in [None, 32]:
280282
msg = f"MXFP4 and MXFP8_E4M3 types only support group size of 32, group size of {group_size} is given"
281283
raise nncf.ValidationError(msg)
282284

285+
if mode == CompressWeightsMode.NVFP4 and group_size not in [None, 16]:
286+
msg = f"NVFP4 type only supports group size of 16, group size of {group_size} is given"
287+
raise nncf.ValidationError(msg)
288+
283289
if advanced_parameters and advanced_parameters.group_size_fallback_mode is GroupSizeFallbackMode.ADJUST:
284290
msg = (
285-
"MXFP4 and MXFP8_E4M3 types do not support the group size"
291+
"MXFP4, MXFP8_E4M3 and NVFP4 types do not support the group size"
286292
f" fallback mode {advanced_parameters.group_size_fallback_mode.value}."
287293
" Please use other group size fallback mode."
288294
)
@@ -332,6 +338,8 @@ def __init__(
332338
MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
333339
FP8_E4M3 is FP8 with E4M3 values sharing group-level FP16 scale.
334340
FP4 is FP4 with E2M1 values sharing group-level FP16 scale.
341+
NVFP4 is FP4 format with E2M1 values sharing group-level E4M3 scale and FP32 per weight scale.
342+
The size of group is 16.
335343
:param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
336344
and the rest to backup_mode).
337345
:param group_size: number of weights (e.g. 128) in the channel dimension

src/nncf/quantization/algorithms/weight_compression/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def is_integer(self):
7070
CompressWeightsMode.MXFP8_E4M3,
7171
CompressWeightsMode.FP8_E4M3,
7272
CompressWeightsMode.FP4,
73+
CompressWeightsMode.NVFP4,
7374
CompressWeightsMode.CODEBOOK,
7475
CompressWeightsMode.ADAPTIVE_CODEBOOK,
7576
CompressWeightsMode.CB4,
@@ -106,6 +107,7 @@ def compression_dtype(self) -> TensorDataType:
106107
CompressWeightsMode.NF4: TensorDataType.nf4,
107108
CompressWeightsMode.FP4: TensorDataType.f4e2m1,
108109
CompressWeightsMode.MXFP4: TensorDataType.f4e2m1,
110+
CompressWeightsMode.NVFP4: TensorDataType.f4e2m1,
109111
CompressWeightsMode.FP8_E4M3: TensorDataType.f8e4m3,
110112
CompressWeightsMode.MXFP8_E4M3: TensorDataType.f8e4m3,
111113
}

src/nncf/quantization/algorithms/weight_compression/lora_correction.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,13 +172,11 @@ def calculate_low_rank_matrices(
172172
reduction_axis = reduction_axes[0] if compression_config.group_size != -1 else -1
173173
if mode in (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM):
174174
fq_weights = do_integer_dequantization(
175-
compressed_weight.tensor,
176-
compressed_weight.scale,
177-
compressed_weight.zero_point,
175+
compressed_weight,
178176
reduction_axis,
179177
)
180178
elif mode == CompressWeightsMode.NF4:
181-
fq_weights = do_float_dequantization(compressed_weight.tensor, compressed_weight.scale, reduction_axis)
179+
fq_weights = do_float_dequantization(compressed_weight, reduction_axis)
182180
else:
183181
msg = (
184182
f"{mode.value} mode is invalid for Lora Correction algorithm. Supported modes: INT4_SYM, INT4_ASYM, NF4"

src/nncf/quantization/algorithms/weight_compression/openvino_backend.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,6 @@ def _create_compression_subgraph(
224224
should_add_convert_node: bool,
225225
precomputed_compressed_weight: CompressedWeight | None = None,
226226
):
227-
compression_dtype = DTYPE_MAP[compression_config.compression_dtype]
228-
scale_dtype = (
229-
ov.Type.f8e8m0
230-
if compression_config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]
231-
else ov.Type.f16
232-
)
233-
234227
original_shape = weight.shape
235228

236229
with disable_results_caching(OV_MODEL_CACHE):
@@ -241,6 +234,14 @@ def _create_compression_subgraph(
241234
precomputed_compressed_weight,
242235
)
243236

237+
compression_dtype = DTYPE_MAP[compression_config.compression_dtype]
238+
239+
scale_dtype = ov.Type.f16
240+
if compression_config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
241+
scale_dtype = ov.Type.f8e8m0
242+
elif compression_config.mode == CompressWeightsMode.NVFP4:
243+
scale_dtype = ov.Type.f8e4m3
244+
244245
if compression_config.is_codebook:
245246
converted_const = create_ov_codebook_subgraph(
246247
codebook=compressed_weight.codebook
@@ -266,6 +267,19 @@ def _create_compression_subgraph(
266267
)
267268

268269
scale_const = create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale")
270+
271+
if compressed_weight.second_degree_scale is not None:
272+
sec_order_scale = create_ov_const_from_tensor(
273+
compressed_weight.second_degree_scale, ov.Type.f32, name=f"{const_node_name}/second_order_scale"
274+
)
275+
scale_const = convert_op(scale_const, ov.Type.f32)
276+
277+
scale_const = opset.multiply(
278+
scale_const,
279+
sec_order_scale,
280+
name=f"{const_node_name}/dequantized_scale_{weight_port_id}",
281+
)
282+
269283
scale_const = convert_op(scale_const, ov.Type.f16)
270284

271285
mul = opset.multiply(

src/nncf/quantization/algorithms/weight_compression/parameters.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,4 @@ class CompressedWeight:
3030
scale: Tensor | None = None
3131
zero_point: Tensor | None = None
3232
codebook: Tensor | None = None
33-
34-
def is_codebook(self):
35-
"""
36-
Check if the compressed weight is a codebook.
37-
38-
:return: True if the compressed weight is a codebook, False otherwise.
39-
"""
40-
return self.codebook is not None and self.tensor is not None and self.scale is not None
33+
second_degree_scale: Tensor | None = None

0 commit comments

Comments
 (0)