Commit 1451632

Nikita Savelyev and Copilot authored
Optimized compression for FP8 modes (#3748)
### Changes

Added optimized OpenVINO weights compression for the fp8e4m3 data type.

`optimum-cli export openvino` time:

| Model | Memory Before (MiB) | Memory After (MiB) | Time Before (sec) | Time After (sec) |
|--------------|---------------------|--------------------|-------------------|------------------|
| Llama-3.2-1B | 2328.03 | 2394.14 (+2.84%) | 63.52 | 14.03 (-77.92%) |
| Phi-4-mini | 5608.48 | 5197.70 (-7.33%) | 187.34 | 28.05 (-85.03%) |
| Llama-3.1-8B | 9918.52 | 8443.87 (-14.86%) | 399.14 | 48.48 (-87.86%) |

### Reason for changes

UX improvement.

### Tests

Extended existing tests.
https://github.com/openvinotoolkit/nncf/actions/runs/19767009608

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9065776 commit 1451632
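
For context, the new mode can be exercised from Python roughly like this (a minimal sketch, not code from this PR; it assumes an OpenVINO IR at `model.xml` and uses the public `nncf.compress_weights` API with the `FP8_E4M3` mode referenced in the diffs below):

```python
import openvino as ov

import nncf
from nncf import CompressWeightsMode

# Hypothetical input model; any OpenVINO IR with sufficiently large weights applies.
model = ov.Core().read_model("model.xml")

# FP8 (e4m3) weight compression; compatible modes are routed through the
# optimized OpenVINO-based compression functions touched by this commit.
compressed = nncf.compress_weights(model, mode=CompressWeightsMode.FP8_E4M3)

ov.save_model(compressed, "model_fp8.xml", compress_to_fp16=False)
```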

File tree

16 files changed: +558 -118 lines changed

src/nncf/openvino/optimized_functions/functions.py

Lines changed: 11 additions & 5 deletions
@@ -12,7 +12,6 @@
 from typing import Optional, Union
 
 import nncf
-from nncf import CompressWeightsMode
 from nncf.common.utils.caching import disable_results_caching
 from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE
 from nncf.openvino.optimized_functions.models import OVModelParameters
@@ -23,6 +22,8 @@
 from nncf.openvino.optimized_functions.models import get_integer_quantization_model
 from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.constants import OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
+from nncf.quantization.algorithms.weight_compression.constants import OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
 from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
 from nncf.tensor import Tensor
 from nncf.tensor import TensorBackend
@@ -49,6 +50,8 @@ def do_integer_quantization(
     :param precomputed_zero_point: Optional precomputed zero point tensor.
     :return: A tuple containing the compressed weights, scale, and zero point tensors.
     """
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
+
     weight_shape = weight.shape
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
     zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
@@ -117,7 +120,7 @@ def do_float_quantization(
     :param precomputed_scale: Optional precomputed scale.
     :return: Returns quantized weight tensor and corresponding scale tensor.
     """
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
 
     weight_shape = weight.shape
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
@@ -129,8 +132,7 @@ def do_float_quantization(
     if weight.backend == TensorBackend.ov:
         # Return ov tensors in target precision to seamlessly insert them into openvino model later
         ov_model_params.return_ov_tensors = True
-        weight_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
-        ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype})
+        ov_model_params.output_dtypes.update({"compressed_weight": config.compression_dtype})
 
     model = get_float_quantization_model(
         ov_model_params,
@@ -177,6 +179,8 @@ def integer_quantize_dequantize_weight(
     :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale,
         (and zero point).
     """
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
+
     # When reduction axes are not provided, assuming that the weights are already reshaped
     if config.group_size != -1 and reduction_axes is not None:
         # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
@@ -235,7 +239,7 @@ def float_quantize_dequantize_weight(
     :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
     :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
     """
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
 
     # When reduction axes are not provided, assuming that the weights are already reshaped
     if config.group_size != -1 and reduction_axes is not None:
@@ -290,6 +294,8 @@ def get_integer_quantization_error(
     :param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius".
     :return: The quantity characterizing the error of integer quantization.
     """
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
+
     if reduction not in ["max_mean", "frobenius"]:
         exception_str = f"Unsupported aggregation mode: {reduction}."
         raise nncf.InternalError(exception_str)
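
The new asserts restrict these entry points to the modes the optimized kernels actually support. A caller deciding whether to take the optimized path would also have to respect the size threshold added in constants.py; a minimal sketch of such a guard (the helper `_can_use_optimized_path` is hypothetical, not part of this diff):

```python
from nncf.quantization.algorithms.weight_compression.constants import (
    MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION,
    OPTIMIZED_COMPRESSION_COMPATIBLE_MODES,
)


def _can_use_optimized_path(config, weight_size: int) -> bool:
    # The optimized OpenVINO functions only accept the listed modes and are
    # intended for weights large enough to amortize the model-building cost.
    return (
        config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_MODES
        and weight_size >= MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
    )
```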

src/nncf/openvino/optimized_functions/models.py

Lines changed: 11 additions & 13 deletions
@@ -31,6 +31,7 @@
 from nncf.openvino.graph.node_utils import convert_op
 from nncf.openvino.graph.node_utils import non_convertable_divide_op
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.constants import FP_MAX_VALUES
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
 from nncf.tensor.functions.openvino_numeric import DTYPE_MAP as DTYPE_MAP_OV
@@ -579,8 +580,6 @@ def _build_float_quantization_model(
     reduction_axes: Optional[ReductionAxes] = None,
     return_nodes: bool = False,
 ) -> Union[ModelCallable, ModelAsNodes]:
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
-
     default_input_dtypes = {"scale": TensorDataType.float32}
     default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
 
@@ -605,7 +604,12 @@
     )
 
     # Validate output dtypes
-    valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1]
+    valid_compressed_weight_dtypes = [
+        TensorDataType.float32,
+        TensorDataType.nf4,
+        TensorDataType.f4e2m1,
+        TensorDataType.f8e4m3,
+    ]
     if compressed_weight_dtype not in valid_compressed_weight_dtypes:
         msg = (
             f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
@@ -633,23 +637,17 @@
     eps = np.finfo(np.float32).eps
     scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)
 
-    # Equals 1.0 for NF4
-    FP_MAX_VALS = {
-        CompressWeightsMode.MXFP4: 6.0,
-        CompressWeightsMode.FP4: 6.0,
-    }
-    if config.mode in FP_MAX_VALS:
-        scale = divide_op(scale, opset.constant(FP_MAX_VALS[config.mode], ov.Type.f32))
+    if config.compression_dtype != TensorDataType.nf4:
+        scale = divide_op(scale, opset.constant(FP_MAX_VALUES[config.compression_dtype], ov.Type.f32))
 
-    if config.mode == CompressWeightsMode.MXFP4:
+    if config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
         scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32))
         scale = opset.ceil(scale)
         scale = opset.clamp(scale, -127.0, 127.0)
         scale = opset.power(opset.constant(2.0, ov.Type.f32), scale)
 
     compressed_weight = divide_op(weight, scale)
-    target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1
-    compressed_weight = convert_op(compressed_weight, target_dtype)
+    compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[config.compression_dtype])
     compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])
 
     ov_results = [compressed_weight]
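
For f8e4m3, the scale handling encoded in the graph above boils down to dividing the raw scale by 448 and, for the MX variant, rounding it up to a power of two with a clamped exponent. A rough numpy analogue (using the per-axis abs-max as the raw scale is an assumption; the actual scale is produced earlier in the model builder):

```python
import numpy as np

F8E4M3_MAX = 448.0  # FP_MAX_VALUES[TensorDataType.f8e4m3] in the diff


def fp8_e4m3_scale(weight: np.ndarray, axis: int = -1, mx: bool = False) -> np.ndarray:
    # Raw per-axis scale (assumed abs-max, computed upstream in the real code).
    scale = np.max(np.abs(weight), axis=axis, keepdims=True).astype(np.float32)
    # Epsilon guard, mirroring opset.select(opset.less(opset.abs(scale), eps), eps, scale).
    scale = np.maximum(scale, np.finfo(np.float32).eps)
    # Map the scale onto the finite f8e4m3 range (division by FP_MAX_VALUES).
    scale = scale / F8E4M3_MAX
    if mx:
        # MXFP8_E4M3: power-of-two scale with the exponent clamped to [-127, 127].
        scale = 2.0 ** np.clip(np.ceil(np.log2(scale)), -127.0, 127.0)
    return scale


w = np.random.randn(8, 64).astype(np.float32)
s = fp8_e4m3_scale(w, mx=True)
q = w / s  # this tensor is then cast to f8e4m3; dequantization is q * s
```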

src/nncf/quantization/algorithms/weight_compression/config.py

Lines changed: 33 additions & 1 deletion
@@ -46,7 +46,14 @@ def num_bits(self):
         """
         :return: number of bits that is used for storing a single quantized value in the given mode.
         """
-        return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4
+        if self.mode in [
+            CompressWeightsMode.INT8_SYM,
+            CompressWeightsMode.INT8_ASYM,
+            CompressWeightsMode.FP8_E4M3,
+            CompressWeightsMode.MXFP8_E4M3,
+        ]:
+            return 8
+        return 4
 
     @property
     def is_asym_mode(self):
@@ -74,6 +81,31 @@ def is_codebook(self):
         """
         return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]
 
+    @property
+    def compression_dtype(self) -> TensorDataType:
+        """
+        :return: data type that is used to store compressed weights.
+        """
+        if self.is_codebook:
+            n_quants = self.codebook_values.size
+            if n_quants <= 16:
+                return TensorDataType.uint4
+            if n_quants <= 256:
+                return TensorDataType.uint8
+            return TensorDataType.uint16
+        dtype_per_mode = {
+            CompressWeightsMode.INT4_SYM: TensorDataType.int4,
+            CompressWeightsMode.INT4_ASYM: TensorDataType.uint4,
+            CompressWeightsMode.INT8_ASYM: TensorDataType.uint8,
+            CompressWeightsMode.INT8_SYM: TensorDataType.int8,
+            CompressWeightsMode.NF4: TensorDataType.nf4,
+            CompressWeightsMode.FP4: TensorDataType.f4e2m1,
+            CompressWeightsMode.MXFP4: TensorDataType.f4e2m1,
+            CompressWeightsMode.FP8_E4M3: TensorDataType.f8e4m3,
+            CompressWeightsMode.MXFP8_E4M3: TensorDataType.f8e4m3,
+        }
+        return dtype_per_mode[self.mode]
+
     def get_numpy_codebook(self):
         return self.codebook_values.as_numpy_tensor()
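
A quick illustration of the two properties touched here (assuming `WeightCompressionConfig` can be constructed with just a mode, since its other fields appear to have defaults):

```python
from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import TensorDataType

cfg = WeightCompressionConfig(mode=CompressWeightsMode.FP8_E4M3)
assert cfg.num_bits == 8                               # FP8 modes now report 8 bits per value
assert cfg.compression_dtype == TensorDataType.f8e4m3  # storage dtype resolved from the mode
```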

src/nncf/quantization/algorithms/weight_compression/constants.py

Lines changed: 33 additions & 0 deletions
@@ -11,6 +11,9 @@
 
 import numpy as np
 
+from nncf.parameters import CompressWeightsMode
+from nncf.tensor import TensorDataType
+
 NF4_QUANTILES = np.array(
     [
         -1.0,
@@ -101,3 +104,33 @@
 
 
 CENTER_OF_F4E2M1_QUANTILES = (F4E2M1_QUANTILES[1:] + F4E2M1_QUANTILES[:-1]) / 2
+
+
+FP_MAX_VALUES = {
+    TensorDataType.nf4: 1.0,
+    TensorDataType.f4e2m1: 6.0,
+    TensorDataType.f8e4m3: 448.0,
+}
+
+
+MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES = (
+    CompressWeightsMode.INT8_ASYM,
+    CompressWeightsMode.INT8_SYM,
+    CompressWeightsMode.INT4_ASYM,
+    CompressWeightsMode.INT4_SYM,
+)
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES = (
+    CompressWeightsMode.NF4,
+    CompressWeightsMode.MXFP4,
+    CompressWeightsMode.FP4,
+    CompressWeightsMode.FP8_E4M3,
+    CompressWeightsMode.MXFP8_E4M3,
+)
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_MODES = (
+    *OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES,
+    *OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES,
+)
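
The 448.0 entry is the largest finite f8e4m3 value (4-bit exponent, 3-bit mantissa, no infinities), just as 6.0 is the f4e2m1 maximum; a quick sanity check:

```python
# Largest finite f8e4m3 value: 2**8 * 1.75 (mantissa 1.110b), since 1.111b maps to NaN.
assert 2**8 * (1 + 6 / 8) == 448.0
# Largest f4e2m1 value: 2**2 * 1.5 (mantissa 1.1b).
assert 2**2 * 1.5 == 6.0
```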
