Changes from all commits (27 commits)
a42f1eb
WIP
nikita-savelyevv Jun 17, 2025
17a3aec
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Jun 23, 2025
b2e090c
Add round to nearest logic for numpy case
nikita-savelyevv Jul 22, 2025
66c0366
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Jul 29, 2025
a345984
Tweaks
nikita-savelyevv Jul 29, 2025
6e3ba6e
Temporarily install OV nightly
nikita-savelyevv Jul 29, 2025
7555794
Update src/nncf/version.py
nikita-savelyevv Aug 21, 2025
999d54f
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Sep 3, 2025
83770e1
[OpenVINO][WC] E5M2 and E4M3 FP8 weights compression support
daniil-lyakhov Sep 22, 2025
8054217
MXFP4/MXFP8_E4M3
daniil-lyakhov Sep 30, 2025
3d944de
Expand wc docs with a table
daniil-lyakhov Oct 1, 2025
0c48792
Codebook is removed from wc docs
daniil-lyakhov Oct 1, 2025
ac2f05c
Type
daniil-lyakhov Oct 1, 2025
1e23ecf
Apply suggestions from code review
daniil-lyakhov Oct 2, 2025
33aae33
Typos/pre-commit
daniil-lyakhov Oct 2, 2025
e4d47ab
Fix adjust group size
daniil-lyakhov Oct 7, 2025
2aaec38
Revert "Fix adjust group size"
daniil-lyakhov Oct 8, 2025
ab6aa74
Fail for MX with adjust fallback mode
daniil-lyakhov Oct 8, 2025
a25b5c3
Update src/nncf/quantization/algorithms/weight_compression/weight_low…
daniil-lyakhov Oct 8, 2025
a64b30c
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
83d09fc
Merge branch 'dl/FP8' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
c227bac
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
c026573
Revert nightly installation
nikita-savelyevv Oct 8, 2025
a000f87
Post-merge fixes
nikita-savelyevv Oct 8, 2025
831bf25
Post-merge fixes part 2
nikita-savelyevv Oct 8, 2025
344b94b
Increase test weight channel size
nikita-savelyevv Oct 8, 2025
089631a
Address suggested changes
nikita-savelyevv Oct 13, 2025
2 changes: 2 additions & 0 deletions src/nncf/openvino/graph/nncf_graph_builder.py
@@ -44,6 +44,8 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype:
type_name = ov_type.get_type_name()
conversion_map = {
"nf4": "float",
"f4e2m1": "float",
"f8e8m0": "float",
"f8e4m3": "float",
"f8e5m2": "float",
"f16": "float",
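The two new map entries make the MX element types visible to the NNCF graph as ordinary float tensors. A minimal sketch of the intended behavior, assuming `ov.Type.f4e2m1`/`ov.Type.f8e8m0` are available (recent OpenVINO builds) and that `Dtype` is the enum from `nncf.common.graph.layer_attributes`, as referenced in the signature above:

```python
import openvino as ov

from nncf.common.graph.layer_attributes import Dtype
from nncf.openvino.graph.nncf_graph_builder import convert_to_nncf_dtype

# Both MX element types are reported as floating-point dtypes when building the NNCF graph.
assert convert_to_nncf_dtype(ov.Type.f4e2m1) == Dtype.FLOAT
assert convert_to_nncf_dtype(ov.Type.f8e8m0) == Dtype.FLOAT
```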
14 changes: 7 additions & 7 deletions src/nncf/openvino/optimized_functions/functions.py
@@ -107,17 +107,16 @@ def do_float_quantization(
precomputed_scale: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor, Tensor]:
"""
Computes quantization scale if not provided, and performs corresponding nf4 weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
Computes quantization scale if not provided, and performs the corresponding float weight quantization.
NF4 uses 16 levels in the [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6].

:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor.
:return: Returns quantized weight tensor and corresponding scale tensor.
"""
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]

weight_shape = weight.shape
scale_shape = None if precomputed_scale is None else precomputed_scale.shape
@@ -129,7 +128,8 @@
if weight.backend == TensorBackend.ov:
# Return ov tensors in target precision to seamlessly insert them into openvino model later
ov_model_params.return_ov_tensors = True
ov_model_params.output_dtypes.update({"compressed_weight": TensorDataType.nf4})
weight_dtype = TensorDataType.f4e2m1 if config.mode == CompressWeightsMode.MXFP4 else TensorDataType.nf4
ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype})

model = get_float_quantization_model(
ov_model_params,
@@ -235,7 +235,7 @@ def float_quantize_dequantize_weight(
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
"""
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]

# When reduction axes are not provided, assuming that the weights are already reshaped
if config.group_size != -1 and reduction_axes is not None:
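With the widened asserts, the optimized OpenVINO path accepts MXFP4 configs in addition to NF4. A hedged usage sketch follows; NNCF normally dispatches here internally, and the `WeightCompressionConfig(mode=..., group_size=...)` arguments plus the grouped output shapes are assumptions based on the surrounding code rather than a documented API:

```python
import numpy as np

from nncf.openvino.optimized_functions import do_float_quantization
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

weight = Tensor(np.random.randn(64, 128).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.MXFP4, group_size=32)

# Scale is computed per group of 32 input channels; the weight is quantized to E2M1 levels scaled by it.
compressed_weight, scale, _ = do_float_quantization(weight, config, reduction_axes=(1,))
print(compressed_weight.shape, scale.shape)  # e.g. (64, 4, 32) and (64, 4, 1)
```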
18 changes: 13 additions & 5 deletions src/nncf/openvino/optimized_functions/models.py
@@ -286,7 +286,8 @@ def get_float_quantization_model(
reduction_axes: Optional[ReductionAxes] = None,
) -> Union[ModelCallable, ModelAsNodes]:
"""
Get a model that compresses weights to float (currently only nf4) destination type using the given configuration.
Get a model that compresses weights to float (currently nf4 or mxfp4) destination type using the given
configuration.

:param ov_model_params: OV model parameters.
:param config: Compression configuration.
@@ -571,7 +572,7 @@ def _build_float_quantization_model(
reduction_axes: Optional[ReductionAxes] = None,
return_nodes: bool = False,
) -> Union[ModelCallable, ModelAsNodes]:
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]

default_input_dtypes = {"scale": TensorDataType.float32}
default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
@@ -597,8 +598,7 @@
)

# Validate output dtypes
# TODO: add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4]
valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1]
if compressed_weight_dtype not in valid_compressed_weight_dtypes:
msg = (
f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
@@ -626,8 +626,16 @@
eps = np.finfo(np.float32).eps
scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)

if config.mode == CompressWeightsMode.MXFP4:
scale = scale / opset.constant(6.0, ov.Type.f32)
scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32))
scale = opset.ceil(scale)
scale = opset.clamp(scale, -127.0, 127.0)
scale = opset.power(opset.constant(2.0, ov.Type.f32), scale)

compressed_weight = divide_op(weight, scale)
compressed_weight = convert_op(compressed_weight, ov.Type.nf4)
target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1
compressed_weight = convert_op(compressed_weight, target_dtype)
compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])

ov_results = [compressed_weight]
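For MXFP4 the per-group max-abs scale is turned into an E8M0 shared scale: divide by 6 (the largest E2M1 magnitude), round the result up to a power of two, and clamp the exponent to [-127, 127]. A numpy sketch of the same arithmetic as the `opset` graph above, for illustration only:

```python
import numpy as np

def mxfp4_e8m0_scale(max_abs: np.ndarray) -> np.ndarray:
    """Power-of-two scale such that weight / scale falls into the E2M1 range [-6, 6]."""
    eps = np.finfo(np.float32).eps
    scale = np.where(np.abs(max_abs) < eps, eps, max_abs).astype(np.float32)
    scale = scale / 6.0                           # 6.0 is the largest representable E2M1 magnitude
    exponent = np.ceil(np.log2(scale))            # round the scale up to the next power of two
    exponent = np.clip(exponent, -127.0, 127.0)   # E8M0 exponent range
    return (2.0 ** exponent).astype(np.float32)

group_max = np.array([0.75, 12.0, 100.0], dtype=np.float32)
print(mxfp4_e8m0_scale(group_max))  # e.g. [ 0.125  2.    32.  ]
```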
src/nncf/quantization/algorithms/weight_compression/constants.py
@@ -33,6 +33,27 @@
dtype=np.float32,
)

MXFP4_QUANTILES = np.array(
[
-6.0,
-4.0,
-3.0,
-2.0,
-1.5,
-1.0,
-0.5,
-0.0,
0.5,
1.0,
1.5,
2.0,
3.0,
4.0,
6.0,
],
dtype=np.float32,
)


CB4_QUANTILES = np.array(
[
@@ -77,3 +98,6 @@
],
dtype=np.float32,
)


CENTER_OF_MXFP4_QUANTILES = (MXFP4_QUANTILES[1:] + MXFP4_QUANTILES[:-1]) / 2
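`MXFP4_QUANTILES` above lists the 15 distinct values representable in FP4 E2M1 (a single zero, the ±0.5 subnormals, and normals ±1…±6), and `CENTER_OF_MXFP4_QUANTILES` holds the 14 midpoints used for nearest-value lookup. A short sanity-check sketch regenerating the set from the E2M1 definition (1 sign bit, 2 exponent bits with bias 1, 1 mantissa bit):

```python
import numpy as np

from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_MXFP4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import MXFP4_QUANTILES

# Magnitudes: zero, the subnormal 0.5, and normals {1.0, 1.5} * 2**e for e in {0, 1, 2}.
magnitudes = sorted({0.0, 0.5} | {m * 2.0**e for e in range(3) for m in (1.0, 1.5)})
e2m1 = np.array([-m for m in reversed(magnitudes)] + magnitudes[1:], dtype=np.float32)

assert np.array_equal(e2m1, MXFP4_QUANTILES)                                  # 15 distinct E2M1 values
assert np.array_equal((e2m1[1:] + e2m1[:-1]) / 2, CENTER_OF_MXFP4_QUANTILES)  # 14 LUT midpoints
```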
src/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -19,7 +19,9 @@
from nncf.errors import UnsupportedModelError
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_MXFP4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import MXFP4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES
from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight
from nncf.quantization.fake_quantize import calculate_scale_zero_point
@@ -147,15 +149,15 @@ def do_float_quantization(
"""
Computes quantization scale if not provided,
and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
NF4 uses 16 levels in the [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6].
For MXFP8_E4M3 and CODEBOOK, currently returns the normalized weight without quantization.

:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and
:return: Returns quantized (for MXFP8_E4M3 and codebook normalized) weight tensor and corresponding scale tensor and
optional indexes for codebook.
"""
assert not config.is_integer
@@ -165,7 +167,7 @@ def do_float_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov

return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -180,19 +182,19 @@ def do_float_quantization(
if scale is None:
scale = calculate_float_quantization_params(weight, reduction_axes, config)
norm_weight = _calculate_normalized_weight(weight, scale)
if config.mode == CompressWeightsMode.NF4:
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]:
if original_weight_backend == TensorBackend.ov:
# Can convert through OpenVINO and return OpenVINO-native NF4 tensor
compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4)
# Can convert through OpenVINO and return OpenVINO-native nf4/f4e2m1 tensor
target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype)
else:
compressed_weight = _calculate_nf4_quantized_weight(norm_weight)
compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode)
elif config.is_codebook:
compressed_weight, indexes = _calculate_codebook_quantized_weight(
norm_weight, quantiles=config.get_numpy_codebook()
)
return compressed_weight, scale, indexes
else:
# TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
compressed_weight = norm_weight
return compressed_weight, scale, None

@@ -205,8 +207,8 @@ def float_quantize_dequantize_weight(
return_compressed_weight: Optional[bool] = False,
) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]:
"""
First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values.
MXFP4 and MXFP8_E4M3 mode is currently not supported.
First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values.
MXFP8_E4M3 mode is currently not supported.

:param weight: The weight tensor to quantize-dequantize.
:param config: Compression configuration.
@@ -215,11 +217,15 @@ def float_quantize_dequantize_weight(
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
"""
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]
# TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3, once ticket 164851 is resolved
assert config.mode in [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]

# Optimized implementation
if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
)
@@ -508,17 +514,30 @@ def integer_quantize_dequantize_weight(
return decompressed_weight


def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor:
def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeightsMode) -> Tensor:
"""
Performs NF4 quantization. Look-up table is used to "round" or "quantize" to the closest quant.
Performs float (currently NF4 or MXFP4) quantization. A look-up table is used to "round" or "quantize" each
value to the closest quant.

:param norm_weight: Weight tensor to quantize already normalized to [-1, 1] range.
:return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants on [-1, 1].
:param norm_weight: Normalized weight tensor to quantize.
:param mode: Compression mode defining the look-up table to use, NF4 or MXFP4.
:return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants.
"""
center_nf4_quantiles = fns.from_numpy(CENTER_OF_NF4_QUANTILES, backend=norm_weight.backend)
indexes = fns.searchsorted(center_nf4_quantiles, norm_weight)
nf4_quantiles = fns.from_numpy(NF4_QUANTILES, backend=indexes.backend)
quantized_weight = nf4_quantiles[indexes]
assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]
quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else MXFP4_QUANTILES
quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_MXFP4_QUANTILES
quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend)
indexes = fns.searchsorted(quantile_centers, norm_weight)
quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend)

if mode == CompressWeightsMode.MXFP4:
# If exactly between two quantiles, round half to even: the tie goes to the quantile with an even mantissa.
shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1)
dist_left = fns.abs(norm_weight - quantiles[indexes])
dist_right = fns.abs(norm_weight - quantiles[shifted_indexes])
choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((shifted_indexes + 1) % 2 == 0))
indexes = fns.where(choose_right, shifted_indexes, indexes)

quantized_weight = quantiles[indexes]
return quantized_weight


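End to end, the numpy MXFP4 path normalizes each group by its scale and then snaps the result to the nearest E2M1 value via the midpoint look-up, breaking exact ties toward the value with an even mantissa. A self-contained numpy sketch of that lookup, mirroring the `searchsorted` logic above (an illustration, not the NNCF implementation):

```python
import numpy as np

MXFP4 = np.array([-6, -4, -3, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=np.float32)
CENTERS = (MXFP4[1:] + MXFP4[:-1]) / 2

def quantize_to_mxfp4(norm_weight: np.ndarray) -> np.ndarray:
    """Snap already-normalized values (expected within [-6, 6]) to the nearest E2M1 quantile."""
    idx = np.searchsorted(CENTERS, norm_weight)
    right = np.clip(idx + 1, 0, MXFP4.size - 1)
    dist_left = np.abs(norm_weight - MXFP4[idx])
    dist_right = np.abs(norm_weight - MXFP4[right])
    # On an exact tie, (right + 1) % 2 == 0 selects the odd array index, i.e. the E2M1 value
    # with an even mantissa bit -- round-half-to-even, as in the code above.
    choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((right + 1) % 2 == 0))
    return MXFP4[np.where(choose_right, right, idx)]

x = np.array([-5.1, -0.3, 0.74, 2.5, 5.0], dtype=np.float32)
print(quantize_to_mxfp4(x))  # e.g. [-6.  -0.5  0.5  2.   4. ]
```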
2 changes: 2 additions & 0 deletions src/nncf/tensor/definitions.py
@@ -44,6 +44,8 @@ class TensorDataType(StrEnum):
float64 = auto()
f8e4m3 = auto()
f8e5m2 = auto()
f8e8m0 = auto()
f4e2m1 = auto()
nf4 = auto()
int8 = auto()
int32 = auto()
6 changes: 5 additions & 1 deletion src/nncf/tensor/functions/openvino_numeric.py
@@ -23,6 +23,8 @@

DTYPE_MAP: dict[TensorDataType, ov.Type] = {
TensorDataType.nf4: ov.Type.nf4,
TensorDataType.f4e2m1: ov.Type.f4e2m1,
TensorDataType.f8e8m0: ov.Type.f8e8m0,
TensorDataType.f8e4m3: ov.Type.f8e4m3,
TensorDataType.f8e5m2: ov.Type.f8e5m2,
TensorDataType.float16: ov.Type.f16,
@@ -42,6 +44,8 @@
TensorDataType.int4,
TensorDataType.uint4,
TensorDataType.nf4,
TensorDataType.f4e2m1,
TensorDataType.f8e8m0,
TensorDataType.f8e4m3,
TensorDataType.f8e5m2,
]
@@ -95,7 +99,7 @@ def _(a: ov.Tensor, shape: Union[int, tuple[int, ...]]) -> ov.Tensor:

@numeric.as_numpy_tensor.register
def _(a: ov.Tensor) -> NDArray[Any]:
# Cannot convert bfloat16, uint4, int4, nf4, f8e4m3, f8e5m2 to numpy directly
# Cannot convert bfloat16, uint4, int4, nf4, f4e2m1, f8e8m0, f8e4m3, f8e5m2 to numpy directly
a_dtype = DTYPE_MAP_REV[a.get_element_type()]
if a_dtype in NATIVE_OV_CAST_DTYPES:
dtype = TensorDataType.float32
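Since numpy has no native 4-bit or MX float dtypes, `as_numpy_tensor` decodes these OpenVINO tensors through an intermediate float32 cast. A hedged round-trip sketch using NNCF's Tensor wrapper; the method names follow their usage elsewhere in this diff, and the decoded values depend on OpenVINO's own rounding, so treat it as indicative only:

```python
import numpy as np

from nncf.tensor import Tensor
from nncf.tensor.definitions import TensorDataType

weight = Tensor(np.array([0.4, 1.3, 5.7], dtype=np.float32))

# numpy -> OV-native f4e2m1 tensor -> numpy again (decoded through a float32 cast, as noted above).
ov_f4 = weight.as_openvino_tensor().astype(TensorDataType.f4e2m1)
decoded = ov_f4.as_numpy_tensor()
print(decoded.data)  # e.g. [0.5 1.5 6. ]
```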
9 changes: 9 additions & 0 deletions src/nncf/tensor/tensor.py
@@ -84,6 +84,12 @@ def __len__(self) -> int:

# built-in operations

def __or__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data | unwrap_tensor_data(other))

def __and__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data & unwrap_tensor_data(other))

def __add__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data + unwrap_tensor_data(other))

@@ -144,6 +150,9 @@ def __ifloordiv__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
self._data //= unwrap_tensor_data(other)
return self

def __mod__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return cast(Tensor, _call_function("_binary_op_nowarn", self, other, operator.mod))

def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data @ unwrap_tensor_data(other))

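The new `__or__` and `__and__` operators let boolean masks produced by Tensor comparisons be combined elementwise, which is what the MXFP4 tie-break above relies on. A minimal numpy-backed sketch:

```python
import numpy as np

from nncf.tensor import Tensor

dist_left = Tensor(np.array([0.1, 0.5, 0.5], dtype=np.float32))
dist_right = Tensor(np.array([0.3, 0.5, 0.2], dtype=np.float32))
right_index_is_odd = Tensor(np.array([True, True, False]))

# Elementwise combination of boolean masks, mirroring the MXFP4 rounding tie-break.
choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & right_index_is_odd)
print(choose_right.data)  # [False  True  True]
```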