Commits
27 commits
a42f1eb
WIP
nikita-savelyevv Jun 17, 2025
17a3aec
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Jun 23, 2025
b2e090c
Add round to nearest logic for numpy case
nikita-savelyevv Jul 22, 2025
66c0366
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Jul 29, 2025
a345984
Tweaks
nikita-savelyevv Jul 29, 2025
6e3ba6e
Temporarily install OV nightly
nikita-savelyevv Jul 29, 2025
7555794
Update src/nncf/version.py
nikita-savelyevv Aug 21, 2025
999d54f
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Sep 3, 2025
83770e1
[OpenVINO][WC] E5M2 and E4M3 FP8 weights compression support
daniil-lyakhov Sep 22, 2025
8054217
MXFP4/MXFP8_E4M3
daniil-lyakhov Sep 30, 2025
3d944de
Expand wc docs with a table
daniil-lyakhov Oct 1, 2025
0c48792
Codebook is removed from wc docs
daniil-lyakhov Oct 1, 2025
ac2f05c
Type
daniil-lyakhov Oct 1, 2025
1e23ecf
Apply suggestions from code review
daniil-lyakhov Oct 2, 2025
33aae33
Typos/pre-commit
daniil-lyakhov Oct 2, 2025
e4d47ab
Fix adjust group size
daniil-lyakhov Oct 7, 2025
2aaec38
Revert "Fix adjust group size"
daniil-lyakhov Oct 8, 2025
ab6aa74
Fail for MX with adjust fallback mode
daniil-lyakhov Oct 8, 2025
a25b5c3
Update src/nncf/quantization/algorithms/weight_compression/weight_low…
daniil-lyakhov Oct 8, 2025
a64b30c
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
83d09fc
Merge branch 'dl/FP8' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
c227bac
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
c026573
Revert nightly installation
nikita-savelyevv Oct 8, 2025
a000f87
Post-merge fixes
nikita-savelyevv Oct 8, 2025
831bf25
Post-merge fixes part 2
nikita-savelyevv Oct 8, 2025
344b94b
Increase test weight channel size
nikita-savelyevv Oct 8, 2025
089631a
Address suggested changes
nikita-savelyevv Oct 13, 2025
2 changes: 2 additions & 0 deletions .github/workflows/call_precommit.yml
@@ -91,6 +91,8 @@ jobs:
shell: bash
- name: Install NNCF and test requirements
run: pip install . -r tests/openvino/requirements.txt
- name: Install OpenVINO nightly
run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
- name: Print installed modules
run: pip list
- name: Run OV precommit test scope
2 changes: 2 additions & 0 deletions src/nncf/openvino/graph/nncf_graph_builder.py
@@ -44,6 +44,8 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype:
type_name = ov_type.get_type_name()
conversion_map = {
"nf4": "float",
"f4e2m1": "float",
"f8e8m0": "float",
"f8e4m3": "float",
"f8e5m2": "float",
"f16": "float",
13 changes: 7 additions & 6 deletions src/nncf/openvino/optimized_functions/functions.py
@@ -107,17 +107,17 @@ def do_float_quantization(
precomputed_scale: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor, Tensor]:
"""
Computes quantization scale if not provided, and performs corresponding nf4 weight quantization.
Computes quantization scale if not provided, and performs corresponding float weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved
For E2M1 quantization quantizes the weights to 16 levels on [-6, 6] interval.

:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor.
:return: Returns quantized weight tensor and corresponding scale tensor.
"""
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]

weight_shape = weight.shape
scale_shape = None if precomputed_scale is None else precomputed_scale.shape
@@ -129,7 +129,8 @@ def do_float_quantization(
if weight.backend == TensorBackend.ov:
# Return ov tensors in target precision to seamlessly insert them into openvino model later
ov_model_params.return_ov_tensors = True
ov_model_params.output_dtypes.update({"compressed_weight": TensorDataType.nf4})
weight_dtype = TensorDataType.f4e2m1 if config.mode == CompressWeightsMode.E2M1 else TensorDataType.nf4
ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype})

model = get_float_quantization_model(
ov_model_params,
@@ -235,7 +236,7 @@ def float_quantize_dequantize_weight(
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
"""
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]

# When reduction axes are not provided, assuming that the weights are already reshaped
if config.group_size != -1 and reduction_axes is not None:
18 changes: 13 additions & 5 deletions src/nncf/openvino/optimized_functions/models.py
@@ -284,7 +284,8 @@ def get_float_quantization_model(
reduction_axes: Optional[ReductionAxes] = None,
) -> Union[ModelCallable, ModelAsNodes]:
"""
Get a model that compresses weights to float (currently only nf4) destination type using the given configuration.
Get a model that compresses weights to float (currently nf4 or f4e2m1) destination type using the given
configuration.

:param ov_model_params: OV model parameters.
:param config: Compression configuration.
@@ -569,7 +570,7 @@ def _build_float_quantization_model(
reduction_axes: Optional[ReductionAxes] = None,
return_nodes: bool = False,
) -> Union[ModelCallable, ModelAsNodes]:
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]

default_input_dtypes = {"scale": TensorDataType.float32}
default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
@@ -595,8 +596,7 @@
)

# Validate output dtypes
# TODO: add support for f4e2m1 once ticket 164851 is resolved
valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4]
valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1]
if compressed_weight_dtype not in valid_compressed_weight_dtypes:
msg = (
f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
@@ -624,8 +624,16 @@
eps = np.finfo(np.float32).eps
scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)

if config.mode == CompressWeightsMode.E2M1:
scale = scale / opset.constant(6.0, ov.Type.f32)
scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32))
scale = opset.ceil(scale)
scale = opset.clamp(scale, -127.0, 127.0)
scale = opset.power(opset.constant(2.0, ov.Type.f32), scale)

compressed_weight = divide_op(weight, scale)
compressed_weight = convert_op(compressed_weight, ov.Type.nf4)
target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1
compressed_weight = convert_op(compressed_weight, target_dtype)
compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])

ov_results = [compressed_weight]
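For reference, a minimal NumPy sketch of the E2M1 scale handling built above (illustrative only; the helper name and standalone form are not part of the PR): the per-group scale is divided by 6.0, the largest representable E2M1 magnitude, its exponent is rounded up and clamped to the E8M0 range, and the result is a power-of-two shared scale.

import numpy as np

def e2m1_shared_scale(group_abs_max: np.ndarray) -> np.ndarray:
    # group_abs_max: per-group maximum absolute weight value, float32.
    eps = np.finfo(np.float32).eps
    scale = np.where(np.abs(group_abs_max) < eps, eps, group_abs_max)
    # Fit the scaled weights into the representable E2M1 range [-6, 6].
    scale = scale / 6.0
    # Round the exponent up and clamp it to the E8M0 range [-127, 127].
    exponent = np.clip(np.ceil(np.log2(scale)), -127.0, 127.0)
    return np.power(2.0, exponent).astype(np.float32)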
src/nncf/quantization/algorithms/weight_compression/constants.py
@@ -33,6 +33,27 @@
dtype=np.float32,
)

E2M1_QUANTILES = np.array(
[
-6.0,
-4.0,
-3.0,
-2.0,
-1.5,
-1.0,
-0.5,
-0.0,
0.5,
1.0,
1.5,
2.0,
3.0,
4.0,
6.0,
],
dtype=np.float32,
)


CB4_QUANTILES = np.array(
[
@@ -77,3 +98,6 @@
],
dtype=np.float32,
)


CENTER_OF_E2M1_QUANTILES = (E2M1_QUANTILES[1:] + E2M1_QUANTILES[:-1]) / 2
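A hedged illustration of how these tables are used downstream (the helper name is made up): searching the midpoints gives, for every already-scaled weight value, the index of the closest representable E2M1 value; exact midpoint ties are resolved separately by the round-to-nearest-even logic in `_calculate_float_quantized_weight` shown further below.

import numpy as np

E2M1_QUANTILES = np.array(
    [-6.0, -4.0, -3.0, -2.0, -1.5, -1.0, -0.5, -0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0],
    dtype=np.float32,
)
CENTER_OF_E2M1_QUANTILES = (E2M1_QUANTILES[1:] + E2M1_QUANTILES[:-1]) / 2

def snap_to_e2m1(norm_weight: np.ndarray) -> np.ndarray:
    # Index of the nearest E2M1 quantile for each normalized weight value.
    indexes = np.searchsorted(CENTER_OF_E2M1_QUANTILES, norm_weight)
    return E2M1_QUANTILES[indexes]

snap_to_e2m1(np.array([0.7, -2.4, 5.3], dtype=np.float32))  # nearest E2M1 values: 0.5, -2.0, 6.0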
src/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -19,7 +19,9 @@
from nncf.errors import UnsupportedModelError
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_E2M1_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import E2M1_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES
from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight
from nncf.quantization.fake_quantize import calculate_scale_zero_point
@@ -136,14 +138,14 @@ def do_float_quantization(
"""
Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
For E2M1 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved
For E2M1 quantization quantizes the weights to 16 levels on [-6, 6] interval.
For CODEBOOK currently returns normalized weight without quantization.

:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and
:return: Returns quantized (for codebook normalized) weight tensor and corresponding scale tensor and
optional indexes for codebook.
"""
assert not config.is_integer
@@ -153,7 +155,7 @@ def do_float_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov

return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -168,20 +170,18 @@ def do_float_quantization(
if scale is None:
scale = calculate_float_quantization_params(weight, reduction_axes, config)
norm_weight = _calculate_normalized_weight(weight, scale)
if config.mode == CompressWeightsMode.NF4:
if original_weight_backend == TensorBackend.ov:
# Can convert through OpenVINO and return OpenVINO-native NF4 tensor
compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4)
else:
compressed_weight = _calculate_nf4_quantized_weight(norm_weight)
elif config.is_codebook:
if config.is_codebook:
compressed_weight, indexes = _calculate_codebook_quantized_weight(
norm_weight, quantiles=config.get_numpy_codebook()
)
return compressed_weight, scale, indexes

if original_weight_backend == TensorBackend.ov:
# Can convert through OpenVINO and return OpenVINO-native tensor
target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype)
else:
# TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved
compressed_weight = norm_weight
compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode)
return compressed_weight, scale, None


@@ -193,8 +193,7 @@ def float_quantize_dequantize_weight(
return_compressed_weight: Optional[bool] = False,
) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]:
"""
First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values.
E2M1 mode is currently not supported.
First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values.

:param weight: The weight tensor to quantize-dequantize.
:param config: Compression configuration.
@@ -203,11 +202,15 @@ def float_quantize_dequantize_weight(
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
"""
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]
# TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved
assert config.mode in [
CompressWeightsMode.NF4,
CompressWeightsMode.E2M1,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]

# Optimized implementation
if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
)
@@ -496,17 +499,30 @@ def integer_quantize_dequantize_weight(
return decompressed_weight


def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor:
def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeightsMode) -> Tensor:
"""
Performs NF4 quantization. Look-up table is used to "round" or "quantize" to the closest quant.
Performs float (currently NF4 or F4E2M1) quantization. Look-up table is used to "round" or "quantize" to the
closest quant.

:param norm_weight: Weight tensor to quantize already normalized to [-1, 1] range.
:return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants on [-1, 1].
:param norm_weight: Normalized weight tensor to quantize.
:return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants.
"""
center_nf4_quantiles = fns.from_numpy(CENTER_OF_NF4_QUANTILES, backend=norm_weight.backend)
indexes = fns.searchsorted(center_nf4_quantiles, norm_weight)
nf4_quantiles = fns.from_numpy(NF4_QUANTILES, backend=indexes.backend)
quantized_weight = nf4_quantiles[indexes]
assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]
quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else E2M1_QUANTILES
quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_E2M1_QUANTILES
quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend)
indexes = fns.searchsorted(quantile_centers, norm_weight)
quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend)

if mode == CompressWeightsMode.E2M1:
# If in-between two quantiles, round to the nearest even quantile.
shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1)
dist_left = fns.abs(norm_weight - quantiles[indexes])
dist_right = fns.abs(norm_weight - quantiles[shifted_indexes])
choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((shifted_indexes + 1) % 2 == 0))
indexes = fns.where(choose_right, shifted_indexes, indexes)

quantized_weight = quantiles[indexes]
return quantized_weight


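A compact NumPy sketch of the tie-breaking added in `_calculate_float_quantized_weight` above (the standalone form and names are illustrative): the index from `searchsorted` is compared with its right neighbour, and on an exact midpoint tie the PR's parity rule decides which quantile to keep.

import numpy as np

def round_to_e2m1(norm_weight, quantiles, quantile_centers):
    # Candidate index of the nearest quantile via the precomputed midpoints.
    indexes = np.searchsorted(quantile_centers, norm_weight)
    shifted = np.clip(indexes + 1, 0, quantiles.size - 1)
    dist_left = np.abs(norm_weight - quantiles[indexes])
    dist_right = np.abs(norm_weight - quantiles[shifted])
    # Move to the right neighbour if it is strictly closer, or on an exact
    # tie when the parity condition from the PR selects it ("round to even").
    choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((shifted + 1) % 2 == 0))
    return quantiles[np.where(choose_right, shifted, indexes)]

With the E2M1 tables from constants.py, a value exactly halfway between two quantiles, such as 0.75, resolves to 1.0 under this rule, whereas plain `searchsorted` alone would keep 0.5.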
2 changes: 2 additions & 0 deletions src/nncf/tensor/definitions.py
@@ -44,6 +44,8 @@ class TensorDataType(StrEnum):
float64 = auto()
f8e4m3 = auto()
f8e5m2 = auto()
f8e8m0 = auto()
f4e2m1 = auto()
nf4 = auto()
int8 = auto()
int32 = auto()
6 changes: 5 additions & 1 deletion src/nncf/tensor/functions/openvino_numeric.py
@@ -23,6 +23,8 @@

DTYPE_MAP: dict[TensorDataType, ov.Type] = {
TensorDataType.nf4: ov.Type.nf4,
TensorDataType.f4e2m1: ov.Type.f4e2m1,
TensorDataType.f8e8m0: ov.Type.f8e8m0,
TensorDataType.f8e4m3: ov.Type.f8e4m3,
TensorDataType.f8e5m2: ov.Type.f8e5m2,
TensorDataType.float16: ov.Type.f16,
@@ -42,6 +44,8 @@
TensorDataType.int4,
TensorDataType.uint4,
TensorDataType.nf4,
TensorDataType.f4e2m1,
TensorDataType.f8e8m0,
TensorDataType.f8e4m3,
TensorDataType.f8e5m2,
]
@@ -95,7 +99,7 @@ def _(a: ov.Tensor, shape: Union[int, tuple[int, ...]]) -> ov.Tensor:

@numeric.as_numpy_tensor.register
def _(a: ov.Tensor) -> NDArray[Any]:
# Cannot convert bfloat16, uint4, int4, nf4, f8e4m3, f8e5m2 to numpy directly
# Cannot convert bfloat16, uint4, int4, nf4, f4e2m1, f8e8m0, f8e4m3, f8e5m2 to numpy directly
a_dtype = DTYPE_MAP_REV[a.get_element_type()]
if a_dtype in NATIVE_OV_CAST_DTYPES:
dtype = TensorDataType.float32
9 changes: 9 additions & 0 deletions src/nncf/tensor/tensor.py
@@ -84,6 +84,12 @@ def __len__(self) -> int:

# built-in operations

def __or__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data | unwrap_tensor_data(other))

def __and__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data & unwrap_tensor_data(other))

def __add__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data + unwrap_tensor_data(other))

@@ -144,6 +150,9 @@ def __ifloordiv__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
self._data //= unwrap_tensor_data(other)
return self

def __mod__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return cast(Tensor, _call_function("_binary_op_nowarn", self, other, operator.mod))

def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data @ unwrap_tensor_data(other))

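A small usage sketch for the operators added above (assuming the NumPy backend and the public `nncf.tensor.Tensor` import path; illustrative, not taken from the PR):

import numpy as np
from nncf.tensor import Tensor

flags_a = Tensor(np.array([True, False]))
flags_b = Tensor(np.array([True, True]))

print((flags_a | flags_b).data)  # [ True  True]
print((flags_a & flags_b).data)  # [ True False]
print((Tensor(np.array([5, 7])) % 3).data)  # [2 1]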
2 changes: 1 addition & 1 deletion src/nncf/version.py
@@ -9,7 +9,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "2.18.0"
__version__ = "2.18.0.dev0+66c0366aedirty"


BKC_TORCH_SPEC = "==2.7.*"
25 changes: 25 additions & 0 deletions tests/cross_fw/test_templates/template_test_nncf_tensor.py
@@ -48,6 +48,8 @@
}
BINARY_OPERATORS = ["add", "sub", "pow", "mul", "truediv", "floordiv"]

BOOLEAN_OPERATOR_MAP = {"and": operator.and_, "or": operator.or_}

COMPARISON_OPERATOR_MAP = {
"lt": operator.lt,
"le": operator.le,
@@ -98,6 +100,25 @@ def test_operator_clone(self):
assert id(tensor_a.data) is not id(tensor_b.data)
assert all(tensor_a == tensor_b)

@pytest.mark.parametrize("op_name", BOOLEAN_OPERATOR_MAP.keys())
@pytest.mark.parametrize("value", [True, False])
def test_operators_bool(self, op_name, value):
tensor_a = self.to_tensor([True, False])

nncf_tensor_a = Tensor(tensor_a)

fn = BOOLEAN_OPERATOR_MAP[op_name]
res = fn(tensor_a, value)
res_nncf = fn(nncf_tensor_a, value)

assert res.dtype == res_nncf.data.dtype
assert all(res == res_nncf.data)
assert isinstance(res_nncf, Tensor)
if (
self.backend() != TensorBackend.tf
): # native Tensorflow operaors do not guarantee to return a tensor on an initial device.
Review comment (Contributor). Suggested change:
): # native Tensorflow operaors do not guarantee to return a tensor on an initial device.
): # native Tensorflow operators do not guarantee to return a tensor on an initial device.

Reply (Collaborator, Author): Done
assert res_nncf.device == nncf_tensor_a.device

@pytest.mark.parametrize("op_name", OPERATOR_MAP.keys())
def test_operators_tensor(self, op_name):
tensor_a = self.to_tensor([1.0, 2.0])
@@ -1982,6 +2003,8 @@ def test_fn_zeros(self):
TensorDataType.int4,
TensorDataType.uint4,
TensorDataType.nf4,
TensorDataType.f4e2m1,
TensorDataType.f8e8m0,
TensorDataType.f8e4m3,
TensorDataType.f8e5m2,
]
@@ -2014,6 +2037,8 @@ def test_fn_eye(self, n, m, ref):
TensorDataType.int4,
TensorDataType.uint4,
TensorDataType.nf4,
TensorDataType.f4e2m1,
TensorDataType.f8e8m0,
TensorDataType.f8e4m3,
TensorDataType.f8e5m2,
]