1 change: 1 addition & 0 deletions docs/Algorithms.md
@@ -13,6 +13,7 @@
- NF4 compression mode
- Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3)
- MX-compliant types - MXFP4 and MXFP8_E4M3
- FP8 type - FP8_E4M3
- Mixed precision weights compression
- Grouped weights compression

@@ -47,6 +47,7 @@ NNCF can automatically distribute precision assignments based on quantization se
| CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values |
| MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://arxiv.org/pdf/2209.05433) |

**Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters.

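As an illustration of the new table row, here is a minimal NumPy sketch (not the NNCF implementation) of how a group-wise FP16 scale for FP8_E4M3 weights could be derived, assuming a 2D weight tensor and the E4M3 maximum magnitude of 448; function and variable names are illustrative only.

```python
# Minimal sketch, not the NNCF implementation: derive group-wise FP16 scales
# for FP8_E4M3 by fitting each group's max magnitude into the E4M3 range.
import numpy as np

E4M3_MAX = 448.0  # largest finite magnitude representable in E4M3

def fp8_e4m3_group_scales(weight: np.ndarray, group_size: int = 128) -> np.ndarray:
    out_ch, in_ch = weight.shape
    assert in_ch % group_size == 0, "input channels must be divisible by group_size"
    grouped = weight.reshape(out_ch, in_ch // group_size, group_size)
    # One scale per group of `group_size` weights along the input-channel axis.
    scales = np.abs(grouped).max(axis=-1, keepdims=True) / E4M3_MAX
    return scales.astype(np.float16)

w = np.random.randn(8, 256).astype(np.float32)
print(fp8_e4m3_group_scales(w).shape)  # -> (8, 2, 1)
```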
2 changes: 2 additions & 0 deletions src/nncf/parameters.py
@@ -92,6 +92,7 @@ class CompressWeightsMode(StrEnum):
:param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead.
:param MXFP4: MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
:param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
:param FP8_E4M3: An FP8 format with E4M3 values sharing a group-level FP16 scale.
:param CODEBOOK: Codebook (LUT) quantization format.
:param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
"""
@@ -105,6 +106,7 @@
INT8 = "int8" # Deprecated mode
MXFP4 = "mxfp4"
MXFP8_E4M3 = "mxfp8_e4m3"
FP8_E4M3 = "fp8_e4m3"
CODEBOOK = "codebook"


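For context, a hedged usage sketch of selecting the new enum member through the public API; `ov_model` is assumed to be an OpenVINO model with weighted layers (it is not defined here), and the keyword values are illustrative.

```python
# Illustrative usage of the new enum member; `ov_model` is assumed to be an
# openvino.Model containing weighted MatMul/Embedding layers.
import nncf
from nncf import CompressWeightsMode

compressed = nncf.compress_weights(
    ov_model,
    mode=CompressWeightsMode.FP8_E4M3,  # E4M3 weights with an FP16 scale
    group_size=128,                     # weights per group sharing one scale
)
```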
@@ -65,6 +65,7 @@
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]
SUPPORTED_DATA_TYPES = [
TensorDataType.float16,
@@ -300,6 +301,7 @@ def __init__(
NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point.
MXFP4 is MX-compliant FP4 with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
FP8_E4M3 is FP8 with E4M3 values sharing group-level FP16 scale.
:param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to backup_mode).
:param group_size: number of weights (e.g. 128) in the channel dimension
@@ -61,6 +61,7 @@ def is_integer(self):
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]
@@ -231,6 +231,8 @@ def _create_compression_subgraph(
elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3:
compression_dtype = ov.Type.f8e4m3
scale_dtype = ov.Type.f8e8m0
elif compression_config.mode == CompressWeightsMode.FP8_E4M3:
compression_dtype = ov.Type.f8e4m3
elif compression_config.mode == CompressWeightsMode.INT4_SYM:
compression_dtype = ov.Type.i4
elif compression_config.mode == CompressWeightsMode.INT4_ASYM:
@@ -440,6 +440,7 @@ def transform_model(
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]:
msg = f"{compression_config.mode.value} is not supported."
raise nncf.ParameterNotSupportedError(msg)
@@ -180,6 +180,7 @@ def transform_model(
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]:
msg = f"{compression_config.mode.value} is not supported."
raise nncf.ParameterNotSupportedError(msg)
@@ -81,7 +81,7 @@ def calculate_float_quantization_params(
weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig
) -> Tensor:
"""
Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization.
Calculates the scale for nf4 or mxfp4/mxfp8_e4m3/fp8_e4m3 quantization.
:param weight: Weight array to compress.
:param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max).
@@ -97,6 +97,7 @@
FP_MAX_VALS = {
CompressWeightsMode.MXFP4: 6.0,
CompressWeightsMode.MXFP8_E4M3: 448.0,
CompressWeightsMode.FP8_E4M3: 448.0,
}
if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()):
if config.mode in FP_MAX_VALS:
@@ -146,17 +147,17 @@ def do_float_quantization(
) -> tuple[Tensor, Tensor, Tensor]:
"""
Computes quantization scale if not provided,
and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization.
and performs corresponding (nf4, MXFP4, MXFP8_E4M3, FP8_E4M3) weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
For MXFP4, MXFP8_E4M3, FP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3 and FP8_E4M3 once ticket 164851 is resolved
:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and
optional indexes for codebook.
:return: Returns quantized (for MXFP4, MXFP8_E4M3 and FP8_E4M3 normalized) weight tensor and
corresponding scale tensor and optional indexes for codebook.
"""
assert not config.is_integer

@@ -192,7 +193,7 @@ def do_float_quantization(
)
return compressed_weight, scale, indexes
else:
# TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
# TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3, FP8_E4M3 once ticket 164851 is resolved
compressed_weight = norm_weight
return compressed_weight, scale, None

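A simplified NumPy sketch of the behavior the docstrings above describe for FP8_E4M3: the scale is the per-group max magnitude divided by 448, and the normalized weight is returned without casting, per the TODO referencing ticket 164851. This is an approximation, not the NNCF code.

```python
# Simplified sketch, not the NNCF code: FP8_E4M3 float-quantization path.
import numpy as np

E4M3_MAX = 448.0  # FP_MAX_VALS value for FP8_E4M3 above

def float_quantize_fp8_e4m3(weight: np.ndarray, reduction_axes=(-1,)):
    # Scale so that each group's max magnitude maps to the E4M3 maximum.
    scale = np.abs(weight).max(axis=reduction_axes, keepdims=True) / E4M3_MAX
    scale = np.where(scale == 0.0, 1.0, scale)  # illustrative guard for all-zero groups
    # The weight is only normalized here; casting to f8e4m3 is left to the backend.
    return weight / scale, scale.astype(np.float16)

w = np.random.randn(4, 2, 32).astype(np.float32)  # (out_ch, n_groups, group_size)
norm_w, scale = float_quantize_fp8_e4m3(w)
print(norm_w.shape, scale.shape)  # (4, 2, 32) (4, 2, 1)
```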
51 changes: 34 additions & 17 deletions src/nncf/quantization/quantize_model.py
@@ -460,6 +460,7 @@ def compress_weights(
MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale.
The size of group is 32.
FP8_E4M3 - is an FP8 format with E4M3 values sharing a group-level FP16 scale.
:type mode: nncf.CompressWeightsMode
:param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to INT8_ASYM).
@@ -517,14 +518,18 @@ def compress_weights(
from nncf.torch.nncf_network import NNCFNetwork
from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl

if mode in [
not_supported_modes = [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]:
msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
]
if mode in not_supported_modes:
msg = (
f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
)
raise nncf.ParameterNotSupportedError(msg)

options = {"gptq": gptq, "lora_correction": lora_correction}
@@ -567,14 +572,18 @@ def compress_weights(
compress_weights_impl as fx_compression_weights_impl,
)

if mode in [
not_supported_modes = [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]:
msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
]
if mode in not_supported_modes:
msg = (
f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
)
raise nncf.ParameterNotSupportedError(msg)

options = {
@@ -610,14 +619,18 @@ def compress_weights(
msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None."
raise nncf.ParameterNotSupportedError(msg)

if any((awq, scale_estimation, gptq, lora_correction)) and mode in [
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
]:
msg = (
"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E4M3]."
)
raise nncf.ParameterNotSupportedError(msg)
if any((awq, scale_estimation, gptq, lora_correction)):
not_supported_modes = [
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]
if mode in not_supported_modes:
msg = (
"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined,"
f" but mode in {[m.value for m in not_supported_modes]}."
)
raise nncf.ParameterNotSupportedError(msg)

if gptq and lora_correction:
msg = "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them."
@@ -632,14 +645,18 @@ def compress_weights(
elif backend == BackendType.ONNX:
from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl

if mode in [
not_supported_modes = [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]:
msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
]
if mode in not_supported_modes:
msg = (
f"ONNX backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
)
raise nncf.ParameterNotSupportedError(msg)

options = {
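A hedged illustration of the extended guard above: combining a data-aware algorithm such as AWQ with the FP8_E4M3 mode is rejected before compression starts. `ov_model` and `calibration_dataset` are assumed to exist and are not defined here.

```python
# Illustrative only: the new FP8_E4M3 entry makes this combination raise.
import nncf

try:
    nncf.compress_weights(
        ov_model,
        mode=nncf.CompressWeightsMode.FP8_E4M3,
        awq=True,
        dataset=calibration_dataset,
    )
except nncf.ParameterNotSupportedError as err:
    print(err)  # "... but mode in ['mxfp4', 'mxfp8_e4m3', 'fp8_e4m3']."
```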
26 changes: 26 additions & 0 deletions tests/openvino/conftest.py
@@ -9,20 +9,46 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
from pathlib import Path

import pytest
from pytest import Config

from nncf import set_log_level
from tests.cross_fw.shared.case_collection import COMMON_SCOPE_MARKS_VS_OPTIONS
from tests.cross_fw.shared.case_collection import skip_marked_cases_if_options_not_specified
from tests.cross_fw.shared.install_fixtures import tmp_venv_with_nncf # noqa: F401
from tests.cross_fw.shared.paths import TEST_ROOT


def pytest_addoption(parser):
parser.addoption(
"--regen-ref-data",
action="store_true",
default=False,
help="If specified, the reference files will be regenerated using the current state of the repository.",
)
parser.addoption(
"--nncf-debug",
action="store_true",
default=False,
help="Set debug level for nncf logger.",
)
parser.addoption("--data", type=str, default=None, help="Directory path to cached data.")


def pytest_configure(config: Config) -> None:
regen_dot = config.getoption("--regen-ref-data", False)
if regen_dot:
os.environ["NNCF_TEST_REGEN_DOT"] = "1"

nncf_debug = config.getoption("--nncf-debug", False)
if nncf_debug:
set_log_level(logging.DEBUG)


@pytest.fixture(name="data_dir")
def data(request):
option = request.config.getoption("--data")
@@ -1,5 +1,33 @@
{
"matmul_2_data": {
"compressed_weight": [
55,
249,
21,
56,
162,
197,
244,
38,
251,
248,
185,
255,
207,
223,
234,
216,
253,
178,
254,
208,
255
],
"zero_point": [
0,
0,
0
],
"scale": [
[
[
@@ -1,5 +1,28 @@
{
"matmul_2_data": {
"compressed_weight": [
-19,
-117,
-2,
-20,
-65,
-83,
-114,
-3,
-117,
-116,
-36,
-117,
-56,
-54,
-101,
-84,
-118,
-81,
-120,
-112,
-120
],
"scale": [
[
[
@@ -1,5 +1,28 @@
{
"matmul_2_data": {
"compressed_weight": [
156,
253,
138,
173,
216,
235,
250,
139,
254,
252,
205,
253,
207,
206,
237,
236,
254,
233,
255,
247,
255
],
"scale": [
[
[
@@ -1,5 +1,23 @@
{
"matmul_2_data": {
"compressed_weight": [
198,
194,
191,
187,
184,
178,
170,
0,
41,
49,
54,
57,
60,
64,
66,
70
],
"scale": [
[
[