diff --git a/docs/Algorithms.md b/docs/Algorithms.md index 6c5cc0e9f99..5f36537d889 100644 --- a/docs/Algorithms.md +++ b/docs/Algorithms.md @@ -13,6 +13,7 @@ - NF4 compression mode - Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3) - MX-compliant types - MXFP4 and MXFP8_E4M3 + - FP8 type - FP8_E4M3 - Mixed precision weights compression - Grouped weights compression diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index 88ee3df7835..570c94af2a2 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -47,6 +47,7 @@ NNCF can automatically distribute precision assignments based on quantization se | CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values | | MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) | | MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) | +| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://arxiv.org/pdf/2209.05433) | **Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters. diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 21756ba32d1..37eb787aca0 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -92,6 +92,7 @@ class CompressWeightsMode(StrEnum): :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param MXFP4: MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32. :param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32. + :param FP8_E4M3: A FP8 format with E4M3 values sharing group-level fp16 scale. :param CODEBOOK: Codebook (LUT) quantization format. :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. """ @@ -105,6 +106,7 @@ class CompressWeightsMode(StrEnum): INT8 = "int8" # Deprecated mode MXFP4 = "mxfp4" MXFP8_E4M3 = "mxfp8_e4m3" + FP8_E4M3 = "fp8_e4m3" CODEBOOK = "codebook" diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 50a5399a5c8..05ac27c34c1 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -65,6 +65,7 @@ CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, ] SUPPORTED_DATA_TYPES = [ TensorDataType.float16, @@ -300,6 +301,7 @@ def __init__( NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. MXFP4 is MX-compliant FP4 with E2M1 values sharing group-level E8M0 scale. The size of group is 32. MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32. + FP8_E4M3 is FP8 with E4M3 values sharing group-level FP16 scale. :param ratio: the ratio between primary and backup precisions (e.g. 
0.9 means 90% of layers quantized to NF4 and the rest to backup_mode). :param group_size: number of weights (e.g. 128) in the channel dimension diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index b7c5f032f03..679df25272d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -61,6 +61,7 @@ def is_integer(self): CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ] diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index ca6f83573c7..9e81059fd17 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -231,6 +231,8 @@ def _create_compression_subgraph( elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3: compression_dtype = ov.Type.f8e4m3 scale_dtype = ov.Type.f8e8m0 + elif compression_config.mode == CompressWeightsMode.FP8_E4M3: + compression_dtype = ov.Type.f8e4m3 elif compression_config.mode == CompressWeightsMode.INT4_SYM: compression_dtype = ov.Type.i4 elif compression_config.mode == CompressWeightsMode.INT4_ASYM: diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index 46f3361842e..78c7feadb84 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -440,6 +440,7 @@ def transform_model( CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, ]: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 61fc28e005f..122f649b010 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -180,6 +180,7 @@ def transform_model( CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, ]: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0e0783cf468..e386cf35c22 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -81,7 +81,7 @@ def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ - Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization. + Calculates the scale for nf4 or mxfp4/mxfp8_e4m3/fp8_e4m3 quantization. :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). 
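For context on the weight_lowering.py changes: the FP8_E4M3 branch reuses the existing float-quantization path, adding 448.0 (the largest finite E4M3 magnitude) to FP_MAX_VALS in the next hunk while keeping a plain FP16 group scale instead of the E8M0 scale used by the MX formats. A minimal NumPy sketch of what that scale computation amounts to, assuming the weight is already reshaped into groups (the zero-group guard is my own assumption, not copied from NNCF):

```python
import numpy as np

FP8_E4M3_MAX = 448.0  # largest finite E4M3 magnitude, mirrored by FP_MAX_VALS in the hunk below

def fp8_e4m3_scale(weight: np.ndarray, reduction_axes=(-1,)) -> np.ndarray:
    # Group-level scale chosen so the largest |w| in each group maps onto 448.
    max_abs = np.max(np.abs(weight), axis=reduction_axes, keepdims=True)
    scale = max_abs / FP8_E4M3_MAX
    scale = np.where(scale == 0.0, 1.0, scale)  # guard all-zero groups (assumption)
    return scale.astype(np.float16)

# Weight grouped as [out_channels, n_groups, group_size].
w = np.random.randn(64, 2, 128).astype(np.float32)
norm_w = w / fp8_e4m3_scale(w)  # |norm_w| <= 448; the E4M3 cast itself happens in the backend subgraph
```

As the TODO in do_float_quantization notes, the Python side currently returns this normalized weight; the cast to f8e4m3 is performed by the constant in the OpenVINO compression subgraph.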
@@ -97,6 +97,7 @@ def calculate_float_quantization_params( FP_MAX_VALS = { CompressWeightsMode.MXFP4: 6.0, CompressWeightsMode.MXFP8_E4M3: 448.0, + CompressWeightsMode.FP8_E4M3: 448.0, } if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()): if config.mode in FP_MAX_VALS: @@ -146,17 +147,17 @@ def do_float_quantization( ) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, - and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization. + and performs corresponding (nf4, MXFP4, MXFP8_E4M3, FP8_E4M3) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. - TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved + For MXFP4, MXFP8_E4M3, FP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. + TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3 and FP8_E4M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and - optional indexes for codebook. + :return: Returns quantized (for MXFP4, MXFP8_E4M3 and FP8_E4M3 normalized) weight tensor and + corresponding scale tensor and optional indexes for codebook. """ assert not config.is_integer @@ -192,7 +193,7 @@ def do_float_quantization( ) return compressed_weight, scale, indexes else: - # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3, FP8_E4M3 once ticket 164851 is resolved compressed_weight = norm_weight return compressed_weight, scale, None diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index 8d89bd19821..d32827346a3 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -460,6 +460,7 @@ def compress_weights( MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32. MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale. The size of group is 32. + FP8_E4M3 - is FP8 format with E4M3 values sharing a group-level FP16 scale. :type mode: nncf.CompressWeightsMode :param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to INT8_ASYM). @@ -517,14 +518,18 @@ def compress_weights( from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl - if mode in [ + not_supported_modes = [ CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, - ]: - msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression." + ] + if mode in not_supported_modes: + msg = ( + f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression." 
+ ) raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -567,14 +572,18 @@ def compress_weights( compress_weights_impl as fx_compression_weights_impl, ) - if mode in [ + not_supported_modes = [ CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, - ]: - msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression." + ] + if mode in not_supported_modes: + msg = ( + f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression." + ) raise nncf.ParameterNotSupportedError(msg) options = { @@ -610,14 +619,18 @@ def compress_weights( msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) - if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ - CompressWeightsMode.MXFP4, - CompressWeightsMode.MXFP8_E4M3, - ]: - msg = ( - "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E4M3]." - ) - raise nncf.ParameterNotSupportedError(msg) + if any((awq, scale_estimation, gptq, lora_correction)): + not_supported_modes = [ + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, + ] + if mode in not_supported_modes: + msg = ( + "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined," + f" but mode in {[m.value for m in not_supported_modes]}." + ) + raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: msg = "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them." @@ -632,14 +645,18 @@ def compress_weights( elif backend == BackendType.ONNX: from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl - if mode in [ + not_supported_modes = [ CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, - ]: - msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression." + ] + if mode in not_supported_modes: + msg = ( + f"ONNX backend does not support {[m.value for m in not_supported_modes]} modes for weight compression." + ) raise nncf.ParameterNotSupportedError(msg) options = { diff --git a/tests/openvino/conftest.py b/tests/openvino/conftest.py index e7bb4c772b9..1d0ccf6564f 100644 --- a/tests/openvino/conftest.py +++ b/tests/openvino/conftest.py @@ -9,10 +9,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
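For context on the compress_weights changes above: FP8_E4M3 joins the float formats, but only the OpenVINO backend builds the f8e4m3 subgraph; the Torch, Torch FX and ONNX branches, as well as the AWQ / scale-estimation / GPTQ / LoRA-correction path, reject it with ParameterNotSupportedError. A short usage sketch under those constraints (the model path is a placeholder; group_size=128 matches the default the tests below check for):

```python
import openvino as ov

import nncf
from nncf import CompressWeightsMode

model = ov.Core().read_model("model.xml")  # placeholder path

# Data-free FP8_E4M3 weight compression: E4M3 values share a group-level FP16
# scale, and unlike MXFP4/MXFP8_E4M3 the group size is user-configurable.
compressed = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.FP8_E4M3,
    group_size=128,
)
ov.save_model(compressed, "model_fp8_e4m3.xml")
```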
+import logging +import os from pathlib import Path import pytest +from pytest import Config +from nncf import set_log_level from tests.cross_fw.shared.case_collection import COMMON_SCOPE_MARKS_VS_OPTIONS from tests.cross_fw.shared.case_collection import skip_marked_cases_if_options_not_specified from tests.cross_fw.shared.install_fixtures import tmp_venv_with_nncf # noqa: F401 @@ -20,9 +24,31 @@ def pytest_addoption(parser): + parser.addoption( + "--regen-ref-data", + action="store_true", + default=False, + help="If specified, the reference files will be regenerated using the current state of the repository.", + ) + parser.addoption( + "--nncf-debug", + action="store_true", + default=False, + help="Set debug level for nncf logger.", + ) parser.addoption("--data", type=str, default=None, help="Directory path to cached data.") +def pytest_configure(config: Config) -> None: + regen_dot = config.getoption("--regen-ref-data", False) + if regen_dot: + os.environ["NNCF_TEST_REGEN_DOT"] = "1" + + nncf_debug = config.getoption("--nncf-debug", False) + if nncf_debug: + set_log_level(logging.DEBUG) + + @pytest.fixture(name="data_dir") def data(request): option = request.config.getoption("--data") diff --git a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json index c3eb015c607..dcae54169cd 100644 --- a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json +++ b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_asym.json @@ -1,5 +1,33 @@ { "matmul_2_data": { + "compressed_weight": [ + 55, + 249, + 21, + 56, + 162, + 197, + 244, + 38, + 251, + 248, + 185, + 255, + 207, + 223, + 234, + 216, + 253, + 178, + 254, + 208, + 255 + ], + "zero_point": [ + 0, + 0, + 0 + ], "scale": [ [ [ diff --git a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_sym.json b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_sym.json index 17c9ea559e7..a3a24acbac5 100644 --- a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_sym.json +++ b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int4_sym.json @@ -1,5 +1,28 @@ { "matmul_2_data": { + "compressed_weight": [ + -19, + -117, + -2, + -20, + -65, + -83, + -114, + -3, + -117, + -116, + -36, + -117, + -56, + -54, + -101, + -84, + -118, + -81, + -120, + -112, + -120 + ], "scale": [ [ [ diff --git a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_nf4.json b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_nf4.json index d825556d1d2..64cf16d8061 100644 --- a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_nf4.json +++ b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_nf4.json @@ -1,5 +1,28 @@ { "matmul_2_data": { + "compressed_weight": [ + 156, + 253, + 138, + 173, + 216, + 235, + 250, + 139, + 254, + 252, + 205, + 253, + 207, + 206, + 237, + 236, + 254, + 233, + 255, + 247, + 255 + ], "scale": [ [ [ diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json index b8712bf3839..fd0e9324603 100644 --- 
a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_cb4_f8e4m3.json @@ -1,5 +1,23 @@ { "matmul_2_data": { + "compressed_weight": [ + 198, + 194, + 191, + 187, + 184, + 178, + 170, + 0, + 41, + 49, + 54, + 57, + 60, + 64, + 66, + 70 + ], "scale": [ [ [ diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_fp8_e4m3.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_fp8_e4m3.json new file mode 100644 index 00000000000..5be9febb54e --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_fp8_e4m3.json @@ -0,0 +1,396 @@ +{ + "matmul_2_data": { + "compressed_weight": [ + [ + [ + 113, + 104, + 117, + 123, + 109, + 92, + 115, + 107, + 98, + 120, + 112, + 121, + 107, + 125, + 114, + 100, + 121, + 125, + 116, + 125, + 118, + 116, + 121, + 126, + 125, + 117, + 123, + 118, + 119, + 123, + 116, + 122 + ] + ], + [ + [ + 122, + 125, + 101, + 122, + 125, + 126, + 77, + 124, + 126, + 126, + 104, + 126, + 125, + 124, + 118, + 109, + 123, + 125, + 111, + 119, + 117, + 125, + 89, + 122, + 121, + 85, + 122, + 79, + 123, + 119, + 125, + 95 + ] + ], + [ + [ + 124, + 95, + 114, + 116, + 126, + 120, + 111, + 110, + 125, + 109, + 102, + 112, + 120, + 120, + 124, + 120, + 112, + 116, + 124, + 121, + 126, + 115, + 120, + 121, + 124, + 104, + 116, + 125, + 90, + 124, + 116, + 124 + ] + ], + [ + [ + 73, + 114, + 97, + 121, + 111, + 122, + 125, + 102, + 124, + 93, + 115, + 116, + 118, + 126, + 123, + 113, + 111, + 124, + 124, + 118, + 114, + 126, + 113, + 106, + 124, + 123, + 121, + 125, + 125, + 123, + 124, + 110 + ] + ], + [ + [ + 104, + 122, + 122, + 106, + 115, + 125, + 120, + 120, + 107, + 119, + 119, + 98, + 126, + 120, + 68, + 123, + 126, + 120, + 113, + 107, + 122, + 107, + 120, + 121, + 126, + 96, + 118, + 123, + 106, + 115, + 94, + 122 + ] + ], + [ + [ + 98, + 115, + 124, + 117, + 125, + 123, + 125, + 102, + 96, + 96, + 124, + 121, + 118, + 105, + 122, + 113, + 122, + 117, + 118, + 123, + 99, + 120, + 107, + 123, + 118, + 126, + 106, + 126, + 123, + 118, + 124, + 121 + ] + ] + ], + "scale": [ + [ + [ + 0.0022220611572265625 + ] + ], + [ + [ + 0.00218963623046875 + ] + ], + [ + [ + 0.0021572113037109375 + ] + ], + [ + [ + 0.00222015380859375 + ] + ], + [ + [ + 0.0021915435791015625 + ] + ], + [ + [ + 0.0022068023681640625 + ] + ] + ] + }, + "matmul_1_data": { + "compressed_weight": [ + [ + 119, + 168, + 11, + 49, + 255, + 255 + ], + [ + 255, + 159, + 255, + 255, + 255, + 255 + ], + [ + 255, + 169, + 59, + 255, + 228, + 135 + ], + [ + 202, + 255, + 255, + 149, + 238, + 134 + ], + [ + 229, + 130, + 151, + 255, + 87, + 240 + ], + [ + 26, + 255, + 245, + 75, + 255, + 18 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0025196075439453125 + ], + [ + 0.0024051666259765625 + ], + [ + 0.002300262451171875 + ], + [ + 0.0024013519287109375 + ], + [ + 0.0025997161865234375 + ], + [ + 0.003208160400390625 + ] + ] + }, + "gather_2_data": { + "compressed_weight": [ + [ + 181, + 77, + 12, + 5, + 231, + 255 + ], + [ + 166, + 200, + 149, + 255, + 223, + 1 + ], + [ + 255, + 10, + 224, + 54, + 255, + 166 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0035152435302734375 + ], + [ + 0.0036563873291015625 + ], + [ + 0.003253936767578125 + ] + ] + } 
+} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_mxfp8_e4m3.json b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_mxfp8_e4m3.json new file mode 100644 index 00000000000..bb0c8608c37 --- /dev/null +++ b/tests/openvino/native/data/2025.2/reference_scales/IntegerModel_compressed_weights_mxfp8_e4m3.json @@ -0,0 +1,396 @@ +{ + "matmul_2_data": { + "compressed_weight": [ + [ + [ + 107, + 98, + 110, + 117, + 103, + 85, + 109, + 101, + 92, + 113, + 106, + 115, + 101, + 119, + 108, + 94, + 114, + 119, + 110, + 119, + 112, + 110, + 114, + 120, + 119, + 111, + 116, + 112, + 112, + 117, + 109, + 116 + ] + ], + [ + [ + 115, + 119, + 95, + 116, + 119, + 119, + 71, + 118, + 120, + 119, + 98, + 120, + 118, + 117, + 111, + 103, + 117, + 119, + 105, + 113, + 110, + 119, + 82, + 116, + 114, + 79, + 116, + 72, + 116, + 112, + 119, + 88 + ] + ], + [ + [ + 117, + 89, + 107, + 110, + 119, + 113, + 104, + 103, + 118, + 102, + 96, + 105, + 113, + 113, + 117, + 113, + 105, + 109, + 117, + 114, + 119, + 108, + 113, + 114, + 118, + 97, + 109, + 119, + 83, + 117, + 109, + 117 + ] + ], + [ + [ + 66, + 108, + 90, + 114, + 105, + 115, + 119, + 96, + 118, + 87, + 108, + 110, + 112, + 120, + 116, + 106, + 105, + 118, + 118, + 112, + 107, + 120, + 106, + 100, + 118, + 117, + 115, + 119, + 119, + 116, + 118, + 104 + ] + ], + [ + [ + 97, + 115, + 115, + 99, + 109, + 119, + 113, + 113, + 100, + 112, + 112, + 91, + 120, + 113, + 61, + 116, + 120, + 113, + 106, + 100, + 115, + 100, + 113, + 114, + 119, + 89, + 112, + 116, + 99, + 108, + 88, + 116 + ] + ], + [ + [ + 91, + 109, + 118, + 111, + 119, + 116, + 119, + 96, + 89, + 89, + 118, + 114, + 112, + 98, + 115, + 106, + 115, + 111, + 112, + 117, + 92, + 113, + 101, + 117, + 112, + 120, + 100, + 119, + 117, + 111, + 117, + 114 + ] + ] + ], + "scale": [ + [ + [ + 119 + ] + ], + [ + [ + 119 + ] + ], + [ + [ + 119 + ] + ], + [ + [ + 119 + ] + ], + [ + [ + 119 + ] + ], + [ + [ + 119 + ] + ] + ] + }, + "matmul_1_data": { + "compressed_weight": [ + [ + 119, + 168, + 11, + 49, + 255, + 255 + ], + [ + 255, + 159, + 255, + 255, + 255, + 255 + ], + [ + 255, + 169, + 59, + 255, + 228, + 135 + ], + [ + 202, + 255, + 255, + 149, + 238, + 134 + ], + [ + 229, + 130, + 151, + 255, + 87, + 240 + ], + [ + 26, + 255, + 245, + 75, + 255, + 18 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0025196075439453125 + ], + [ + 0.0024051666259765625 + ], + [ + 0.002300262451171875 + ], + [ + 0.0024013519287109375 + ], + [ + 0.0025997161865234375 + ], + [ + 0.003208160400390625 + ] + ] + }, + "gather_2_data": { + "compressed_weight": [ + [ + 181, + 77, + 12, + 5, + 231, + 255 + ], + [ + 166, + 200, + 149, + 255, + 223, + 1 + ], + [ + 255, + 10, + 224, + 54, + 255, + 166 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0035152435302734375 + ], + [ + 0.0036563873291015625 + ], + [ + 0.003253936767578125 + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/data/2025.3/reference_scales/IntegerModel_compressed_weights_mxfp4.json b/tests/openvino/native/data/2025.3/reference_scales/IntegerModel_compressed_weights_mxfp4.json new file mode 100644 index 00000000000..da08ec38018 --- /dev/null +++ b/tests/openvino/native/data/2025.3/reference_scales/IntegerModel_compressed_weights_mxfp4.json @@ -0,0 +1,276 @@ +{ + "matmul_2_data": { + "compressed_weight": [ + 
19, + 84, + 2, + 35, + 65, + 82, + 98, + 19, + 101, + 100, + 52, + 100, + 70, + 69, + 84, + 83, + 101, + 81, + 102, + 80, + 102, + 97, + 86, + 36, + 101, + 66, + 100, + 80, + 4, + 5, + 69, + 22, + 21, + 51, + 70, + 34, + 38, + 33, + 68, + 69, + 50, + 85, + 54, + 68, + 21, + 99, + 80, + 83, + 48, + 81, + 82, + 22, + 5, + 51, + 100, + 37, + 82, + 70, + 99, + 19, + 86, + 101, + 86, + 37, + 81, + 21, + 99, + 68, + 66, + 20, + 70, + 80, + 70, + 35, + 37, + 68, + 22, + 84, + 49, + 81, + 49, + 69, + 86, + 22, + 17, + 85, + 20, + 53, + 69, + 84, + 65, + 82, + 100, + 97, + 69, + 69 + ], + "scale": [ + [ + [ + 125 + ] + ], + [ + [ + 125 + ] + ], + [ + [ + 125 + ] + ], + [ + [ + 125 + ] + ], + [ + [ + 125 + ] + ], + [ + [ + 125 + ] + ] + ] + }, + "matmul_1_data": { + "compressed_weight": [ + [ + 119, + 168, + 11, + 49, + 255, + 255 + ], + [ + 255, + 159, + 255, + 255, + 255, + 255 + ], + [ + 255, + 169, + 59, + 255, + 228, + 135 + ], + [ + 202, + 255, + 255, + 149, + 238, + 134 + ], + [ + 229, + 130, + 151, + 255, + 87, + 240 + ], + [ + 26, + 255, + 245, + 75, + 255, + 18 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0025196075439453125 + ], + [ + 0.0024051666259765625 + ], + [ + 0.002300262451171875 + ], + [ + 0.0024013519287109375 + ], + [ + 0.0025997161865234375 + ], + [ + 0.003208160400390625 + ] + ] + }, + "gather_2_data": { + "compressed_weight": [ + [ + 181, + 77, + 12, + 5, + 231, + 255 + ], + [ + 166, + 200, + 149, + 255, + 223, + 1 + ], + [ + 255, + 10, + 224, + 54, + 255, + 166 + ] + ], + "zero_point": [ + [ + 0 + ], + [ + 0 + ], + [ + 0 + ] + ], + "scale": [ + [ + 0.0035152435302734375 + ], + [ + 0.0036563873291015625 + ], + [ + 0.003253936767578125 + ] + ] + } +} \ No newline at end of file diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 017d695c25e..4125931fd13 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -170,6 +170,10 @@ def check_int8_node(op: ov.Node, mode: CompressWeightsMode = CompressWeightsMode def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = 7): dtype = ov.Type.u4 if mode == CompressWeightsMode.INT4_ASYM else ov.Type.i4 assert op.get_element_type() == dtype + + compressed_weight = get_const_value_as_numpy_tensor(op) + stats = {"compressed_weight": compressed_weight} + weight_shape = op.shape # NOTE: get_const_value_as_numpy_tensor doesn't work for 4-bit types assert list(weight_shape)[-1] == group_size @@ -189,6 +193,10 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = zero_point_node = convert_node.input_value(0).get_node() assert zero_point_node.get_element_type() == dtype assert list(zero_point_node.shape) == reduced_weight_shape + + zero_point = get_const_value_as_numpy_tensor(zero_point_node) + stats["zero_point"] = zero_point + mul_node = get_next_node(sub_node) else: mul_node = get_next_node(convert_node) @@ -203,13 +211,49 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = convert_node = get_next_node(reshape_node) assert convert_node.get_type_name() == "Convert" - return { - "scale": get_const_value_as_numpy_tensor(scale_node), - } + stats["scale"] = get_const_value_as_numpy_tensor(scale_node) + return stats + + +def check_fp(op: ov.Node, mode: CompressWeightsMode, group_size: int = 32): + 
dtype = ov.Type.f4e2m1 if mode == CompressWeightsMode.MXFP4 else ov.Type.f8e4m3 + assert op.get_element_type() == dtype + + compressed_weight = get_const_value_as_numpy_tensor(op) + stats = {"compressed_weight": compressed_weight} + + weight_shape = op.shape + # NOTE: get_const_value_as_numpy_tensor doesn't work for 4-bit types + assert list(weight_shape)[-1] == group_size + reduced_weight_shape = list(weight_shape) + reduced_weight_shape[-1] = 1 + + convert_node = get_next_node(op) + assert convert_node.get_type_name() == "Convert" + + mul_node = get_next_node(convert_node) + assert mul_node.get_type_name() == "Multiply" + scale_node = mul_node.input_value(1).get_node() + assert list(scale_node.shape) == reduced_weight_shape + if mode is not CompressWeightsMode.FP8_E4M3: + scale_node = scale_node.input_value(0).get_node() + stats["scale"] = get_const_value_as_numpy_tensor(scale_node) + + reshape_node = get_next_node(mul_node) + assert reshape_node.get_type_name() == "Reshape" + + convert_node = get_next_node(reshape_node) + assert convert_node.get_type_name() == "Convert" + + return stats def check_nf4_grouped(op: ov.Node, group_size: int = 7): assert op.get_element_type() == ov.Type.nf4 + + compressed_weight = get_const_value_as_numpy_tensor(op) + stats = {"compressed_weight": compressed_weight} + weight_shape = op.shape # NOTE: get_const_value_as_numpy_tensor doesn't work for 4-bit types assert list(weight_shape)[-1] == group_size @@ -229,15 +273,16 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): convert_node = get_next_node(reshape_node) assert convert_node.get_type_name() == "Convert" - - return { - "scale": get_const_value_as_numpy_tensor(scale_node), - } + stats["scale"] = get_const_value_as_numpy_tensor(scale_node) + return stats def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m3): assert op.get_element_type() == dtype + compressed_weight = get_const_value_as_numpy_tensor(op) + stats = {"compressed_weight": compressed_weight} + if dtype == ov.Type.f16: convert_node = op else: @@ -264,9 +309,8 @@ def check_codebook_grouped(op: ov.Node, group_size: int = 7, dtype=ov.Type.f8e4m convert_node = get_next_node(reshape_node) assert convert_node.get_type_name() == "Convert" - return { - "scale": get_const_value_as_numpy_tensor(scale_node), - } + stats["scale"] = get_const_value_as_numpy_tensor(scale_node) + return stats def check_codebook_indexes(op: ov.Node, dtype=ov.Type.u4): @@ -298,6 +342,18 @@ def check_int8_sym(op: ov.Node): return check_int8_node(op, mode=CompressWeightsMode.INT8_SYM) +def check_mxfp4(op: ov.Node): + return check_fp(op, mode=CompressWeightsMode.MXFP4, group_size=32) + + +def check_mxfp8(op: ov.Node): + return check_fp(op, mode=CompressWeightsMode.MXFP8_E4M3, group_size=32) + + +def check_fp8(op: ov.Node): + return check_fp(op, mode=CompressWeightsMode.FP8_E4M3, group_size=32) + + def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): mapping = {node_name: check_int8_node for node_name in list_layers} primary_node_name = TEST_MODELS[IntegerModel][0] @@ -314,10 +370,13 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]): (CompressWeightsMode.INT4_ASYM, 7, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.NF4, 7, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])), (CompressWeightsMode.CB4_F8E4M3, 7, get_mixed_mapping(check_codebook_grouped, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.MXFP4, 32, get_mixed_mapping(check_mxfp4, 
TEST_MODELS[IntegerModel])), + (CompressWeightsMode.MXFP8_E4M3, 32, get_mixed_mapping(check_mxfp8, TEST_MODELS[IntegerModel])), + (CompressWeightsMode.FP8_E4M3, 32, get_mixed_mapping(check_fp8, TEST_MODELS[IntegerModel])), ), ) def test_compare_compressed_weights(mode, group_size, check_fn_per_node_map): - model = IntegerModel().ov_model + model = IntegerModel(dim2=group_size if group_size > 0 else 7).ov_model compressed_model = compress_weights(model, mode=mode, group_size=group_size) actual_stats = {} for op in compressed_model.get_ops(): @@ -1145,7 +1204,7 @@ def test_call_gptq_with_dataset_scale_estimation_neg_group_size(mode): (CompressWeightsMode.MXFP4, ov.Type.f4e2m1), ], ) -def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode, ov_type, group_size): +def test_mixed_precision_mxfp(sensitivity_metric, all_layers, ratio, ref_ids, mode, ov_type, group_size): # Use hidden dim % 32 == 0 to make it possible to quantize in MX format model = SequentialMatmulModel(mm_hidden_dim=32).ov_model dataset = Dataset([np.ones([1, 4, 32]), np.arange(128).reshape(1, 4, 32)]) @@ -1179,6 +1238,70 @@ def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode assert ref_e8m0_nodes == names_e8m0 +@pytest.mark.parametrize( + ("sensitivity_metric", "all_layers", "ratio", "ref_ids", "group_size"), + ( + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, [], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, [], None), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2], None), + # One test to check manual group size setup is working as expected + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2], 128), + ), +) +@pytest.mark.parametrize( + "mode, ov_type", + [ + (CompressWeightsMode.FP8_E4M3, ov.Type.f8e4m3), + ], +) +def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode, ov_type, group_size): + model = SequentialMatmulModel(mm_hidden_dim=128).ov_model + dataset = Dataset([np.ones([1, 4, 128]), np.arange(512).reshape(1, 4, 128)]) + kwargs = {} + if group_size is not None: + kwargs["group_size"] = group_size + compressed_model = compress_weights( + model, + mode=mode, + ratio=ratio, + all_layers=all_layers, + sensitivity_metric=sensitivity_metric, + dataset=dataset, + **kwargs, + ) + ops = [] + for op in compressed_model.get_ordered_ops(): + if op.get_element_type() == ov_type: + # Check effective default group size == 128 + assert tuple(op.shape) == (128, 1, 128) + ops.append(op) + + names_fp = 
{op.get_friendly_name() for op in ops} + ref_fp_nodes = {f"weights_{i}" for i in ref_ids} + assert ref_fp_nodes == names_fp + + names_scales = { + op.get_friendly_name() + for op in compressed_model.get_ordered_ops() + if op.get_element_type() == ov.Type.f16 and "scale" in op.get_friendly_name() + } + ref_scale_nodes = {f"weights_{i}/scale" for i in range(5)} + assert ref_scale_nodes == names_scales + + @pytest.mark.parametrize( ("mode", "all_layers", "ratio", "ref_ids"), ( diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 7bf959e89e1..8c3d7052228 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -54,6 +54,7 @@ CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3, + CompressWeightsMode.FP8_E4M3, )
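A closing note on the new reference data and checks: each FP8_E4M3 entry stores the raw E4M3 byte codes of the compressed weight next to a per-group FP16 scale, and check_fp walks the Constant (f8e4m3) → Convert → Multiply (FP16 scale) → Reshape → Convert decompression pattern. To sanity-check the error this mode introduces outside of NNCF, the round trip can be emulated with PyTorch's native float8 dtype; this is only a sketch under my own assumptions (PyTorch >= 2.1 for torch.float8_e4m3fn, input dimension divisible by the group size), not the code path NNCF uses:

```python
import torch

def fp8_e4m3_qdq(weight: torch.Tensor, group_size: int = 128) -> torch.Tensor:
    # Group-wise FP16 scale so the max |w| per group lands on 448 (E4M3 max),
    # values cast to float8_e4m3fn and decompressed back to float32.
    out_ch, in_ch = weight.shape
    w = weight.reshape(out_ch, in_ch // group_size, group_size)
    scale = w.abs().amax(dim=-1, keepdim=True) / 448.0
    scale = scale.clamp(min=torch.finfo(torch.float16).tiny).to(torch.float16)
    q = (w / scale).to(torch.float8_e4m3fn)               # stored E4M3 codes
    deq = q.to(torch.float32) * scale.to(torch.float32)   # decompression
    return deq.reshape(out_ch, in_ch)

w = torch.randn(128, 256)
err = (w - fp8_e4m3_qdq(w)).abs().max()
print(f"max abs reconstruction error: {float(err):.4f}")
```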