@@ -19,9 +19,9 @@
 from nncf.errors import UnsupportedModelError
 from nncf.parameters import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_MXFP4_QUANTILES
+from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_F4E2M1_QUANTILES
 from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES
-from nncf.quantization.algorithms.weight_compression.constants import MXFP4_QUANTILES
+from nncf.quantization.algorithms.weight_compression.constants import F4E2M1_QUANTILES
 from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES
 from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight
 from nncf.quantization.fake_quantize import calculate_scale_zero_point
@@ -32,6 +32,16 @@
 
 ReductionAxes = Union[int, tuple[int, ...]]
 
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_MODES = (
+    CompressWeightsMode.INT8_ASYM,
+    CompressWeightsMode.INT8_SYM,
+    CompressWeightsMode.INT4_ASYM,
+    CompressWeightsMode.INT4_SYM,
+    CompressWeightsMode.NF4,
+    CompressWeightsMode.MXFP4,
+    CompressWeightsMode.FP4,
+)
 MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000
 
 
@@ -168,7 +178,7 @@ def do_float_quantization(
         weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)
 
     # Optimized implementation
-    if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight):
+    if _can_run_optimized(weight, config.mode):
         from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov
 
         return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -183,7 +193,7 @@ def do_float_quantization(
     if scale is None:
         scale = calculate_float_quantization_params(weight, reduction_axes, config)
     norm_weight = _calculate_normalized_weight(weight, scale)
-    if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]:
+    if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]:
         if original_weight_backend == TensorBackend.ov:
             # Can convert through OpenVINO and return OpenVINO-native nf4/f4e2m1 tensor
             target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
@@ -209,7 +219,7 @@ def float_quantize_dequantize_weight(
 ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]:
     """
     First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values.
-    MXFP8_E4M3, FP8_E4M3 and FP4 modes currently are not supported.
+    MXFP8_E4M3 and FP8_E4M3 modes currently are not supported.
 
     :param weight: The weight tensor to quantize-dequantize.
     :param config: Compression configuration.
@@ -221,12 +231,13 @@ def float_quantize_dequantize_weight(
     assert config.mode in [
         CompressWeightsMode.NF4,
         CompressWeightsMode.MXFP4,
+        CompressWeightsMode.FP4,
         CompressWeightsMode.CODEBOOK,
         CompressWeightsMode.CB4_F8E4M3,
     ]
 
     # Optimized implementation
-    if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight):
+    if _can_run_optimized(weight, config.mode):
         from nncf.openvino.optimized_functions import (
             float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
         )
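
For context, a minimal usage sketch of the FP4 path this patch enables. The WeightCompressionConfig arguments, the Tensor wrapper, and the weight_lowering module path are assumptions for illustration, not taken from this diff:

import numpy as np

from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import (
    float_quantize_dequantize_weight,
)
from nncf.tensor import Tensor

# Hypothetical call: argument names above are assumptions for illustration.
weight = Tensor(np.random.rand(256, 256).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.FP4, group_size=32)
decompressed = float_quantize_dequantize_weight(weight, config, reduction_axes=-1)
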
@@ -302,7 +313,7 @@ def get_integer_quantization_error(
     :return: The quantity characterizing the error of integer quantization.
     """
     # Optimized implementation
-    if _can_run_optimized(weight):
+    if _can_run_optimized(weight, config.mode):
         from nncf.openvino.optimized_functions import (
             get_integer_quantization_error as get_integer_quantization_error_ov,
         )
@@ -439,7 +450,7 @@ def do_integer_quantization(
         weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)
 
     # Optimized implementation
-    if _can_run_optimized(weight):
+    if _can_run_optimized(weight, config.mode):
         from nncf.openvino.optimized_functions import do_integer_quantization as do_integer_quantization_ov
 
         return do_integer_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
@@ -488,7 +499,7 @@ def integer_quantize_dequantize_weight(
         (and zero point).
     """
     # Optimized implementation
-    if _can_run_optimized(weight):
+    if _can_run_optimized(weight, config.mode):
         from nncf.openvino.optimized_functions import (
             integer_quantize_dequantize_weight as integer_quantize_dequantize_weight_ov,
         )
@@ -520,14 +531,14 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeights
     :param norm_weight: Normalized weight tensor to quantize.
     :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants.
     """
-    assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]
-    quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else MXFP4_QUANTILES
-    quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_MXFP4_QUANTILES
+    assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else F4E2M1_QUANTILES
+    quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_F4E2M1_QUANTILES
     quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend)
     indexes = fns.searchsorted(quantile_centers, norm_weight)
     quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend)
 
-    if mode == CompressWeightsMode.MXFP4:
+    if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]:
         # If in-between two quantiles, round to the nearest even quantile.
         shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1)
         dist_left = fns.abs(norm_weight - quantiles[indexes])
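
To make the tie-breaking above concrete, here is a toy numpy sketch of rounding half to the even quantile index; the 4-point grid is made up for illustration and is not the real F4E2M1_QUANTILES table:

import numpy as np

quantiles = np.array([-1.0, -0.5, 0.5, 1.0])  # toy grid, NOT f4e2m1
centers = (quantiles[:-1] + quantiles[1:]) / 2  # midpoints between neighbors

x = np.array([-0.75, 0.0, 0.75])  # each value lies exactly on a midpoint
idx = np.searchsorted(centers, x)  # nearest-quantile index; ties fall to the left
nxt = np.clip(idx + 1, 0, quantiles.size - 1)
tie = np.abs(x - quantiles[idx]) == np.abs(x - quantiles[nxt])

# On an exact tie, pick whichever neighbor has the even index ("round half to even").
result = np.where(tie, np.where(idx % 2 == 0, idx, nxt), idx)
print(quantiles[result])  # -> [-1.   0.5  0.5]
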
@@ -639,11 +650,12 @@ def _calculate_integer_quantized_weight(
     return compressed_weights
 
 
-def _can_run_optimized(inp: Tensor) -> bool:
+def _can_run_optimized(inp: Tensor, mode: CompressWeightsMode) -> bool:
     if (
         inp.backend in [TensorBackend.ov, TensorBackend.numpy]
         and inp.size >= MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
         and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
+        and mode in OPTIMIZED_COMPRESSION_COMPATIBLE_MODES
     ):
         if is_openvino_available():
             from nncf.openvino.cpu_info import is_arm_cpu
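
Note that the gate above treats NNCF_DISABLE_OPTIMIZED_COMPRESSION as a pure on/off switch: the optimized path is skipped whenever the variable is set, regardless of its value, since only `os.environ.get(...) is None` is checked. For example:

import os

# Any value disables optimized compression; only presence is checked.
os.environ["NNCF_DISABLE_OPTIMIZED_COMPRESSION"] = "1"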