Skip to content

Commit b6ba595

Browse files
authored
Default to dequantize if cpu in device_map for mxfp4 (#39993)
* default to dq if cpu * another check * style * revert some changes
1 parent a5fac1c commit b6ba595

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

src/transformers/quantizers/quantizer_mxfp4.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,14 @@ def validate_environment(self, *args, **kwargs):
6161
return
6262

6363
if not torch.cuda.is_available():
64-
raise RuntimeError("Using MXFP4 quantized models requires a GPU")
64+
if self.pre_quantized:
65+
logger.warning_once(
66+
"Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
67+
)
68+
self.quantization_config.dequantize = True
69+
return
70+
else:
71+
raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")
6572

6673
if not is_accelerate_available():
6774
raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

0 commit comments

Comments
 (0)