Commit f4c14c2

update
1 parent f52050a commit f4c14c2

File tree

8 files changed: +221 -28 lines changed

docs/source/en/quantization/quanto.md

Lines changed: 41 additions & 20 deletions
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 
 # Quanto
 
-[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum.](https://huggingface.co/docs/optimum/en/index)
+[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum.](https://huggingface.co/docs/optimum/en/index)
 It has been designed with versatility and simplicity in mind:
 
 - All features are available in eager mode (works with non-traceable models)
@@ -27,10 +27,10 @@ In order to use the Quanto backend, you will first need to install `optimum-quan
 pip install optimum-quanto accelerate
 ```
 
-Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. The following snippet demonstrates how to apply `float8` quantization with Quanto.
+Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. The following snippet demonstrates how to apply `float8` quantization with Quanto.
 
 ```python
-import torch
+import torch
 from diffusers import FluxTransformer2DModel, QuantoConfig
 
 model_id = "black-forest-labs/FLUX.1-dev"
@@ -46,24 +46,57 @@ image = pipe(
 ).images[0]
 image.save("output.png")
 ```
-## Saving Quantized models
 
-Diffusers supports serializing and saving Quanto models using the `save_pretrained` method.
+## Using `from_single_file` with the Quanto Backend
+
 ```python
+import torch
+from diffusers import FluxTransformer2DModel, QuantoConfig
+
+ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors"
+quantization_config = QuantoConfig(weights="float8")
+transformer = FluxTransformer2DModel.from_single_file(ckpt_path, quantization_config=quantization_config, torch_dtype=torch.bfloat16)
+```
+
+## Saving Quantized models
+
+Diffusers supports serializing and saving Quanto models using the `save_pretrained` method.
 
-import torch
+```python
+import torch
 from diffusers import FluxTransformer2DModel, QuantoConfig
 
 model_id = "black-forest-labs/FLUX.1-dev"
 quantization_config = QuantoConfig(weights="float8")
 transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16)
 
 # save quantized model to reuse
-transformer.save_pretrained("<your save path>")
+transformer.save_pretrained("<your quantized model save path>")
+
+# you can reload your quantized model with
+model = FluxTransformer2DModel.from_pretrained("<your quantized model save path>")
+```
+
+## Using `torch.compile` with Quanto
+
+Currently the Quanto backend only supports `torch.compile` for `int8` weights and activations.
+
+```python
+import torch
+from diffusers import FluxTransformer2DModel, QuantoConfig
+
+model_id = "black-forest-labs/FLUX.1-dev"
+quantization_config = QuantoConfig(weights="int8")
+transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16)
+transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
+
+pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype)
+pipe.to("cuda")
+```
 
 ## Supported Quantization Types
 
-### Weights
+### Weights
 
 - float8
 - int8
@@ -73,15 +106,3 @@ transformer.save_pretrained("<your save path>")
 ### Activations
 - float8
 - int8
-
-
-```
-```
-```
-
-
-```
-
-
-
-
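
Note: the `torch.compile` snippet added above uses `FluxPipeline` and `torch_dtype` without defining them in that code block. A self-contained variant under the same assumptions (the model id comes from the docs; the prompt below is purely illustrative) would look roughly like this:

```python
# Hedged, self-contained sketch of the torch.compile example from the docs above.
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig

model_id = "black-forest-labs/FLUX.1-dev"
torch_dtype = torch.bfloat16

# int8 weights are the only dtype the Quanto backend currently compiles
quantization_config = QuantoConfig(weights="int8")
transformer = FluxTransformer2DModel.from_pretrained(
    model_id, quantization_config=quantization_config, torch_dtype=torch_dtype
)
transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)

pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype)
pipe.to("cuda")

image = pipe("A cat holding a sign that says hello world").images[0]  # illustrative prompt
image.save("output.png")
```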

src/diffusers/__init__.py

Lines changed: 92 additions & 7 deletions
@@ -2,6 +2,15 @@
 
 from typing import TYPE_CHECKING
 
+from diffusers.quantizers import quantization_config
+from diffusers.utils import dummy_gguf_objects
+from diffusers.utils.import_utils import (
+    is_bitsandbytes_available,
+    is_gguf_available,
+    is_optimum_quanto_version,
+    is_torchao_available,
+)
+
 from .utils import (
     DIFFUSERS_SLOW_IMPORT,
     OptionalDependencyNotAvailable,
@@ -33,12 +42,7 @@
     "loaders": ["FromOriginalModelMixin"],
     "models": [],
     "pipelines": [],
-    "quantizers.quantization_config": [
-        "BitsAndBytesConfig",
-        "GGUFQuantizationConfig",
-        "QuantoConfig",
-        "TorchAoConfig",
-    ],
+    "quantizers.quantization_config": [],
     "schedulers": [],
     "utils": [
         "OptionalDependencyNotAvailable",
@@ -73,6 +77,56 @@
 else:
     _import_structure["quantizers.quantization_config"].extend("QuantoConfig")
 """
+
+try:
+    if not is_bitsandbytes_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_bitsandbytes_objects
+
+    _import_structure["utils.dummy_bitsandbytes_objects"] = [
+        name for name in dir(dummy_bitsandbytes_objects) if not name.startswith("_")
+    ]
+else:
+    _import_structure["quantizers.quantization_config"].append("BitsAndBytesConfig")
+
+try:
+    if not is_gguf_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_gguf_objects
+
+    _import_structure["utils.dummy_gguf_objects"] = [
+        name for name in dir(dummy_gguf_objects) if not name.startswith("_")
+    ]
+else:
+    _import_structure["quantizers.quantization_config"].append("GGUFQuantizationConfig")
+
+try:
+    if not is_torchao_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torchao_objects
+
+    _import_structure["utils.dummy_torchao_objects"] = [
+        name for name in dir(dummy_torchao_objects) if not name.startswith("_")
+    ]
+else:
+    _import_structure["quantizers.quantization_config"].append("TorchAoConfig")
+
+try:
+    if not is_optimum_quanto_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_optimum_quanto_objects
+
+    _import_structure["utils.dummy_optimum_quanto_objects"] = [
+        name for name in dir(dummy_optimum_quanto_objects) if not name.startswith("_")
+    ]
+else:
+    _import_structure["quantizers.quantization_config"].append("QuantoConfig")
+
+
 try:
     if not is_onnx_available():
         raise OptionalDependencyNotAvailable()
@@ -600,7 +654,38 @@
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .configuration_utils import ConfigMixin
-    from .quantizers.quantization_config import BitsAndBytesConfig, GGUFQuantizationConfig, QuantoConfig, TorchAoConfig
+
+    try:
+        if not is_bitsandbytes_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_bitsandbytes_objects import *
+    else:
+        from .quantizers.quantization_config import BitsAndBytesConfig
+
+    try:
+        if not is_gguf_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_gguf_objects import *
+    else:
+        from .quantizers.quantization_config import GGUFQuantizationConfig
+
+    try:
+        if not is_torchao_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_torchao_objects import *
+    else:
+        from .quantizers.quantization_config import TorchAoConfig
+
+    try:
+        if not is_optimum_quanto_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_optimum_quanto_objects import *
+    else:
+        from .quantizers.quantization_config import QuantoConfig
 
     try:
         if not is_onnx_available():
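
The `src/diffusers/__init__.py` hunks above repeat one pattern per backend: probe for the optional dependency, register dummy objects when it is missing, and expose the real config class otherwise. A minimal standalone sketch of that pattern (simplified, illustrative names rather than the actual diffusers internals):

```python
# Illustrative sketch of the optional-dependency pattern used in __init__.py above.
import importlib.util

_import_structure = {"quantizers.quantization_config": []}


class OptionalDependencyNotAvailable(Exception):
    """Raised when an optional backend is not installed."""


def is_optimum_quanto_available() -> bool:
    # Simplified probe; diffusers uses its own import_utils helpers.
    return importlib.util.find_spec("optimum") is not None


try:
    if not is_optimum_quanto_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Backend missing: expose dummy objects that raise a helpful error on use.
    _import_structure["utils.dummy_optimum_quanto_objects"] = ["QuantoConfig"]
else:
    # Backend present: expose the real config class lazily.
    _import_structure["quantizers.quantization_config"].append("QuantoConfig")

print(_import_structure)
```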

src/diffusers/models/modeling_utils.py

Lines changed: 0 additions & 1 deletion
@@ -1041,7 +1041,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         model,
         state_dict,
         device=param_device,
-        dtype=torch_dtype,
         model_name_or_path=pretrained_model_name_or_path,
         hf_quantizer=hf_quantizer,
         keep_in_fp32_modules=keep_in_fp32_modules,

src/diffusers/utils/dummy_bitsandbytes_objects.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class BitsAndBytesConfig(metaclass=DummyObject):
+    _backends = ["bitsandbytes"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["bitsandbytes"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["bitsandbytes"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["bitsandbytes"])

src/diffusers/utils/dummy_gguf_objects.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class GGUFQuantizationConfig(metaclass=DummyObject):
+    _backends = ["gguf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["gguf"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["gguf"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["gguf"])

src/diffusers/utils/dummy_optimum_quanto_objects.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class QuantoConfig(metaclass=DummyObject):
+    _backends = ["optimum_quanto"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["optimum_quanto"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["optimum_quanto"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["optimum_quanto"])

src/diffusers/utils/dummy_torchao_objects.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..utils import DummyObject, requires_backends
+
+
+class TorchAoConfig(metaclass=DummyObject):
+    _backends = ["torchao"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torchao"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torchao"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torchao"])
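
All four new dummy modules follow the same `DummyObject` pattern: the config class is always importable, but constructing it without the backend calls `requires_backends`, which raises an error telling the user what to install. A hypothetical session, assuming optimum-quanto is not installed (the exact error text may differ):

```python
# Hypothetical usage of the dummy fallback; the error message wording is illustrative.
from diffusers import QuantoConfig  # succeeds even without optimum-quanto installed

try:
    QuantoConfig(weights="float8")  # instantiation triggers requires_backends
except ImportError as err:
    print(err)  # e.g. "QuantoConfig requires the optimum_quanto library but it was not found ..."
```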

tests/quantization/quanto/test_quanto.py

Lines changed: 20 additions & 0 deletions
@@ -9,6 +9,7 @@
 from diffusers.utils import is_optimum_quanto_available
 from diffusers.utils.testing_utils import (
     nightly,
+    numpy_cosine_similarity_distance,
     require_accelerate,
     require_big_gpu_with_torch_cuda,
     torch_device,
@@ -142,6 +143,25 @@ class FluxTransformerInt8(FluxTransformerQuantoMixin, unittest.TestCase):
     def get_dummy_init_kwargs(self):
         return {"weights": "int8"}
 
+    def test_torch_compile(self):
+        model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
+        compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True)
+        inputs = self.get_dummy_inputs()
+
+        model.to(torch_device)
+        with torch.no_grad():
+            model_output = model(**inputs).sample
+        model.to("cpu")
+
+        compiled_model.to(torch_device)
+        with torch.no_grad():
+            compiled_model_output = compiled_model(**inputs).sample
+
+        max_diff = numpy_cosine_similarity_distance(
+            model_output.cpu().flatten(), compiled_model_output.cpu().flatten()
+        )
+        assert max_diff < 1e-4
+
 
 class FluxTransformerInt4(FluxTransformerQuantoMixin, unittest.TestCase):
     expected_memory_use_in_gb = 6
