@@ -400,13 +400,15 @@ class TorchAoConfig(QuantizationConfigMixin):
 
     Args:
         quant_type (`str`):
-            The type of quantization we want to use, currently supporting: `int4_weight_only`, `int8_weight_only` and `int8_dynamic_activation_int8_weight`.
+            The type of quantization we want to use, currently supporting: `int4_weight_only`, `int8_weight_only` and
+            `int8_dynamic_activation_int8_weight`.
         modules_to_not_convert (`list`, *optional*, default to `None`):
-            The list of modules to not quantize, useful for quantizing models that explicitly require to have
-            some modules left in their original precision.
+            The list of modules to not quantize, useful for quantizing models that explicitly require to have some
+            modules left in their original precision.
         kwargs (`Dict[str, Any]`, *optional*):
-            The keyword arguments for the chosen type of quantization, for example, int4_weight_only quantization supports two keyword arguments
-            `group_size` and `inner_k_tiles` currently. More API examples and documentation of arguments can be found in
+            The keyword arguments for the chosen type of quantization, for example, int4_weight_only quantization
+            supports two keyword arguments `group_size` and `inner_k_tiles` currently. More API examples and
+            documentation of arguments can be found in
             https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques
 
     Example:
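
For orientation, a minimal sketch of the API the reflowed docstring describes; the import path is an assumption based on the `AutoModelForCausalLM` example in the hunk below:

```python
# Minimal usage sketch (import path assumed from the docstring example below):
from transformers import TorchAoConfig

# `group_size` is one of the int4_weight_only kwargs forwarded through **kwargs.
quantization_config = TorchAoConfig("int4_weight_only", group_size=32)
```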
@@ -415,15 +417,17 @@ class TorchAoConfig(QuantizationConfigMixin):
     TODO(aryan): update
     quantization_config = TorchAoConfig("int4_weight_only", group_size=32)
     # int4_weight_only quant is only working with *torch.bfloat16* dtype right now
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config
+    )
     ```
     """
 
     def __init__(self, quant_type: str, modules_to_not_convert: Optional[List] = None, **kwargs):
         self.quant_method = QuantizationMethod.TORCHAO
         self.quant_type = quant_type
         self.modules_to_not_convert = modules_to_not_convert
-
+
         # When we load from serialized config, "quant_type_kwargs" will be the key
         if "quant_type_kwargs" in kwargs:
             self.quant_type_kwargs = kwargs["quant_type_kwargs"]
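
The `quant_type_kwargs` branch above distinguishes two construction paths. A sketch of both, under the assumption that the branch not shown in this hunk stores the remaining kwargs directly:

```python
# Fresh construction: extra keyword arguments become the quant-type kwargs
# (assumes the else-branch, not shown in this hunk, stores them as-is).
config = TorchAoConfig("int4_weight_only", group_size=32)

# Reloading a serialized config: the same kwargs arrive pre-bundled under
# the "quant_type_kwargs" key, which the branch above unpacks.
config = TorchAoConfig("int4_weight_only", quant_type_kwargs={"group_size": 32})
assert config.quant_type_kwargs == {"group_size": 32}
```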
@@ -448,7 +452,7 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List] = Non
 
         if len(unsupported_kwargs) > 0:
             raise ValueError(
-                f"The quantization method \"{method}\" does not supported the following keyword arguments: "
+                f'The quantization method "{method}" does not support the following keyword arguments: '
                 f"{unsupported_kwargs}. The following keyword arguments are supported: {all_kwargs}."
             )
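
To see the error message above in practice: passing a kwarg the chosen quant type does not accept should raise during construction. A hypothetical repro:

```python
import torch

try:
    # int4_weight_only takes group_size/inner_k_tiles, not activation_dtype,
    # so this should trip the ValueError above (hypothetical repro).
    TorchAoConfig("int4_weight_only", activation_dtype=torch.float8_e5m2)
except ValueError as err:
    print(err)  # lists both the unsupported and the supported kwargs
```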
@@ -460,16 +464,17 @@ def _get_torchao_quant_type_to_method(cls):
 
         if is_torchao_available():
             from torchao.quantization import (
-                int4_weight_only,
-                int8_dynamic_activation_int8_weight,
-                int8_dynamic_activation_int4_weight,
-                int8_weight_only,
                 float8_dynamic_activation_float8_weight,
                 float8_static_activation_float8_weight,
                 float8_weight_only,
                 fpx_weight_only,
+                int4_weight_only,
+                int8_dynamic_activation_int4_weight,
+                int8_dynamic_activation_int8_weight,
+                int8_weight_only,
                 uintx_weight_only,
             )
+
             # TODO(aryan): Add a note on how to use PerAxis and PerGroup observers
             from torchao.quantization.observer import PerRow, PerTensor
 
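The re-sorted torchao imports combine later in this method; as a hedged sketch, the observer classes feed the float8 constructor as an (activation, weight) granularity pair, matching the tuple form visible further down in this diff:

```python
from torchao.quantization import float8_dynamic_activation_float8_weight
from torchao.quantization.observer import PerRow

# Per-row granularity for both activations and weights; the tuple form
# mirrors the granularity=(...) call in the hunks below.
quantize_fn = float8_dynamic_activation_float8_weight(granularity=(PerRow(), PerRow()))
```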
@@ -502,8 +507,10 @@ def _get_torchao_quant_type_to_method(cls):
             def generate_float8dq_types(dtype: torch.dtype):
                 name = "e5m2" if dtype == torch.float8_e5m2 else "e4m3"
                 types = {}
-
-                types[f"float8dq_{name}_a8w8"] = partial(float8_dynamic_activation_float8_weight, activation_dtype=dtype, weight_dtype=dtype)
+
+                types[f"float8dq_{name}_a8w8"] = partial(
+                    float8_dynamic_activation_float8_weight, activation_dtype=dtype, weight_dtype=dtype
+                )
                 for activation_granularity_cls in [PerTensor, PerRow]:
                     for weight_granularity_cls in [PerTensor, PerRow]:
                         activation_name = "t" if activation_granularity_cls is PerTensor else "r"
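
The nested loop spanning this hunk and the next builds one mapping entry per granularity combination. A standalone sketch of the key names it plausibly generates; the exact format is an assumption extrapolated from the `_a8w8` key and the "t"/"r" suffixes above:

```python
from itertools import product

# Hypothetical reconstruction of the generated keys for the e4m3 case:
for a, w in product("tr", repeat=2):
    print(f"float8dq_e4m3_a{a}w{w}")
# -> float8dq_e4m3_atwt, float8dq_e4m3_atwr, float8dq_e4m3_arwt, float8dq_e4m3_arwr
```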
@@ -526,22 +533,22 @@ def generate_float8dq_types(dtype: torch.dtype):
                             weight_dtype=dtype,
                             granularity=(activation_granularity_cls(), weight_granularity_cls()),
                         )
-
+
                 return types
 
             def generate_fpx_quantization_types(bits: int):
                 types = {}
-
+
                 for ebits in range(1, bits):
                     mbits = bits - ebits - 1
                     types[f"fp{bits}_e{ebits}m{mbits}"] = partial(fpx_weight_only, ebits=ebits, mbits=mbits)
                     types[f"fp{bits}_e{ebits}m{mbits}_a16w{bits}"] = partial(fpx_weight_only, ebits=ebits, mbits=mbits)
-
+
                 non_sign_bits = bits - 1
                 default_ebits = (non_sign_bits + 1) // 2
                 default_mbits = non_sign_bits - default_ebits
                 types[f"fp{bits}"] = partial(fpx_weight_only, ebits=default_ebits, mbits=default_mbits)
-
+
                 return types
 
             # TODO(aryan): handle cuda capability and torch 2.2/2.3
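
A worked example of what `generate_fpx_quantization_types` registers for `bits=6`, reproducing the arithmetic above in isolation:

```python
bits = 6
for ebits in range(1, bits):  # ebits runs 1..5
    mbits = bits - ebits - 1  # one bit is reserved for the sign
    print(f"fp{bits}_e{ebits}m{mbits}")
# -> fp6_e1m4, fp6_e2m3, fp6_e3m2, fp6_e4m1, fp6_e5m0

# Default split: 5 non-sign bits -> default_ebits = (5 + 1) // 2 = 3,
# default_mbits = 2, so the bare "fp6" alias is equivalent to "fp6_e3m2".
```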
@@ -561,11 +568,19 @@ def generate_fpx_quantization_types(bits: int):
                 # float8_e5m2 weight + float8 activation (dynamic)
                 "float8_dynamic_activation_float8_weight": float8_dynamic_activation_float8_weight,
                 "float8dq": float8_dynamic_activation_float8_weight,
-                "float8dq_e5m2": partial(float8_dynamic_activation_float8_weight, activation_dtype=torch.float8_e5m2, weight_dtype=torch.float8_e5m2),
+                "float8dq_e5m2": partial(
+                    float8_dynamic_activation_float8_weight,
+                    activation_dtype=torch.float8_e5m2,
+                    weight_dtype=torch.float8_e5m2,
+                ),
                 "float8_a8w8": float8_dynamic_activation_float8_weight,
                 **generate_float8dq_types(torch.float8_e5m2),
                 # float8_e4m3 weight + float8 activation (dynamic)
-                "float8dq_e4m3": partial(float8_dynamic_activation_float8_weight, activation_dtype=torch.float8_e4m3fn, weight_dtype=torch.float8_e4m3fn),
+                "float8dq_e4m3": partial(
+                    float8_dynamic_activation_float8_weight,
+                    activation_dtype=torch.float8_e4m3fn,
+                    weight_dtype=torch.float8_e4m3fn,
+                ),
                 **generate_float8dq_types(torch.float8_e4m3fn),
                 # float8 weight + float8 activation (static)
                 "float8_static_activation_float8_weight": float8_static_activation_float8_weight,
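
Tying the mapping back to the docstring example, a hedged end-to-end sketch; `model_id` is an illustrative placeholder and the import paths are assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig

model_id = "meta-llama/Llama-3.1-8B"  # illustrative checkpoint, swap freely

# "float8dq_e4m3" resolves through the mapping above to
# float8_dynamic_activation_float8_weight with e4m3 weights and activations.
quantization_config = TorchAoConfig("float8dq_e4m3")
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config
)
```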