Commit cbb0da4

committed: update
1 parent 355509e commit cbb0da4

4 files changed: +10 -9 lines changed


src/diffusers/models/model_loading_utils.py

Lines changed: 3 additions & 5 deletions
@@ -176,11 +176,9 @@ def load_model_dict_into_meta(
     hf_quantizer=None,
     keep_in_fp32_modules=None,
 ) -> List[str]:
-    if hf_quantizer is None:
-        device = device or torch.device("cpu")
+    device = device or torch.device("cpu")
     dtype = dtype or torch.float32
     is_quantized = hf_quantizer is not None
-    is_quant_method_bnb = getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES

     accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
     empty_state_dict = model.state_dict()
@@ -213,12 +211,12 @@ def load_model_dict_into_meta(
         # bnb params are flattened.
         if empty_state_dict[param_name].shape != param.shape:
             if (
-                is_quant_method_bnb
+                is_quantized
                 and hf_quantizer.pre_quantized
                 and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device)
             ):
                 hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape)
-            elif not is_quant_method_bnb:
+            else:
                 model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
                 raise ValueError(
                     f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."

src/diffusers/models/modeling_utils.py

Lines changed: 1 addition & 1 deletion
@@ -835,7 +835,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 if hf_quantizer is None:
                     param_device = "cpu"
                 # TODO (sayakpaul, SunMarc): remove this after model loading refactor
-                elif is_quant_method_bnb:
+                else:
                     param_device = torch.cuda.current_device()
                 state_dict = load_state_dict(model_file, variant=variant)
                 model._convert_deprecated_attention_blocks(state_dict)
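
The corresponding one-line change here drops the same bitsandbytes-only flag: with any quantizer active, parameters are now placed on the current CUDA device. A standalone sketch of just that device-selection decision follows; `pick_param_device` is an invented helper, not part of `from_pretrained`.

```python
# Invented helper illustrating the device selection after this change.
import torch


def pick_param_device(hf_quantizer):
    if hf_quantizer is None:
        return "cpu"
    # Any active quantizer (bitsandbytes, torchao, ...) now loads onto the current GPU.
    return torch.cuda.current_device()


print(pick_param_device(None))  # "cpu"
# With a quantizer object on a CUDA machine this would return the device index, e.g. 0:
# print(pick_param_device(some_quantizer))
```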

src/diffusers/quantizers/auto.py

Lines changed: 2 additions & 0 deletions
@@ -19,12 +19,14 @@
 from typing import Dict, Optional, Union

 from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer
+from .torchao import TorchAoHfQuantizer
 from .quantization_config import BitsAndBytesConfig, QuantizationConfigMixin, QuantizationMethod, TorchAoConfig


 AUTO_QUANTIZER_MAPPING = {
     "bitsandbytes_4bit": BnB4BitDiffusersQuantizer,
     "bitsandbytes_8bit": BnB8BitDiffusersQuantizer,
+    "torchao": TorchAoHfQuantizer,
 }

 AUTO_QUANTIZATION_CONFIG_MAPPING = {
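
The two added lines register torchao under the "torchao" key so it can be resolved like the existing bitsandbytes entries. A small sketch of how such a registry lookup might be consumed, assuming a diffusers checkout that includes this commit; `resolve_quantizer` is an invented helper, while the mapping and class names come from the diff.

```python
# Illustrative lookup against the registry extended in this commit.
# Assumes a diffusers install that contains the change; `resolve_quantizer` is invented.
from diffusers.quantizers.auto import AUTO_QUANTIZER_MAPPING


def resolve_quantizer(quant_method: str):
    if quant_method not in AUTO_QUANTIZER_MAPPING:
        raise ValueError(f"Unknown quantization method {quant_method!r}; known: {sorted(AUTO_QUANTIZER_MAPPING)}")
    return AUTO_QUANTIZER_MAPPING[quant_method]


print(resolve_quantizer("torchao"))            # TorchAoHfQuantizer
print(resolve_quantizer("bitsandbytes_4bit"))  # BnB4BitDiffusersQuantizer
```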

src/diffusers/quantizers/torchao/torchao_quantizer.py

Lines changed: 4 additions & 3 deletions
@@ -101,15 +101,16 @@ def validate_environment(self, *args, **kwargs):
     def update_torch_dtype(self, torch_dtype):
         quant_type = self.quantization_config.quant_type

-        if quant_type.startswith("int") or quant_type.startswith("uint"):
+        if quant_type.startswith("int"):
             if torch_dtype is not None and torch_dtype != torch.bfloat16:
                 logger.warning(
-                    f"Setting torch_dtype to {torch_dtype} for int4/int8/uintx quantization, but only bfloat16 is supported right now. Please set `torch_dtype=torch.bfloat16`."
+                    f"You are trying to set torch_dtype to {torch_dtype} for int4/int8/uintx quantization, but "
+                    f"only bfloat16 is supported right now. Please set `torch_dtype=torch.bfloat16`."
                 )

         if torch_dtype is None:
             # we need to set the torch_dtype, otherwise we have dtype mismatch when performing the quantized linear op
-            logger.info(
+            logger.warning(
                 "Overriding `torch_dtype` with `torch_dtype=torch.bfloat16` due to requirements of `torchao` "
                 "to enable model loading in different precisions. Pass your own `torch_dtype` to specify the "
                 "dtype of the remaining non-linear layers, or pass torch_dtype=torch.bfloat16, to remove this warning."
