-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from ..base import DiffusersQuantizer

     is_accelerate_available,
     is_accelerate_version,
     is_gguf_available,
+    is_gguf_version,
     is_torch_available,
     logging,
 )
     import gguf
     import torch

-    from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear
+    from .utils import (
+        GGUFParameter,
+        _quant_shape_from_byte_shape,
+        _replace_with_gguf_linear,
+    )


 logger = logging.get_logger(__name__)
@@ -39,11 +44,26 @@ def validate_environment(self, *args, **kwargs):
             raise ImportError(
                 "Loading GGUF Parameters requires `accelerate` installed in your environment: `pip install 'accelerate>=0.26.0'`"
             )
-        if not is_gguf_available():
+        if not is_gguf_available() or is_gguf_version("<", "0.10.0"):
             raise ImportError(
-                "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf`"
+                "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf>=0.10.0`"
             )

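For context on the new version gate: `is_gguf_version` comes from diffusers' import utilities alongside `is_gguf_available`. The helper name is real, but the body below is only a sketch of the comparison it plausibly performs, assuming a `packaging`-based implementation:

```python
# Illustrative sketch — the real helper lives in diffusers' import utilities.
import importlib.metadata

from packaging import version


def is_gguf_version(operation: str, target: str) -> bool:
    # Compare the installed gguf version against `target`,
    # e.g. is_gguf_version("<", "0.10.0") -> True for gguf 0.9.x.
    ops = {"<": "__lt__", "<=": "__le__", ">": "__gt__", ">=": "__ge__", "==": "__eq__"}
    installed = version.parse(importlib.metadata.version("gguf"))
    return getattr(installed, ops[operation])(version.parse(target))
```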
+    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
+        # need more space for buffers that are created during quantization
+        max_memory = {key: val * 0.90 for key, val in max_memory.items()}
+        return max_memory
+
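A quick worked example of the 10% haircut `adjust_max_memory` applies, assuming the budgets have already been resolved to byte counts (as accelerate does before dispatch); the sample values are hypothetical:

```python
# Hypothetical input: per-device budgets in bytes, as produced by accelerate.
max_memory = {0: 24 * 1024**3, "cpu": 64 * 1024**3}  # 24 GiB GPU, 64 GiB CPU

adjusted = {key: val * 0.90 for key, val in max_memory.items()}
# {0: 23192823398.4, 'cpu': 61847528857.6} — roughly 10% is held back
# for the temporary buffers created while dequantizing GGUF blocks.
```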
+    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
+        if target_dtype != torch.uint8:
+            logger.info(f"target_dtype {target_dtype} is replaced by `torch.uint8` for GGUF quantization")
+        return torch.uint8
+
+    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+        if torch_dtype is None:
+            torch_dtype = self.compute_dtype
+        return torch_dtype
+
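Taken together, the two dtype hooks mean GGUF weights are planned for as raw `torch.uint8` byte blocks, while everything unquantized falls back to the configured compute dtype. A small illustration, with `quantizer` standing in for a configured instance (hypothetical variable, real method names):

```python
import torch

# Hypothetical quantizer instance; compute_dtype is set from the quantization config.
quantizer.compute_dtype = torch.bfloat16

quantizer.adjust_target_dtype(torch.float16)
# -> torch.uint8 (after logging a notice): quantized GGUF blocks are stored as bytes.

quantizer.update_torch_dtype(None)
# -> torch.bfloat16: unquantized modules and activations use the compute dtype.
```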
     def check_quantized_param_shape(self, param_name, current_param, loaded_param):
         loaded_param_shape = loaded_param.shape
         current_param_shape = current_param.shape
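The shape check leans on `_quant_shape_from_byte_shape` (imported from `.utils` above) to translate the on-disk byte layout back into logical tensor dimensions. The helper is real, but the body below is a reconstruction, shown with gguf's Q4_0 constants (block size 32, type size 18 bytes) as a worked example:

```python
def _quant_shape_from_byte_shape(shape, type_size, block_size):
    # Each `type_size`-byte block on disk encodes `block_size` logical weights,
    # so only the last dimension needs rescaling.
    *leading, last = shape
    return (*leading, last // type_size * block_size)


# Q4_0 packs 32 4-bit weights plus an fp16 scale into 18 bytes, so a
# (4096, 2304)-byte tensor maps back to (4096, 2304 // 18 * 32) = (4096, 4096).
assert _quant_shape_from_byte_shape((4096, 2304), type_size=18, block_size=32) == (4096, 4096)
```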
@@ -62,7 +82,7 @@ def check_quantized_param_shape(self, param_name, current_param, loaded_param):
     def check_if_quantized_param(
         self,
         model: "ModelMixin",
-        param_value: "torch.Tensor",
+        param_value: Union["GGUFParameter", "torch.Tensor"],
         param_name: str,
         state_dict: Dict[str, Any],
         **kwargs,
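The widened annotation hints at how the check likely works: quantized entries arrive in the state dict as `GGUFParameter`, a tensor subclass that carries its quantization type, so an `isinstance` test is enough to route them to the custom loading path. A hedged sketch of such a body (not the verbatim implementation):

```python
def check_if_quantized_param(self, model, param_value, param_name, state_dict, **kwargs) -> bool:
    # Sketch: a GGUFParameter in the state dict marks the tensor as
    # block-quantized and in need of create_quantized_param below.
    return isinstance(param_value, GGUFParameter)
```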
@@ -82,10 +102,13 @@ def create_quantized_param(
         unexpected_keys: Optional[List[str]] = None,
     ):
         module, tensor_name = get_module_from_name(model, param_name)
-        if tensor_name not in module._parameters:
+        if tensor_name not in module._parameters and tensor_name not in module._buffers:
             raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")

-        module._parameters[tensor_name] = param_value
+        if tensor_name in module._parameters:
+            module._parameters[tensor_name] = param_value.to(target_device)
+        if tensor_name in module._buffers:
+            module._buffers[tensor_name] = param_value.to(target_device)

     def _process_model_before_weight_loading(
         self,
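For a sense of how these hooks are exercised end to end, a usage sketch based on the GGUF single-file loading API this PR wires up; the checkpoint URL and model class are illustrative:

```python
import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

# Illustrative checkpoint; any single-file GGUF with supported quant types should work.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```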