
Commit 12a837b

update
1 parent 299c6ab commit 12a837b


2 files changed: +57 −22 lines changed

src/diffusers/loaders/lora_pipeline.py

Lines changed: 55 additions & 22 deletions
@@ -23,6 +23,7 @@
     deprecate,
     get_submodule_by_name,
     is_bitsandbytes_available,
+    is_gguf_available,
     is_peft_available,
     is_peft_version,
     is_torch_version,
@@ -49,9 +50,6 @@
 )


-if is_bitsandbytes_available():
-    from ..quantizers.bitsandbytes import dequantize_bnb_weight
-
 _LOW_CPU_MEM_USAGE_DEFAULT_LORA = False
 if is_torch_version(">=", "1.9.0"):
     if (
@@ -72,6 +70,49 @@
 _MODULE_NAME_TO_ATTRIBUTE_MAP_FLUX = {"x_embedder": "in_channels"}


+def _dequantize_weight_for_expanded_lora(model, module):
+    if is_bitsandbytes_available():
+        from ..quantizers.bitsandbytes import dequantize_bnb_weight
+
+    if is_gguf_available():
+        from ..quantizers.gguf.utils import dequantize_gguf_tensor
+
+    is_bnb_4bit_quantized = module.weight.__class__.__name__ == "Params4bit"
+    is_gguf_quantized = module.weight.__class__.__name__ == "GGUFParameter"
+
+    if is_bnb_4bit_quantized and not is_bitsandbytes_available():
+        raise ValueError(
+            "The checkpoint seems to have been quantized with `bitsandbytes` (4bits). Install `bitsandbytes` to load quantized checkpoints."
+        )
+    if is_gguf_quantized and not is_gguf_available():
+        raise ValueError(
+            "The checkpoint seems to have been quantized with `gguf`. Install `gguf` to load quantized checkpoints."
+        )
+
+    weight_on_cpu = False
+    if not module.weight.is_cuda:
+        weight_on_cpu = True
+
+    if is_bnb_4bit_quantized:
+        module_weight = dequantize_bnb_weight(
+            module.weight.cuda() if weight_on_cpu else module.weight,
+            state=module.weight.quant_state,
+            dtype=model.dtype,
+        ).data
+    elif is_gguf_quantized:
+        module_weight = dequantize_gguf_tensor(
+            module.weight.cuda() if weight_on_cpu else module.weight,
+        )
+        module_weight = module_weight.to(model.dtype)
+    else:
+        module_weight = module.weight.data
+
+    if weight_on_cpu:
+        module_weight = module_weight.cpu()
+
+    return module_weight
+
+
 class StableDiffusionLoraLoaderMixin(LoraBaseMixin):
     r"""
     Load LoRA layers into Stable Diffusion [`UNet2DConditionModel`] and
@@ -1970,26 +2011,10 @@ def _maybe_expand_transformer_param_shape_or_error_(
         overwritten_params = {}

         is_peft_loaded = getattr(transformer, "peft_config", None) is not None
+        is_quantized = hasattr(transformer, "hf_quantizer")
         for name, module in transformer.named_modules():
             if isinstance(module, torch.nn.Linear):
-                is_bnb_4bit_quantized = module.weight.__class__.__name__ == "Params4bit"
-                if is_bnb_4bit_quantized and not is_bitsandbytes_available():
-                    raise ValueError(
-                        "The checkpoint seems to have been quantized with `bitsandbytes` (4bits). Install `bitsandbytes` to load quantized checkpoints."
-                    )
-                elif is_bnb_4bit_quantized:
-                    weight_on_cpu = False
-                    if not module.weight.is_cuda:
-                        weight_on_cpu = True
-                    module_weight = dequantize_bnb_weight(
-                        module.weight.cuda() if weight_on_cpu else module.weight,
-                        state=module.weight.quant_state,
-                        dtype=transformer.dtype,
-                    ).data
-                    if weight_on_cpu:
-                        module_weight = module_weight.cpu()
-                else:
-                    module_weight = module.weight.data
+                module_weight = module.weight.data
                 module_bias = module.bias.data if module.bias is not None else None
                 bias = module_bias is not None

@@ -2034,6 +2059,9 @@ def _maybe_expand_transformer_param_shape_or_error_(
                 parent_module_name, _, current_module_name = name.rpartition(".")
                 parent_module = transformer.get_submodule(parent_module_name)

+                if is_quantized:
+                    module_weight = _dequantize_weight_for_expanded_lora(transformer, module)
+
                 with torch.device("meta"):
                     expanded_module = torch.nn.Linear(
                         in_features, out_features, bias=bias, dtype=module_weight.dtype
@@ -2134,7 +2162,12 @@ def _calculate_module_shape(
         base_weight_param_name: str = None,
     ) -> "torch.Size":
        def _get_weight_shape(weight: torch.Tensor):
-            return weight.quant_state.shape if weight.__class__.__name__ == "Params4bit" else weight.shape
+            if weight.__class__.__name__ == "Params4bit":
+                return weight.quant_state.shape
+            elif weight.__class__.__name__ == "GGUFParameter":
+                return weight.quant_shape
+            else:
+                return weight.shape

         if base_module is not None:
             return _get_weight_shape(base_module.weight)
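
For context, a rough usage sketch of the scenario this change appears to target: loading a Flux Control-style LoRA, which widens `x_embedder`, onto a GGUF-quantized transformer. Previously only bitsandbytes 4-bit weights were dequantized before the shape expansion. The checkpoint URL and repo IDs below are illustrative assumptions, not taken from this commit.

# Illustrative sketch only: the GGUF checkpoint URL and repo IDs are assumptions.
# It exercises the new path: a Control LoRA that expands x_embedder forces
# _maybe_expand_transformer_param_shape_or_error_ to dequantize the GGUF weight first.
import torch
from diffusers import FluxControlPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

transformer = FluxTransformer2DModel.from_single_file(
    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q4_0.gguf",  # assumed checkpoint
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = FluxControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
)
# Loading the Control LoRA requires expanding x_embedder, so the packed GGUF weight
# must be dequantized before the expanded Linear is created.
pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")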

src/diffusers/quantizers/gguf/utils.py

Lines changed: 2 additions & 0 deletions
@@ -400,6 +400,8 @@ def __new__(cls, data, requires_grad=False, quant_type=None):
         data = data if data is not None else torch.empty(0)
         self = torch.Tensor._make_subclass(cls, data, requires_grad)
         self.quant_type = quant_type
+        block_size, type_size = GGML_QUANT_SIZES[quant_type]
+        self.quant_shape = _quant_shape_from_byte_shape(self.shape, type_size, block_size)

         return self
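
The new `quant_shape` attribute matters because a `GGUFParameter`'s `.shape` is the packed byte shape, not the logical tensor shape, which is what `_calculate_module_shape` needs when comparing against LoRA dimensions. Below is a minimal sketch of the conversion, assuming `_quant_shape_from_byte_shape` rescales only the last dimension (`bytes // type_size * block_size`); the helper's real implementation is not shown in this diff, and the toy `GGML_QUANT_SIZES` entry is an assumption for Q4_0.

# Minimal sketch, not the library code. Q4_0 packs block_size=32 weights into
# type_size=18 bytes (a 2-byte scale plus 16 bytes of 4-bit values).
GGML_QUANT_SIZES = {"Q4_0": (32, 18)}  # toy stand-in for the library's table

def _quant_shape_from_byte_shape(shape, type_size, block_size):
    # Each type_size-byte block in the last dimension encodes block_size weights.
    return (*shape[:-1], shape[-1] // type_size * block_size)

block_size, type_size = GGML_QUANT_SIZES["Q4_0"]
byte_shape = (3072, 1728)  # what .shape reports for a packed Q4_0 weight
print(_quant_shape_from_byte_shape(byte_shape, type_size, block_size))  # -> (3072, 3072)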
