

if is_torch_available() and is_gguf_available():
-    import gguf
    import torch

    from .utils import (
+        GGML_QUANT_SIZES,
        GGUFParameter,
        _quant_shape_from_byte_shape,
        _replace_with_gguf_linear,


class GGUFQuantizer(DiffusersQuantizer):
+    use_keep_in_fp32_modules = True
+
    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

        self.compute_dtype = quantization_config.compute_dtype
        self.pre_quantized = quantization_config.pre_quantized
+        self.modules_to_not_convert = quantization_config.modules_to_not_convert
+
+        if not isinstance(self.modules_to_not_convert, list):
+            self.modules_to_not_convert = [self.modules_to_not_convert]

    def validate_environment(self, *args, **kwargs):
        if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
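The new `modules_to_not_convert` handling accepts either a single module name or a list; a minimal standalone sketch of the normalization performed in `__init__` above (the module name is illustrative):

# Sketch of the normalization added in __init__; "proj_out" is illustrative.
modules_to_not_convert = "proj_out"                 # a config may pass a single name
if not isinstance(modules_to_not_convert, list):
    modules_to_not_convert = [modules_to_not_convert]
assert modules_to_not_convert == ["proj_out"]       # always a list afterwards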
@@ -70,7 +76,7 @@ def check_quantized_param_shape(self, param_name, current_param, loaded_param):
        current_param_shape = current_param.shape
        quant_type = loaded_param.quant_type

-        block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
+        block_size, type_size = GGML_QUANT_SIZES[quant_type]

        inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size)
        if inferred_shape != current_param_shape:
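For context, `GGML_QUANT_SIZES` maps each GGML quant type to a `(block_size, type_size)` pair, and the inferred shape expands the stored byte shape back to the logical weight shape. A worked example, assuming the helper mirrors gguf-py's `quant_shape_from_byte_shape`; the tensor shape is illustrative:

block_size, type_size = 32, 18                       # Q4_0: 32 weights per 18-byte block
loaded_param_shape = (3072, 1728)                    # raw byte shape stored in the GGUF file
inferred_shape = (*loaded_param_shape[:-1], loaded_param_shape[-1] // type_size * block_size)
assert inferred_shape == (3072, 3072)                # must equal current_param.shape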
@@ -96,7 +102,7 @@ def check_if_quantized_param(
    def create_quantized_param(
        self,
        model: "ModelMixin",
-        param_value: "torch.Tensor",
+        param_value: Union["GGUFParameter", "torch.Tensor"],
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
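The widened annotation reflects that pre-quantized GGUF weights arrive as `GGUFParameter` (a tensor subclass carrying `quant_type`, as used in `check_quantized_param_shape` above), while unquantized weights are plain tensors. A rough sketch of how a caller might tell them apart; this helper is hypothetical, not part of this commit:

def _is_gguf_quantized(param_value):
    # GGUFParameter wraps the raw quantized bytes and records its GGML quant type;
    # plain torch.Tensor values fall through to the unquantized path.
    return isinstance(param_value, GGUFParameter) and param_value.quant_type is not None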
@@ -119,7 +125,13 @@ def _process_model_before_weight_loading(
        **kwargs,
    ):
        state_dict = kwargs.get("state_dict", None)
-        _replace_with_gguf_linear(model, self.compute_dtype, state_dict)
+
+        self.modules_to_not_convert.extend(keep_in_fp32_modules)
+        self.modules_to_not_convert = [module for module in self.modules_to_not_convert if module is not None]
+
+        _replace_with_gguf_linear(
+            model, self.compute_dtype, state_dict, modules_to_not_convert=self.modules_to_not_convert
+        )

    def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs):
        return model
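End to end, these changes are exercised when a GGUF checkpoint is loaded with a quantization config; a usage sketch follows, where the checkpoint URL and model class are illustrative rather than part of this commit:

import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

# Illustrative Q4_0 single-file checkpoint; any GGUF checkpoint loads the same way.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q4_0.gguf"
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
# With use_keep_in_fp32_modules = True, any _keep_in_fp32_modules declared on the model
# class are merged into modules_to_not_convert and skipped by _replace_with_gguf_linear.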