
Commit d7f09f2

update
1 parent 428e44b commit d7f09f2

File tree: 4 files changed (+26, -40 lines)


src/diffusers/models/model_loading_utils.py

Lines changed: 4 additions & 9 deletions
@@ -182,8 +182,7 @@ def load_model_dict_into_meta(
     hf_quantizer=None,
     keep_in_fp32_modules=None,
 ) -> List[str]:
-    if hf_quantizer is None:
-        device = device or torch.device("cpu")
+    device = device or torch.device("cpu")
     dtype = dtype or torch.float32
     is_quantized = hf_quantizer is not None

@@ -223,7 +222,7 @@ def load_model_dict_into_meta(
                 and hf_quantizer.pre_quantized
                 and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device)
             ):
-                hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape)
+                hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name], param)
             else:
                 model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
                 raise ValueError(
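
Note on the hunk above: check_quantized_param_shape now receives the parameters themselves rather than their .shape tuples, so a quantizer can read metadata off the loaded tensor (for GGUF, its quant_type). A minimal sketch of the new contract, using a hypothetical toy quantizer rather than the real diffusers classes:

import torch

class ToyQuantizer:
    def check_quantized_param_shape(self, param_name, current_param, loaded_param):
        # Shapes are still available via .shape ...
        print(param_name, tuple(current_param.shape), tuple(loaded_param.shape))
        # ... and so is any metadata carried on a tensor subclass,
        # which a bare shape tuple could not provide.
        print(getattr(loaded_param, "quant_type", None))

ToyQuantizer().check_quantized_param_shape(
    "weight", torch.empty(2, 4, device="meta"), torch.empty(2, 2)
)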
@@ -469,12 +468,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
 
         # if the tensor is a torch supported dtype do not use GGUFParameter
         is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16]
-        weights = torch.from_numpy(tensor.data)
-        parsed_parameters[name] = (
-            GGUFParameter(weights, quant_type=quant_type)
-            if is_gguf_quant
-            else weights.permute(*torch.arange(weights.ndim - 1, -1, -1))
-        )
+        weights = torch.from_numpy(tensor.data.copy())
+        parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights
 
     if len(reader_keys) > 0:
         logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}")

src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py

Lines changed: 4 additions & 1 deletion
@@ -204,7 +204,10 @@ def create_quantized_param(
 
         module._parameters[tensor_name] = new_value
 
-    def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape):
+    def check_quantized_param_shape(self, param_name, current_param, loaded_param):
+        current_param_shape = current_param.shape
+        loaded_param_shape = loaded_param.shape
+
         n = current_param_shape.numel()
         inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1)
         if loaded_param_shape != inferred_shape:
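
The inferred_shape logic reflects how bitsandbytes stores pre-quantized 4-bit weights, as I read it: the tensor is flattened and packed two 4-bit values per byte, giving a ((n + 1) // 2, 1) uint8 buffer, while biases stay unpacked as (n,). A quick sanity check of that arithmetic (sizes illustrative):

# Two 4-bit values per byte: n elements fit in (n + 1) // 2 bytes.
n = 1024 * 1024                      # elements in a (1024, 1024) weight
packed = ((n + 1) // 2, 1)           # shape of the packed buffer
assert packed == (524288, 1)

# An odd element count still rounds up to a whole byte.
assert ((7 + 1) // 2, 1) == (4, 1)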

src/diffusers/quantizers/gguf/gguf_quantizer.py

Lines changed: 18 additions & 9 deletions
@@ -9,18 +9,17 @@
 from ...models.modeling_utils import ModelMixin
 
 from ...utils import (
-    is_accelerate_available,
+    is_gguf_available,
     is_torch_available,
     logging,
 )
 
 
-if is_accelerate_available():
-    pass
-
 if is_torch_available():
     import torch
 
+if is_gguf_available():
+    import gguf
 
 logger = logging.get_logger(__name__)
 
@@ -32,9 +31,20 @@ def __init__(self, quantization_config, **kwargs):
         self.compute_dtype = quantization_config.compute_dtype
         self.pre_quantized = True
 
-    def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape):
-        if _quant_shape_from_byte_shape(loaded_param_shape) == current_param_shape:
-            return True
+    def check_quantized_param_shape(self, param_name, current_param, loaded_param):
+        loaded_param_shape = loaded_param.shape
+        current_param_shape = current_param.shape
+        quant_type = loaded_param.quant_type
+
+        block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
+
+        inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size)
+        if inferred_shape != current_param_shape:
+            raise ValueError(
+                f"{param_name} has an expected quantized shape of: {inferred_shape}, but received shape: {loaded_param_shape}"
+            )
+
+        return True
 
     def check_if_quantized_param(
         self,
@@ -44,8 +54,7 @@ def check_if_quantized_param(
         state_dict: Dict[str, Any],
         **kwargs,
     ) -> bool:
-        module, tensor_name = get_module_from_name(model, param_name)
-        if isinstance(module._parameters.get(tensor_name, None), GGUFParameter):
+        if isinstance(param_value, GGUFParameter):
             return True
 
         return False
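
The new shape check inverts GGUF's byte packing: gguf.GGML_QUANT_SIZES maps each quant type to (block_size, type_size), i.e. elements per block and bytes per block, and _quant_shape_from_byte_shape recovers the logical shape from the stored byte shape. A standalone sketch of that arithmetic, assuming the helper mirrors this logic (the function below is my reconstruction, not the diffusers source):

# For Q4_0, gguf reports block_size=32 (elements per block) and
# type_size=18 (bytes per block: a float16 scale plus 16 bytes of nibbles).
def quant_shape_from_byte_shape(shape, type_size, block_size):
    *rest, last = shape
    return (*rest, last // type_size * block_size)

# A (3072, 3072) Q4_0 weight is stored as 3072 // 32 * 18 = 1728 bytes per row.
assert quant_shape_from_byte_shape((3072, 1728), 18, 32) == (3072, 3072)

The last hunk also simplifies check_if_quantized_param: quantized GGUF weights arrive in the state dict already wrapped as GGUFParameter, so an isinstance check on the incoming param_value replaces probing module._parameters.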

src/diffusers/quantizers/gguf/utils.py

Lines changed: 0 additions & 21 deletions
@@ -17,26 +17,6 @@
 import torch.nn as nn
 
 
-_GGUF_FILE_TYPE_MAPPING = {
-    0: "ALL_F32",
-    1: "MOSTLY_F16",
-    2: "MOSTLY_Q4_0",
-    3: "MOSTLY_Q4_1",
-    4: "MOSTLY_Q4_1_SOME_F16",
-    8: "MOSTLY_Q5_0",
-    9: "MOSTLY_Q5_1",
-    10: "MOSTLY_Q2_K",
-    11: "MOSTLY_Q3_K_S",
-    12: "MOSTLY_Q3_K_M",
-    13: "MOSTLY_Q3_K_L",
-    14: "MOSTLY_Q4_K_S",
-    15: "MOSTLY_Q4_K_M",
-    16: "MOSTLY_Q5_K_S",
-    17: "MOSTLY_Q5_K_M",
-    18: "MOSTLY_Q6_K",
-}
-
-
 def _replace_with_gguf_linear(model, compute_dtype):
     for name, module in model.named_children():
         if isinstance(module, nn.Linear):
@@ -321,7 +301,6 @@ def dequantize_gguf_tensor(tensor, compute_dtype):
 
     block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
 
-    tensor = torch.tensor(tensor)
     tensor = tensor.view(torch.uint8)
     shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size)
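
Dropping torch.tensor(tensor) works because, after the loader change above, dequantize_gguf_tensor already receives a torch tensor, and Tensor.view(torch.uint8) reinterprets the existing bytes without copying. A small check of that no-copy behaviour:

import torch

t = torch.tensor([1.0, 2.0], dtype=torch.float16)
raw = t.view(torch.uint8)                # same storage, 4 uint8 bytes
assert raw.data_ptr() == t.data_ptr()    # no copy was made
assert raw.shape == (4,)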
