huggingface · dxqb · Oct 16, 2025 · Isotr0py · Oct 17, 2025 · dxqb
diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py
@@ -79,7 +79,8 @@
 def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor:
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
-        return x @ qweight.T
+        weight = dequantize_gguf_tensor(qweight)
+        return x @ weight.T
 # Dispatch table: maps each gguf.GGMLQuantizationType member listed below to
 # the dequantize_blocks_* function for that format.
 # NOTE(review): presumably consulted by dequantize_gguf_tensor to select the
 # per-format routine; the lookup site is not visible here -- confirm.
 dequantize_functions = { 
     gguf.GGMLQuantizationType.IQ4_NL: dequantize_blocks_IQ4_NL, 
     gguf.GGMLQuantizationType.IQ4_XS: dequantize_blocks_IQ4_XS, 
     gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, 
     gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, 
     gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, 
     gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, 
     gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, 
     gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, 
     gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, 
     gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, 
     gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, 
     gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, 
     gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, 
 } 
 # Dispatch table: maps each gguf.GGMLQuantizationType member listed below to
 # the dequantize_blocks_* function for that format.
 # NOTE(review): this literal repeats the preceding dequantize_functions
 # assignment entry-for-entry; presumably the same context quoted twice rather
 # than an intentional rebinding -- confirm against the actual file.
 dequantize_functions = { 
     gguf.GGMLQuantizationType.IQ4_NL: dequantize_blocks_IQ4_NL, 
     gguf.GGMLQuantizationType.IQ4_XS: dequantize_blocks_IQ4_XS, 
     gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, 
     gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, 
     gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, 
     gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, 
     gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, 
     gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, 
     gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, 
     gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, 
     gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, 
     gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, 
     gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, 
 } 
 
     # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for
     # contiguous batching and inefficient with diffusers' batching,