From 3ae202ff94a7573382de6f0c8b3aba012e75a61c Mon Sep 17 00:00:00 2001
From: dxqb <183307934+dxqb@users.noreply.github.com>
Date: Thu, 16 Oct 2025 20:05:41 +0200
Subject: [PATCH] GGUF fix for unquantized types when using unquantize kernels

Even if the `qweight_type` is one of the `UNQUANTIZED_TYPES`, `qweight` still
has to be "dequantized" because it is stored as an 8-bit tensor. Without doing
so, the following matmul fails with a shape mismatch.

Side notes:
- Why isn't DIFFUSERS_GGUF_CUDA_KERNELS on by default? It is significantly
  faster and is only used when the kernels are installed.
- https://huggingface.co/Isotr0py/ggml/tree/main/build has no build for
  torch 2.8 (or the upcoming 2.9). Who can we contact to make such a build?
---
 src/diffusers/quantizers/gguf/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py
index 2fba9986e825..adb429688723 100644
--- a/src/diffusers/quantizers/gguf/utils.py
+++ b/src/diffusers/quantizers/gguf/utils.py
@@ -79,7 +79,8 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor:
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
-        return x @ qweight.T
+        weight = dequantize_gguf_tensor(qweight)
+        return x @ weight.T
     # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for
     # contiguous batching and inefficient with diffusers' batching,
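
A minimal sketch (not part of the patch) of the shape mismatch the commit message
describes, assuming an F16 weight whose raw GGUF bytes are exposed as a torch.uint8
tensor. The names w_f16, qweight_u8, and x are illustrative only, and the byte
reinterpretation below stands in for what dequantize_gguf_tensor does for
unquantized types:

    import torch

    # Hypothetical F16 weight as GGUF stores it: raw bytes viewed as uint8.
    w_f16 = torch.randn(4, 8, dtype=torch.float16)   # logical shape (out=4, in=8)
    qweight_u8 = w_f16.view(torch.uint8)             # storage shape (4, 16): 2 bytes per fp16 value

    x = torch.randn(2, 8, dtype=torch.float16)

    # What the old code effectively did: (2, 8) @ (16, 4) raises a shape (and dtype) error.
    # x @ qweight_u8.T

    # What the fix does conceptually: reinterpret the bytes as fp16 before the matmul.
    weight = qweight_u8.view(torch.float16)          # back to (4, 8)
    out = x.float() @ weight.float().T               # float32 here only for CPU portability
    print(out.shape)                                 # torch.Size([2, 4])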