From 3ae202ff94a7573382de6f0c8b3aba012e75a61c Mon Sep 17 00:00:00 2001
From: dxqb <183307934+dxqb@users.noreply.github.com>
Date: Thu, 16 Oct 2025 20:05:41 +0200
Subject: [PATCH] GGUF fix for unquantized types when using unquantize kernels

Even if the `qweight_type` is one of the `UNQUANTIZED_TYPES`, `qweight` still
has to be "dequantized" because it is stored as an 8-bit tensor. Without doing
so, the following matmul fails with a shape mismatch.

Side notes:
- Why isn't DIFFUSERS_GGUF_CUDA_KERNELS on by default? It is significantly
  faster and is only used when the kernels are installed.
- https://huggingface.co/Isotr0py/ggml/tree/main/build has no build for
  torch 2.8 (or the upcoming 2.9). Who can we contact to make such a build?
---
 src/diffusers/quantizers/gguf/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py
index 2fba9986e825..adb429688723 100644
--- a/src/diffusers/quantizers/gguf/utils.py
+++ b/src/diffusers/quantizers/gguf/utils.py
@@ -79,7 +79,8 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor:
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
-        return x @ qweight.T
+        weight = dequantize_gguf_tensor(qweight)
+        return x @ weight.T
     # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for
     # contiguous batching and inefficient with diffusers' batching,
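
A minimal sketch (not part of the patch) of the shape mismatch the commit message
describes, assuming an F16 weight whose raw GGUF bytes are exposed as a torch.uint8
tensor. The names w_f16, qweight_u8, and x are illustrative only, and the byte
reinterpretation below stands in for what dequantize_gguf_tensor does for
unquantized types:

    import torch

    # Hypothetical F16 weight as GGUF stores it: raw bytes viewed as uint8.
    w_f16 = torch.randn(4, 8, dtype=torch.float16)   # logical shape (out=4, in=8)
    qweight_u8 = w_f16.view(torch.uint8)             # storage shape (4, 16): 2 bytes per fp16 value

    x = torch.randn(2, 8, dtype=torch.float16)

    # What the old code effectively did: (2, 8) @ (16, 4) raises a shape (and dtype) error.
    # x @ qweight_u8.T

    # What the fix does conceptually: reinterpret the bytes as fp16 before the matmul.
    weight = qweight_u8.view(torch.float16)          # back to (4, 8)
    out = x.float() @ weight.float().T               # float32 here only for CPU portability
    print(out.shape)                                 # torch.Size([2, 4])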