Skip to content

Commit 5b6c988

Browse files
committed
update mma branch
Signed-off-by: Isotr0py <[email protected]>
1 parent 23ebbb4 commit 5b6c988

File tree

1 file changed: +8 additions, −11 deletions

src/diffusers/quantizers/gguf/utils.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@
     and torch.cuda.is_available()
     and torch.cuda.get_device_capability()[0] >= 7
 )
+is_int8_tensor_core_available = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
+
 if can_use_cuda_kernels and is_kernels_available():
     from kernels import get_kernel
 
-    ops = get_kernel("Isotr0py/ggml")
+    ops = get_kernel("Isotr0py/ggml", revision="mma-standard")
 else:
     ops = None
 

@@ -81,17 +83,12 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
     if qweight_type in UNQUANTIZED_TYPES:
         return x @ qweight.T
 
-    # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for
-    # contiguous batching and inefficient with diffusers' batching,
-    # so we disabled it now.
-
-    # elif qweight_type in MMVQ_QUANT_TYPES:
-    #     y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
-    # elif qweight_type in MMQ_QUANT_TYPES:
-    #     y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
-
+    # For best performance, we only use MMQ kernels with int8 MMA
+    # implementation for Ampere and newer architectures.
+    if qweight_type in MMQ_QUANT_TYPES and is_int8_tensor_core_available:
+        y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
     # If there is no available MMQ kernel, fallback to dequantize
-    if qweight_type in DEQUANT_TYPES:
+    elif qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
         weight = ops.ggml_dequantize(qweight, qweight_type, *shape)

0 commit comments

Comments (0)