
Commit bc4016d

Merge branch 'main' into enable-compilation
2 parents: 4ffc993 + 69cdc25

10 files changed: +363 additions, −111 deletions


.github/workflows/nightly_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -333,7 +333,7 @@ jobs:
           additional_deps: ["peft"]
         - backend: "gguf"
           test_location: "gguf"
-          additional_deps: ["peft"]
+          additional_deps: ["peft", "kernels"]
         - backend: "torchao"
           test_location: "torchao"
           additional_deps: []

docs/source/en/quantization/gguf.md

Lines changed: 10 additions & 0 deletions
@@ -53,6 +53,16 @@ image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
 image.save("flux-gguf.png")
 ```
 
+## Using Optimized CUDA Kernels with GGUF
+
+Optimized CUDA kernels can accelerate GGUF quantized model inference by approximately 10%. This functionality requires a compatible GPU with `torch.cuda.get_device_capability` greater than 7 and the kernels library:
+
+```shell
+pip install -U kernels
+```
+
+Once installed, set `DIFFUSERS_GGUF_CUDA_KERNELS=true` to use optimized kernels when available. Note that CUDA kernels may introduce minor numerical differences compared to the original GGUF implementation, potentially causing subtle visual variations in generated images. To disable CUDA kernel usage, set the environment variable `DIFFUSERS_GGUF_CUDA_KERNELS=false`.
+
 ## Supported Quantization Types
 
 - BF16
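
The documentation added above can be exercised end to end roughly as follows. This sketch is not part of the commit: the checkpoint URL is illustrative, and the environment variable is set before importing diffusers because the module-level check added in src/diffusers/quantizers/gguf/utils.py (further down in this commit) reads it at import time.

```python
import os

# Assumption: the flag must be set before diffusers imports the GGUF utilities,
# since the new `can_use_cuda_kernels` check runs at module import time.
os.environ["DIFFUSERS_GGUF_CUDA_KERNELS"] = "true"

import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

# Illustrative checkpoint; any GGUF-quantized Flux transformer is loaded the same way.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.to("cuda")

prompt = "A cat holding a sign that says hello world"
image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
image.save("flux-gguf.png")
```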

src/diffusers/hooks/group_offloading.py

Lines changed: 89 additions & 104 deletions
Large diffs are not rendered by default.

src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py

Lines changed: 7 additions & 2 deletions
@@ -636,6 +636,11 @@ def __call__(
         if self.attention_kwargs is None:
             self._attention_kwargs = {}
 
+        txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None
+        negative_txt_seq_lens = (
+            negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None
+        )
+
         # 6. Denoising loop
         self.scheduler.set_begin_index(0)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -654,7 +659,7 @@ def __call__(
                     encoder_hidden_states_mask=prompt_embeds_mask,
                     encoder_hidden_states=prompt_embeds,
                     img_shapes=img_shapes,
-                    txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(),
+                    txt_seq_lens=txt_seq_lens,
                     attention_kwargs=self.attention_kwargs,
                     return_dict=False,
                 )[0]
@@ -668,7 +673,7 @@ def __call__(
                     encoder_hidden_states_mask=negative_prompt_embeds_mask,
                     encoder_hidden_states=negative_prompt_embeds,
                     img_shapes=img_shapes,
-                    txt_seq_lens=negative_prompt_embeds_mask.sum(dim=1).tolist(),
+                    txt_seq_lens=negative_txt_seq_lens,
                     attention_kwargs=self.attention_kwargs,
                     return_dict=False,
                 )[0]

src/diffusers/quantizers/gguf/utils.py

Lines changed: 92 additions & 3 deletions
@@ -12,15 +12,15 @@
 # # See the License for the specific language governing permissions and
 # # limitations under the License.
 
-
 import inspect
+import os
 from contextlib import nullcontext
 
 import gguf
 import torch
 import torch.nn as nn
 
-from ...utils import is_accelerate_available
+from ...utils import is_accelerate_available, is_kernels_available
 
 
 if is_accelerate_available():
@@ -29,6 +29,82 @@
     from accelerate.hooks import add_hook_to_module, remove_hook_from_module
 
 
+can_use_cuda_kernels = (
+    os.getenv("DIFFUSERS_GGUF_CUDA_KERNELS", "false").lower() in ["1", "true", "yes"]
+    and torch.cuda.is_available()
+    and torch.cuda.get_device_capability()[0] >= 7
+)
+if can_use_cuda_kernels and is_kernels_available():
+    from kernels import get_kernel
+
+    ops = get_kernel("Isotr0py/ggml")
+else:
+    ops = None
+
+UNQUANTIZED_TYPES = {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16}
+STANDARD_QUANT_TYPES = {
+    gguf.GGMLQuantizationType.Q4_0,
+    gguf.GGMLQuantizationType.Q4_1,
+    gguf.GGMLQuantizationType.Q5_0,
+    gguf.GGMLQuantizationType.Q5_1,
+    gguf.GGMLQuantizationType.Q8_0,
+    gguf.GGMLQuantizationType.Q8_1,
+}
+KQUANT_TYPES = {
+    gguf.GGMLQuantizationType.Q2_K,
+    gguf.GGMLQuantizationType.Q3_K,
+    gguf.GGMLQuantizationType.Q4_K,
+    gguf.GGMLQuantizationType.Q5_K,
+    gguf.GGMLQuantizationType.Q6_K,
+}
+IMATRIX_QUANT_TYPES = {
+    gguf.GGMLQuantizationType.IQ1_M,
+    gguf.GGMLQuantizationType.IQ1_S,
+    gguf.GGMLQuantizationType.IQ2_XXS,
+    gguf.GGMLQuantizationType.IQ2_XS,
+    gguf.GGMLQuantizationType.IQ2_S,
+    gguf.GGMLQuantizationType.IQ3_XXS,
+    gguf.GGMLQuantizationType.IQ3_S,
+    gguf.GGMLQuantizationType.IQ4_XS,
+    gguf.GGMLQuantizationType.IQ4_NL,
+}
+# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization.
+# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add
+# MMQ kernel for I-Matrix quantization.
+DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
+MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
+MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
+
+
+def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor:
+    # there is no need to call any kernel for fp16/bf16
+    if qweight_type in UNQUANTIZED_TYPES:
+        return x @ qweight.T
+
+    # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for
+    # contiguous batching and inefficient with diffusers' batching,
+    # so we disabled it now.
+
+    # elif qweight_type in MMVQ_QUANT_TYPES:
+    #     y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
+    # elif qweight_type in MMQ_QUANT_TYPES:
+    #     y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])

+    # If there is no available MMQ kernel, fallback to dequantize
+    if qweight_type in DEQUANT_TYPES:
+        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
+        shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
+        weight = ops.ggml_dequantize(qweight, qweight_type, *shape)
+        y = x @ weight.to(x.dtype).T
+    else:
+        # Raise an error if the quantization type is not supported.
+        # Might be useful if llama.cpp adds a new quantization type.
+        # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
+        qweight_type = gguf.GGMLQuantizationType(qweight_type)
+        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
+    return y.as_tensor()
+
+
 # Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook
 def _create_accelerate_new_hook(old_hook):
     r"""
@@ -451,11 +527,24 @@ def __init__(
     ) -> None:
         super().__init__(in_features, out_features, bias, device)
         self.compute_dtype = compute_dtype
+        self.device = device
+
+    def forward(self, inputs: torch.Tensor):
+        if ops is not None and self.weight.is_cuda and inputs.is_cuda:
+            return self.forward_cuda(inputs)
+        return self.forward_native(inputs)
 
-    def forward(self, inputs):
+    def forward_native(self, inputs: torch.Tensor):
         weight = dequantize_gguf_tensor(self.weight)
         weight = weight.to(self.compute_dtype)
         bias = self.bias.to(self.compute_dtype) if self.bias is not None else None
 
         output = torch.nn.functional.linear(inputs, weight, bias)
         return output
+
+    def forward_cuda(self, inputs: torch.Tensor):
+        quant_type = self.weight.quant_type
+        output = _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type)
+        if self.bias is not None:
+            output += self.bias.to(self.compute_dtype)
+        return output
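
As a side note on the dequantize fallback in `_fused_mul_mat_gguf` above: a GGUF-packed weight stores each row as blocks of `type_size` bytes covering `block_size` elements, and the fallback inverts that packing before the matmul. A small worked sketch, not part of the commit; the tensor sizes are made up and Q8_0 is only an example whose block and type sizes come from gguf's `GGML_QUANT_SIZES` table:

```python
import gguf

# Q8_0 is chosen purely for illustration; block/type sizes come from gguf's table.
qtype = gguf.GGMLQuantizationType.Q8_0
block_size, type_size = gguf.GGML_QUANT_SIZES[qtype]

# A packed 2D weight of logical shape (out_features, in_features) is stored as
# (out_features, in_features // block_size * type_size) bytes per row.
out_features, in_features = 64, 128
packed_cols = in_features // block_size * type_size

# The fallback recovers the logical width the same way the commit does:
#   shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
dequant_cols = packed_cols // type_size * block_size
assert dequant_cols == in_features
print(qtype.name, block_size, type_size, packed_cols, dequant_cols)
```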

src/diffusers/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@
     is_invisible_watermark_available,
     is_k_diffusion_available,
     is_k_diffusion_version,
+    is_kernels_available,
     is_librosa_available,
     is_matplotlib_available,
     is_nltk_available,

src/diffusers/utils/import_utils.py

Lines changed: 5 additions & 0 deletions
@@ -192,6 +192,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b
 _torch_npu_available, _torch_npu_version = _is_package_available("torch_npu")
 _transformers_available, _transformers_version = _is_package_available("transformers")
 _hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub")
+_kernels_available, _kernels_version = _is_package_available("kernels")
 _inflect_available, _inflect_version = _is_package_available("inflect")
 _unidecode_available, _unidecode_version = _is_package_available("unidecode")
 _k_diffusion_available, _k_diffusion_version = _is_package_available("k_diffusion")
@@ -277,6 +278,10 @@ def is_accelerate_available():
     return _accelerate_available
 
 
+def is_kernels_available():
+    return _kernels_available
+
+
 def is_k_diffusion_available():
     return _k_diffusion_available
 

src/diffusers/utils/testing_utils.py

Lines changed: 13 additions & 0 deletions
@@ -36,6 +36,7 @@
     is_compel_available,
     is_flax_available,
     is_gguf_available,
+    is_kernels_available,
     is_note_seq_available,
     is_onnx_available,
     is_opencv_available,
@@ -634,6 +635,18 @@ def decorator(test_case):
     return decorator
 
 
+def require_kernels_version_greater_or_equal(kernels_version):
+    def decorator(test_case):
+        correct_kernels_version = is_kernels_available() and version.parse(
+            version.parse(importlib.metadata.version("kernels")).base_version
+        ) >= version.parse(kernels_version)
+        return unittest.skipUnless(
+            correct_kernels_version, f"Test requires kernels with version greater than {kernels_version}."
+        )(test_case)
+
+    return decorator
+
+
 def deprecate_after_peft_backend(test_case):
     """
     Decorator marking a test that will be skipped after PEFT backend
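
A hypothetical usage sketch for the new `require_kernels_version_greater_or_equal` decorator; it is not part of the commit, and the minimum version string is a placeholder:

```python
import unittest

from diffusers.utils.testing_utils import require_kernels_version_greater_or_equal


# Skips the whole class unless the `kernels` package is installed at or above
# the (illustrative) minimum version.
@require_kernels_version_greater_or_equal("0.9.0")
class GGUFCudaKernelSmokeTests(unittest.TestCase):
    def test_kernel_module_importable(self):
        from kernels import get_kernel  # only runs when `kernels` is available

        self.assertIsNotNone(get_kernel)
```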

tests/hooks/test_group_offloading.py

Lines changed: 87 additions & 0 deletions
@@ -17,7 +17,9 @@
 import unittest
 
 import torch
+from parameterized import parameterized
 
+from diffusers.hooks import HookRegistry, ModelHook
 from diffusers.models import ModelMixin
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.utils import get_logger
@@ -99,6 +101,29 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
+# Test for https://github.com/huggingface/diffusers/pull/12077
+class DummyModelWithLayerNorm(ModelMixin):
+    def __init__(self, in_features: int, hidden_features: int, out_features: int, num_layers: int) -> None:
+        super().__init__()
+
+        self.linear_1 = torch.nn.Linear(in_features, hidden_features)
+        self.activation = torch.nn.ReLU()
+        self.blocks = torch.nn.ModuleList(
+            [DummyBlock(hidden_features, hidden_features, hidden_features) for _ in range(num_layers)]
+        )
+        self.layer_norm = torch.nn.LayerNorm(hidden_features, elementwise_affine=True)
+        self.linear_2 = torch.nn.Linear(hidden_features, out_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.linear_1(x)
+        x = self.activation(x)
+        for block in self.blocks:
+            x = block(x)
+        x = self.layer_norm(x)
+        x = self.linear_2(x)
+        return x
+
+
 class DummyPipeline(DiffusionPipeline):
     model_cpu_offload_seq = "model"
 
@@ -113,6 +138,16 @@ def __call__(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
+class LayerOutputTrackerHook(ModelHook):
+    def __init__(self):
+        super().__init__()
+        self.outputs = []
+
+    def post_forward(self, module, output):
+        self.outputs.append(output)
+        return output
+
+
 @require_torch_accelerator
 class GroupOffloadTests(unittest.TestCase):
     in_features = 64
@@ -258,6 +293,7 @@ def test_error_raised_if_group_offloading_applied_on_sequential_offloaded_module
     def test_block_level_stream_with_invocation_order_different_from_initialization_order(self):
         if torch.device(torch_device).type not in ["cuda", "xpu"]:
             return
+
         model = DummyModelWithMultipleBlocks(
             in_features=self.in_features,
             hidden_features=self.hidden_features,
@@ -274,3 +310,54 @@ def test_block_level_stream_with_invocation_order_different_from_initialization_
 
         with context:
             model(self.input)
+
+    @parameterized.expand([("block_level",), ("leaf_level",)])
+    def test_block_level_offloading_with_parameter_only_module_group(self, offload_type: str):
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
+            for name, module in model.named_modules():
+                registry = HookRegistry.check_if_exists_or_initialize(module)
+                hook = LayerOutputTrackerHook()
+                registry.register_hook(hook, "layer_output_tracker")
+
+        model_ref = DummyModelWithLayerNorm(128, 256, 128, 2)
+        model = DummyModelWithLayerNorm(128, 256, 128, 2)
+
+        model.load_state_dict(model_ref.state_dict(), strict=True)
+
+        model_ref.to(torch_device)
+        model.enable_group_offload(torch_device, offload_type=offload_type, num_blocks_per_group=1, use_stream=True)
+
+        apply_layer_output_tracker_hook(model_ref)
+        apply_layer_output_tracker_hook(model)
+
+        x = torch.randn(2, 128).to(torch_device)
+
+        out_ref = model_ref(x)
+        out = model(x)
+        self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match.")
+
+        num_repeats = 4
+        for i in range(num_repeats):
+            out_ref = model_ref(x)
+            out = model(x)
+
+        self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match after multiple invocations.")
+
+        for (ref_name, ref_module), (name, module) in zip(model_ref.named_modules(), model.named_modules()):
+            assert ref_name == name
+            ref_outputs = (
+                HookRegistry.check_if_exists_or_initialize(ref_module).get_hook("layer_output_tracker").outputs
+            )
+            outputs = HookRegistry.check_if_exists_or_initialize(module).get_hook("layer_output_tracker").outputs
+            cumulated_absmax = 0.0
+            for i in range(len(outputs)):
+                diff = ref_outputs[0] - outputs[i]
+                absdiff = diff.abs()
+                absmax = absdiff.max().item()
+                cumulated_absmax += absmax
+            self.assertLess(
+                cumulated_absmax, 1e-5, f"Output differences for {name} exceeded threshold: {cumulated_absmax:.5f}"
+            )
