
Commit 076104a

Merge branch 'main' into main
2 parents daddd87 + 751e250

79 files changed: +1591 -147 lines changed (large commit; only some of the changed files are shown below)

docs/source/en/api/pipelines/cogvideox.md

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ from diffusers.utils import export_to_video
 pipeline_quant_config = PipelineQuantizationConfig(
     quant_backend="torchao",
     quant_kwargs={"quant_type": "int8wo"},
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 # fp8 layerwise weight-casting

docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 3 additions & 3 deletions

@@ -54,7 +54,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(

@@ -91,7 +91,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(

@@ -139,7 +139,7 @@ export_to_video(video, "output.mp4", fps=15)
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(

docs/source/en/optimization/memory.md

Lines changed: 46 additions & 3 deletions

@@ -291,13 +291,53 @@ Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://
 > [!WARNING]
 > Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.

-Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
-
-The `offload_type` parameter can be set to `block_level` or `leaf_level`.
+Enable group offloading by configuring the `offload_type` parameter to `block_level` or `leaf_level`.

 - `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
 - `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.

+Group offloading is supported for entire pipelines or individual models. Applying group offloading to the entire pipeline is the easiest option while selectively applying it to individual models gives users more flexibility to use different offloading techniques for different models.
+
+<hfoptions id="group-offloading">
+<hfoption id="pipeline">
+
+Call [`~DiffusionPipeline.enable_group_offload`] on a pipeline.
+
+```py
+import torch
+from diffusers import CogVideoXPipeline
+from diffusers.hooks import apply_group_offloading
+from diffusers.utils import export_to_video
+
+onload_device = torch.device("cuda")
+offload_device = torch.device("cpu")
+
+pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
+pipeline.enable_group_offload(
+    onload_device=onload_device,
+    offload_device=offload_device,
+    offload_type="leaf_level",
+    use_stream=True
+)
+
+prompt = (
+    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
+    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
+    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
+    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
+    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
+    "atmosphere of this unique musical performance."
+)
+video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+export_to_video(video, "output.mp4", fps=8)
+```
+
+</hfoption>
+<hfoption id="model">
+
+Call [`~ModelMixin.enable_group_offload`] on standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
+
 ```py
 import torch
 from diffusers import CogVideoXPipeline

@@ -328,6 +368,9 @@ print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} G
 export_to_video(video, "output.mp4", fps=8)
 ```

+</hfoption>
+</hfoptions>
+
 #### CUDA stream

 The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
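
Note (not part of the commit): the "model" tab's code block is mostly elided by the diff context above, so here is a minimal sketch of what selectively offloading individual models looks like, assuming the `enable_group_offload` / `apply_group_offloading` APIs named in the surrounding docs. The choice of components and offload settings is illustrative only.

```py
# Illustrative sketch, not part of the commit: per-model group offloading.
# Assumes diffusers' enable_group_offload / apply_group_offloading APIs as
# referenced in the docs above; components and settings are only examples.
import torch
from diffusers import CogVideoXPipeline
from diffusers.hooks import apply_group_offloading

onload_device = torch.device("cuda")
offload_device = torch.device("cpu")

pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

# The transformer inherits from ModelMixin, so it exposes enable_group_offload directly.
pipeline.transformer.enable_group_offload(
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="leaf_level",
    use_stream=True,
)

# The text encoder is a plain torch.nn.Module (not ModelMixin), so use apply_group_offloading.
apply_group_offloading(
    pipeline.text_encoder,
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="block_level",
    num_blocks_per_group=2,
)
```

Here `leaf_level` with `use_stream=True` keeps the transformer's transfers overlapped with compute, while `block_level` with `num_blocks_per_group=2` trades a little speed for lower memory on the encoder.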

docs/source/en/quantization/overview.md

Lines changed: 4 additions & 1 deletion

@@ -34,7 +34,9 @@ Initialize [`~quantizers.PipelineQuantizationConfig`] with the following paramet
 > [!TIP]
 > These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.

-- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
+- `components_to_quantize` specifies which component(s) of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
+
+`components_to_quantize` accepts either a list for multiple models or a string for a single model.

 The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`], `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.

@@ -62,6 +64,7 @@ pipe = DiffusionPipeline.from_pretrained(
 image = pipe("photo of a cute dog").images[0]
 ```

+
 ### Advanced quantization

 The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
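
Note (not part of the diff): a minimal sketch of the two accepted forms described by the changed overview text. The Flux model ID and the `text_encoder_2` component name follow the FluxPipeline example the overview refers to and are illustrative.

```py
# Illustrative sketch, not part of the commit: `components_to_quantize`
# accepts a string for one component or a list for several.
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

# String form: quantize only the transformer.
single = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
    components_to_quantize="transformer",
)

# List form: quantize the transformer and the T5 text encoder (text_encoder_2 in FluxPipeline).
multiple = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
    components_to_quantize=["transformer", "text_encoder_2"],
)

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=multiple,
    torch_dtype=torch.bfloat16,
)
```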

docs/source/en/using-diffusers/text-img2vid.md

Lines changed: 1 addition & 1 deletion

@@ -98,7 +98,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(

examples/community/pipeline_faithdiff_stable_diffusion_xl.py

Lines changed: 12 additions & 0 deletions

@@ -1705,6 +1705,12 @@ def enable_vae_tiling(self):
         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
         processing larger images.
         """
+        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
+        deprecate(
+            "enable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_tiling()
         self.unet.denoise_encoder.enable_tiling()

@@ -1713,6 +1719,12 @@ def disable_vae_tiling(self):
         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
+        deprecate(
+            "disable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_tiling()
         self.unet.denoise_encoder.disable_tiling()
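
Note (not part of the commit): the same deprecation pattern repeats in the community pipelines below. As the added lines show, the wrappers emit the warning and then delegate to the VAE; the migration path the messages point to is calling the toggles on the VAE component directly. A minimal sketch, with an illustrative pipeline choice:

```py
# Illustrative sketch, not from the commit: call tiling/slicing toggles on the
# VAE directly instead of the deprecated pipeline-level wrappers.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)

# Instead of the deprecated pipe.enable_vae_tiling() / pipe.enable_vae_slicing():
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

# ...and to turn them back off:
pipe.vae.disable_tiling()
pipe.vae.disable_slicing()
```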

examples/community/pipeline_flux_kontext_multiple_images.py

Lines changed: 13 additions & 0 deletions

@@ -35,6 +35,7 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,

@@ -643,6 +644,12 @@ def enable_vae_tiling(self):
         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
         processing larger images.
         """
+        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
+        deprecate(
+            "enable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_tiling()

     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling

@@ -651,6 +658,12 @@ def disable_vae_tiling(self):
         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
+        deprecate(
+            "disable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_tiling()

     def preprocess_image(self, image: PipelineImageInput, _auto_resize: bool, multiple_of: int) -> torch.Tensor:

examples/community/pipeline_flux_rf_inversion.py

Lines changed: 25 additions & 0 deletions

@@ -30,6 +30,7 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,

@@ -526,13 +527,25 @@ def enable_vae_slicing(self):
         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
         """
+        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
+        deprecate(
+            "enable_vae_slicing",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_slicing()

     def disable_vae_slicing(self):
         r"""
         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
+        deprecate(
+            "disable_vae_slicing",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_slicing()

     def enable_vae_tiling(self):

@@ -541,13 +554,25 @@ def enable_vae_tiling(self):
         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
         processing larger images.
         """
+        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
+        deprecate(
+            "enable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_tiling()

     def disable_vae_tiling(self):
         r"""
         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
+        deprecate(
+            "disable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_tiling()

     def prepare_latents_inversion(

examples/community/pipeline_flux_semantic_guidance.py

Lines changed: 13 additions & 0 deletions

@@ -35,6 +35,7 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,

@@ -702,6 +703,12 @@ def enable_vae_tiling(self):
         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
         processing larger images.
         """
+        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
+        deprecate(
+            "enable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_tiling()

     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling

@@ -710,6 +717,12 @@ def disable_vae_tiling(self):
         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
+        deprecate(
+            "disable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_tiling()

     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents

examples/community/pipeline_flux_with_cfg.py

Lines changed: 25 additions & 0 deletions

@@ -28,6 +28,7 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,

@@ -503,13 +504,25 @@ def enable_vae_slicing(self):
         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
         """
+        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
+        deprecate(
+            "enable_vae_slicing",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_slicing()

     def disable_vae_slicing(self):
         r"""
         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
+        deprecate(
+            "disable_vae_slicing",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_slicing()

     def enable_vae_tiling(self):

@@ -518,13 +531,25 @@ def enable_vae_tiling(self):
         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
         processing larger images.
         """
+        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
+        deprecate(
+            "enable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.enable_tiling()

     def disable_vae_tiling(self):
         r"""
         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
         computing decoding in one step.
         """
+        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
+        deprecate(
+            "disable_vae_tiling",
+            "0.40.0",
+            depr_message,
+        )
         self.vae.disable_tiling()

     def prepare_latents(
