
Commit a018ee1

Merge branch 'main' into group-offloading-with-disk
2 parents 8029cd7 + 00b179f commit a018ee1


46 files changed (+521 -132 lines)

docs/source/en/quantization/bitsandbytes.md

Lines changed: 39 additions & 0 deletions
@@ -416,6 +416,45 @@ text_encoder_2_4bit.dequantize()
 transformer_4bit.dequantize()
 ```
 
+## torch.compile
+
+Speed up inference with `torch.compile`. Make sure you have the latest `bitsandbytes` installed; we also recommend installing [PyTorch nightly](https://pytorch.org/get-started/locally/).
+
+<hfoptions id="bnb">
+<hfoption id="8-bit">
+
+```py
+torch._dynamo.config.capture_dynamic_output_shape_ops = True
+
+quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
+transformer_8bit = AutoModel.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    subfolder="transformer",
+    quantization_config=quant_config,
+    torch_dtype=torch.float16,
+)
+transformer_8bit.compile(fullgraph=True)
+```
+
+</hfoption>
+<hfoption id="4-bit">
+
+```py
+quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
+transformer_4bit = AutoModel.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    subfolder="transformer",
+    quantization_config=quant_config,
+    torch_dtype=torch.float16,
+)
+transformer_4bit.compile(fullgraph=True)
+```
+
+</hfoption>
+</hfoptions>
+
+On an RTX 4090 with compilation, 4-bit Flux generation completed in 25.809 seconds versus 32.570 seconds without it.
+
+Check out the [benchmarking script](https://gist.github.com/sayakpaul/0db9d8eeeb3d2a0e5ed7cf0d9ca19b7d) for more details.
+
 ## Resources
 
 * [End-to-end notebook showing Flux.1 Dev inference in a free-tier Colab](https://gist.github.com/sayakpaul/c76bd845b48759e11687ac550b99d8b4)
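For context, a minimal sketch of how the quantized, compiled transformer from the new section above would typically be plugged into a pipeline; it reuses the `transformer_4bit` variable from the 4-bit snippet, and the prompt and generation settings are purely illustrative:

```py
import torch
from diffusers import FluxPipeline

# Reuse the quantized + compiled transformer from the 4-bit example above.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_4bit,
    torch_dtype=torch.float16,
)
pipe.enable_model_cpu_offload()  # keep non-quantized components off the GPU until needed

image = pipe(
    "a photo of a corgi astronaut on the moon",
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
image.save("flux_compiled_4bit.png")
```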

src/diffusers/loaders/lora_pipeline.py

Lines changed: 7 additions & 2 deletions
@@ -81,12 +81,17 @@ def _maybe_dequantize_weight_for_expanded_lora(model, module):
         from ..quantizers.gguf.utils import dequantize_gguf_tensor
 
     is_bnb_4bit_quantized = module.weight.__class__.__name__ == "Params4bit"
+    is_bnb_8bit_quantized = module.weight.__class__.__name__ == "Int8Params"
     is_gguf_quantized = module.weight.__class__.__name__ == "GGUFParameter"
 
     if is_bnb_4bit_quantized and not is_bitsandbytes_available():
         raise ValueError(
             "The checkpoint seems to have been quantized with `bitsandbytes` (4bits). Install `bitsandbytes` to load quantized checkpoints."
         )
+    if is_bnb_8bit_quantized and not is_bitsandbytes_available():
+        raise ValueError(
+            "The checkpoint seems to have been quantized with `bitsandbytes` (8bits). Install `bitsandbytes` to load quantized checkpoints."
+        )
     if is_gguf_quantized and not is_gguf_available():
         raise ValueError(
             "The checkpoint seems to have been quantized with `gguf`. Install `gguf` to load quantized checkpoints."

@@ -97,10 +102,10 @@ def _maybe_dequantize_weight_for_expanded_lora(model, module):
         weight_on_cpu = True
 
     device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
-    if is_bnb_4bit_quantized:
+    if is_bnb_4bit_quantized or is_bnb_8bit_quantized:
         module_weight = dequantize_bnb_weight(
             module.weight.to(device) if weight_on_cpu else module.weight,
-            state=module.weight.quant_state,
+            state=module.weight.quant_state if is_bnb_4bit_quantized else module.state,
             dtype=model.dtype,
         ).data
     elif is_gguf_quantized:
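A minimal, hypothetical sketch of the idea behind the checks above: bitsandbytes stores 4-bit weights as `Params4bit` and 8-bit weights as `Int8Params`, while GGUF-loaded weights use `GGUFParameter`, so inspecting the weight's class name is enough to pick the dequantization path (the helper name below is made up for illustration):

```py
from typing import Optional


def detect_quant_backend(module) -> Optional[str]:
    # Mirrors the class-name checks in _maybe_dequantize_weight_for_expanded_lora.
    weight_cls = module.weight.__class__.__name__
    if weight_cls == "Params4bit":
        return "bnb-4bit"  # quant state lives on the weight: module.weight.quant_state
    if weight_cls == "Int8Params":
        return "bnb-8bit"  # quant state lives on the module: module.state
    if weight_cls == "GGUFParameter":
        return "gguf"
    return None  # plain torch.nn.Parameter, nothing to dequantize
```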

src/diffusers/models/embeddings.py

Lines changed: 1 addition & 3 deletions
@@ -1149,9 +1149,7 @@ def get_1d_rotary_pos_embed(
 
     theta = theta * ntk_factor
     freqs = (
-        1.0
-        / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
-        / linear_factor
+        1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device) / dim)) / linear_factor
     )  # [D/2]
     freqs = torch.outer(pos, freqs)  # type: ignore  # [S, D/2]
     is_npu = freqs.device.type == "npu"
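A quick, self-contained check of why dropping the `[: (dim // 2)]` slice does not change the result: for an even `dim`, `torch.arange(0, dim, 2)` already has exactly `dim // 2` elements, so the slice never removed anything (the value of `dim` below is just an example):

```py
import torch

dim = 128  # example even head dimension; RoPE dimensions are even
idx = torch.arange(0, dim, 2, dtype=torch.float64)

assert idx.numel() == dim // 2              # arange already yields D/2 entries
assert torch.equal(idx, idx[: (dim // 2)])  # so the old slice was a no-op
```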

src/diffusers/models/modeling_utils.py

Lines changed: 32 additions & 3 deletions
@@ -816,14 +816,43 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
+
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```
+
                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.

@@ -1389,7 +1418,7 @@ def _load_pretrained_model(
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,
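Building on the `device_map` docstring examples above, a minimal sketch of combining the automatic strategy with a `max_memory` cap (the two-GPU setup and the memory limits are assumptions for illustration):

```py
import torch
from diffusers import AutoModel

# Let Accelerate place submodules across two GPUs, capping each device at 16GB
# so whatever does not fit spills over to the next device (or to the CPU).
model = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    device_map="auto",
    max_memory={0: "16GB", 1: "16GB"},
    torch_dtype=torch.bfloat16,
)

# The resolved per-module placement is recorded on the loaded model.
print(model.hf_device_map)
```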

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 1 addition & 0 deletions
@@ -898,6 +898,7 @@ def __call__(
         )
 
         # 6. Denoising loop
+        self.scheduler.set_begin_index(0)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
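A minimal sketch of what the added reset guards against; the reuse scenario is hypothetical, but `set_begin_index` is the scheduler hook that strength-based (img2img-style) pipelines use to start part-way into the timestep schedule, so a reused scheduler instance can carry a stale, non-zero begin index into a fresh text-to-image call:

```py
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=28)

scheduler.set_begin_index(12)  # e.g. left behind by an earlier partial-denoising run
scheduler.set_begin_index(0)   # the reset added above: stepping starts from the first timestep again
```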

src/diffusers/pipelines/flux/pipeline_flux_inpaint.py

Lines changed: 5 additions & 0 deletions
@@ -1193,6 +1193,11 @@ def __call__(
             image = self.vae.decode(latents, return_dict=False)[0]
             image = self.image_processor.postprocess(image, output_type=output_type)
 
+            if padding_mask_crop is not None:
+                image = [
+                    self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image
+                ]
+
         # Offload all models
         self.maybe_free_model_hooks()
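A minimal usage sketch of the behavior this adds: with `padding_mask_crop` set, the inpainting pipeline works on a crop around the masked region and, with the overlay call above, pastes the generated crop back onto the original image (the image and mask URLs are placeholders):

```py
import torch
from diffusers import FluxInpaintPipeline
from diffusers.utils import load_image

pipe = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

image = load_image("https://example.com/input.png")  # placeholder
mask = load_image("https://example.com/mask.png")    # placeholder

result = pipe(
    prompt="a small corgi sitting on a park bench",
    image=image,
    mask_image=mask,
    padding_mask_crop=32,  # crop to the mask (plus 32px of context) before denoising
).images[0]
result.save("inpainted.png")
```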

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 5 additions & 8 deletions
@@ -669,14 +669,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn't need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only the "balanced" `device_map` strategy is supported. Check out
+                [this documentation](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to learn more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
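A minimal sketch of the "balanced" strategy this docstring now describes: it assigns whole pipeline components to the available GPUs rather than sharding a single model (the SDXL checkpoint is just an example):

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    device_map="balanced",
    torch_dtype=torch.float16,
)

# Components (UNet, text encoders, VAE, ...) are placed whole on devices;
# the resulting assignment can be inspected after loading.
print(pipeline.hf_device_map)
```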

src/diffusers/pipelines/wan/pipeline_wan.py

Lines changed: 7 additions & 4 deletions
@@ -388,8 +388,10 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):

@@ -434,8 +436,9 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 

src/diffusers/pipelines/wan/pipeline_wan_i2v.py

Lines changed: 4 additions & 6 deletions
@@ -562,12 +562,10 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, *optional*, defaults to `512`):
-                The maximum sequence length of the prompt.
-            shift (`float`, *optional*, defaults to `5.0`):
-                The shift of the flow.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+
         Examples:
 
         Returns:

src/diffusers/pipelines/wan/pipeline_wan_vace.py

Lines changed: 29 additions & 3 deletions
@@ -687,8 +687,33 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                 instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            video (`List[PIL.Image.Image]`, *optional*):
+                The input video or videos to be used as a starting point for the generation. The video should be a list
+                of PIL images, a numpy array, or a torch tensor. Currently, the pipeline only supports generating one
+                video at a time.
+            mask (`List[PIL.Image.Image]`, *optional*):
+                The input mask defines which video regions to condition on and which to generate. Black areas in the
+                mask indicate conditioning regions, while white areas indicate regions for generation. The mask should
+                be a list of PIL images, a numpy array, or a torch tensor. Currently supports generating a single video
+                at a time.
+            reference_images (`List[PIL.Image.Image]`, *optional*):
+                A list of one or more reference images as extra conditioning for the generation. For example, if you
+                are trying to inpaint a video to change the character, you can pass reference images of the new
+                character here. Refer to the Diffusers [examples](https://github.com/huggingface/diffusers/pull/11582)
+                and original [user
+                guide](https://github.com/ali-vilab/VACE/blob/0897c6d055d7d9ea9e191dce763006664d9780f8/UserGuide.md)
+                for a full list of supported tasks and use cases.
+            conditioning_scale (`float`, `List[float]`, `torch.Tensor`, defaults to `1.0`):
+                The conditioning scale to be applied when adding the control conditioning latent stream to the
+                denoising latent stream in each control layer of the model. If a float is provided, it will be applied
+                uniformly to all layers. If a list or tensor is provided, it should have the same length as the number
+                of control layers in the model (`len(transformer.config.vace_layers)`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):

@@ -733,8 +758,9 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
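Finally, a minimal sketch of the truncate-or-pad behavior that the updated `max_sequence_length` docstrings in the three Wan pipelines describe, using the standard tokenizer call those pipelines rely on (the checkpoint name is an assumption for illustration):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="tokenizer")

inputs = tokenizer(
    "a cat walking on grass",
    max_length=512,        # max_sequence_length
    padding="max_length",  # shorter prompts are padded up to 512 tokens
    truncation=True,       # longer prompts are cut off at 512 tokens
    return_tensors="pt",
)
print(inputs.input_ids.shape)  # torch.Size([1, 512])
```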