
Commit 3ed52be

Merge branch 'main' into bnb-compile-docs
2 parents 17f2c18 + 47ef794 commit 3ed52be


42 files changed (+386, -127 lines)

src/diffusers/models/embeddings.py

Lines changed: 1 addition & 3 deletions
@@ -1149,9 +1149,7 @@ def get_1d_rotary_pos_embed(
 
     theta = theta * ntk_factor
     freqs = (
-        1.0
-        / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
-        / linear_factor
+        1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device) / dim)) / linear_factor
     )  # [D/2]
     freqs = torch.outer(pos, freqs)  # type: ignore  # [S, D/2]
     is_npu = freqs.device.type == "npu"
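
Note on the change above: the dropped `[: (dim // 2)]` slice was a no-op, since `torch.arange(0, dim, 2)` already yields exactly `dim // 2` elements for even `dim` (which rotary embeddings assume). A standalone sketch, not the library function:

```py
import torch

dim, theta = 16, 10000.0
idx = torch.arange(0, dim, 2, dtype=torch.float64)
# arange(0, dim, 2) already has dim // 2 elements for even dim,
# so the removed [: (dim // 2)] slice never discarded anything.
assert idx.numel() == dim // 2
inv_freq = 1.0 / (theta ** (idx / dim))  # the [D/2] frequency vector
print(inv_freq.shape)  # torch.Size([8])
```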

src/diffusers/models/modeling_utils.py

Lines changed: 32 additions & 3 deletions
@@ -814,14 +814,43 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
+
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```
+
                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
@@ -1387,7 +1416,7 @@ def _load_pretrained_model(
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,
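
The widened `device_map` annotation means scalar inputs now have to be reconciled with the dict form Accelerate consumes. A hedged sketch of one plausible normalization (the helper name and strategy set are assumptions, not the actual implementation):

```py
import torch
from typing import Dict, Union

DeviceLike = Union[int, str, torch.device]
_STRATEGIES = {"auto", "balanced", "balanced_low_0", "sequential"}

def normalize_device_map(device_map):
    # Hypothetical helper. Strategy strings are handed to Accelerate as-is.
    if isinstance(device_map, str) and device_map in _STRATEGIES:
        return device_map
    # A concrete device (integer ID, "cuda", torch.device) becomes a
    # single-entry dict meaning "place the whole model on this device".
    if isinstance(device_map, (int, str, torch.device)):
        return {"": device_map}
    return device_map

print(normalize_device_map("cuda"))  # {'': 'cuda'}
print(normalize_device_map(0))       # {'': 0}
```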

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 1 addition & 0 deletions
@@ -898,6 +898,7 @@ def __call__(
         )
 
         # 6. Denoising loop
+        self.scheduler.set_begin_index(0)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
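
For context, `set_begin_index(0)` resets the scheduler's internal step counter so each pipeline call starts stepping from the first timestep. A minimal sketch of the call, assuming the scheduler's default config:

```py
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=4)

# If a previous run left a stale begin index behind (e.g. an image-to-image
# call that skipped early timesteps), stepping would resume mid-schedule.
# Resetting makes the denoising loop start from the first timestep again.
scheduler.set_begin_index(0)
```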

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 5 additions & 8 deletions
@@ -669,14 +669,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn’t need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only "balanced" `device_map` is supported. Check out
+                [this](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to know more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
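
A usage sketch of the pipeline-level strategy described above; "balanced" is the only value the docstring promises, and the checkpoint is just an example:

```py
import torch
from diffusers import DiffusionPipeline

# "balanced" distributes whole components (UNet, text encoders, VAE, ...)
# across the available GPUs rather than sharding a single model.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    device_map="balanced",
)
print(pipe.hf_device_map)  # e.g. {'unet': 0, 'text_encoder': 1, ...}
```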

src/diffusers/pipelines/wan/pipeline_wan.py

Lines changed: 7 additions & 4 deletions
@@ -388,8 +388,10 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):
@@ -434,8 +436,9 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 
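
The truncate-or-pad behavior the new `max_sequence_length` docstring describes is the standard tokenizer pattern. A sketch, assuming Wan's UMT5 text encoder and tokenizer checkpoint:

```py
from transformers import AutoTokenizer

# Assumption: Wan pipelines tokenize prompts with a UMT5 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
inputs = tokenizer(
    "a cat surfing a wave at sunset",
    max_length=512,        # longer prompts are truncated to this length
    padding="max_length",  # shorter prompts are padded up to it
    truncation=True,
    return_tensors="pt",
)
print(inputs.input_ids.shape)  # torch.Size([1, 512])
```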

src/diffusers/pipelines/wan/pipeline_wan_i2v.py

Lines changed: 4 additions & 6 deletions
@@ -562,12 +562,10 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, *optional*, defaults to `512`):
-                The maximum sequence length of the prompt.
-            shift (`float`, *optional*, defaults to `5.0`):
-                The shift of the flow.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+
         Examples:
 
         Returns:

src/diffusers/pipelines/wan/pipeline_wan_vace.py

Lines changed: 29 additions & 3 deletions
@@ -687,8 +687,33 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                 instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            video (`List[PIL.Image.Image]`, *optional*):
+                The input video or videos to be used as a starting point for the generation. The video should be a list
+                of PIL images, a numpy array, or a torch tensor. Currently, the pipeline only supports generating one
+                video at a time.
+            mask (`List[PIL.Image.Image]`, *optional*):
+                The input mask defines which video regions to condition on and which to generate. Black areas in the
+                mask indicate conditioning regions, while white areas indicate regions for generation. The mask should
+                be a list of PIL images, a numpy array, or a torch tensor. Currently supports generating a single video
+                at a time.
+            reference_images (`List[PIL.Image.Image]`, *optional*):
+                A list of one or more reference images as extra conditioning for the generation. For example, if you
+                are trying to inpaint a video to change the character, you can pass reference images of the new
+                character here. Refer to the Diffusers [examples](https://github.com/huggingface/diffusers/pull/11582)
+                and original [user
+                guide](https://github.com/ali-vilab/VACE/blob/0897c6d055d7d9ea9e191dce763006664d9780f8/UserGuide.md)
+                for a full list of supported tasks and use cases.
+            conditioning_scale (`float`, `List[float]`, `torch.Tensor`, defaults to `1.0`):
+                The conditioning scale to be applied when adding the control conditioning latent stream to the
+                denoising latent stream in each control layer of the model. If a float is provided, it will be applied
+                uniformly to all layers. If a list or tensor is provided, it should have the same length as the number
+                of control layers in the model (`len(transformer.config.vace_layers)`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):
@@ -733,8 +758,9 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 
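
Since `conditioning_scale` accepts a float, list, or tensor, here is a hedged sketch of the accepted shapes (the layer count 8 is illustrative, standing in for `len(transformer.config.vace_layers)`):

```py
import torch

num_vace_layers = 8  # illustrative stand-in for len(transformer.config.vace_layers)

uniform = 1.0                              # one scale applied to every control layer
per_layer = [1.0] * num_vace_layers        # explicit scale per control layer
tapered = torch.linspace(1.0, 0.4, num_vace_layers)  # weaken control in deeper layers
```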

src/diffusers/pipelines/wan/pipeline_wan_video2video.py

Lines changed: 8 additions & 10 deletions
@@ -419,12 +419,7 @@ def prepare_latents(
         )
 
         if latents is None:
-            if isinstance(generator, list):
-                init_latents = [
-                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
-                ]
-            else:
-                init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+            init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), sample_mode="argmax") for vid in video]
 
             init_latents = torch.cat(init_latents, dim=0).to(dtype)
 
@@ -441,7 +436,7 @@ def prepare_latents(
             if hasattr(self.scheduler, "add_noise"):
                 latents = self.scheduler.add_noise(init_latents, noise, timestep)
             else:
-                latents = self.scheduelr.scale_noise(init_latents, timestep, noise)
+                latents = self.scheduler.scale_noise(init_latents, timestep, noise)
         else:
             latents = latents.to(device)
 
@@ -513,7 +508,7 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                 instead.
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
@@ -530,6 +525,8 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+            strength (`float`, defaults to `0.8`):
+                Higher strength leads to more differences between original image and generated video.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -559,8 +556,9 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 
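
For context on the `prepare_latents` change: `sample_mode="argmax"` makes latent retrieval deterministic, which is why the per-generator branching could be deleted. A simplified sketch of the `retrieve_latents` helper pattern used across Diffusers pipelines:

```py
def retrieve_latents(encoder_output, generator=None, sample_mode="sample"):
    # "sample" draws a random latent from the VAE posterior (a generator makes
    # it reproducible); "argmax" returns the distribution's mode and is fully
    # deterministic, so no generator is needed.
    if sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    raise AttributeError("Could not access latents of provided encoder_output")
```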

src/diffusers/quantizers/quantization_config.py

Lines changed: 12 additions & 10 deletions
@@ -493,7 +493,7 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]]
         TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
         if self.quant_type not in TORCHAO_QUANT_TYPE_METHODS.keys():
             is_floating_quant_type = self.quant_type.startswith("float") or self.quant_type.startswith("fp")
-            if is_floating_quant_type and not self._is_cuda_capability_atleast_8_9():
+            if is_floating_quant_type and not self._is_xpu_or_cuda_capability_atleast_8_9():
                 raise ValueError(
                     f"Requested quantization type: {self.quant_type} is not supported on GPUs with CUDA capability <= 8.9. You "
                     f"can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`."
@@ -645,7 +645,7 @@ def generate_fpx_quantization_types(bits: int):
             QUANTIZATION_TYPES.update(INT8_QUANTIZATION_TYPES)
             QUANTIZATION_TYPES.update(UINTX_QUANTIZATION_DTYPES)
 
-            if cls._is_cuda_capability_atleast_8_9():
+            if cls._is_xpu_or_cuda_capability_atleast_8_9():
                 QUANTIZATION_TYPES.update(FLOATX_QUANTIZATION_TYPES)
 
             return QUANTIZATION_TYPES
@@ -655,14 +655,16 @@ def generate_fpx_quantization_types(bits: int):
             )
 
     @staticmethod
-    def _is_cuda_capability_atleast_8_9() -> bool:
-        if not torch.cuda.is_available():
-            raise RuntimeError("TorchAO requires a CUDA compatible GPU and installation of PyTorch.")
-
-        major, minor = torch.cuda.get_device_capability()
-        if major == 8:
-            return minor >= 9
-        return major >= 9
+    def _is_xpu_or_cuda_capability_atleast_8_9() -> bool:
+        if torch.cuda.is_available():
+            major, minor = torch.cuda.get_device_capability()
+            if major == 8:
+                return minor >= 9
+            return major >= 9
+        elif torch.xpu.is_available():
+            return True
+        else:
+            raise RuntimeError("TorchAO requires a CUDA compatible GPU or Intel XPU and installation of PyTorch.")
 
     def get_apply_tensor_subclass(self):
         TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
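
A standalone restatement of the widened hardware check: float8-style quant types require an NVIDIA GPU of compute capability at least 8.9 (Ada Lovelace or newer) or, after this change, any available Intel XPU. The tuple comparison below mirrors the method's major/minor logic:

```py
import torch

def floatx_quant_supported() -> bool:
    # Equivalent to _is_xpu_or_cuda_capability_atleast_8_9 without the
    # RuntimeError: capability >= (8, 9) accepts 8.9 and every 9.x+ device.
    if torch.cuda.is_available():
        return torch.cuda.get_device_capability() >= (8, 9)
    return hasattr(torch, "xpu") and torch.xpu.is_available()
```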

src/diffusers/utils/import_utils.py

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b
     else:
         logger.info("Disabling PyTorch because USE_TORCH is set")
         _torch_available = False
+        _torch_version = "N/A"
 
 _jax_version = "N/A"
 _flax_version = "N/A"
