
Commit fb59f36

Merge branch 'main' into record-streams
2 parents: d5afea5 + e5c6027

File tree: 21 files changed, +1420 -45 lines

docs/source/en/api/pipelines/wan.md

Lines changed: 394 additions & 11 deletions
Large diffs are not rendered by default.

docs/source/en/installation.md

Lines changed: 7 additions & 5 deletions

@@ -161,10 +161,10 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne
 
 Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
 
-Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `True` and 🤗 Diffusers will only load previously downloaded files in the cache.
+Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
 
 ```shell
-export HF_HUB_OFFLINE=True
+export HF_HUB_OFFLINE=1
 ```
 
 For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.

@@ -179,14 +179,16 @@ Telemetry is only sent when loading models and pipelines from the Hub,
 and it is not collected if you're loading local files.
 
 We understand that not everyone wants to share additional information, and we respect your privacy.
-You can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:
+You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
 
 On Linux/MacOS:
+
 ```bash
-export DISABLE_TELEMETRY=YES
+export HF_HUB_DISABLE_TELEMETRY=1
 ```
 
 On Windows:
+
 ```bash
-set DISABLE_TELEMETRY=YES
+set HF_HUB_DISABLE_TELEMETRY=1
 ```
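If you prefer to configure these switches from Python rather than the shell, here is a minimal sketch (not part of this diff; the model ID is a placeholder, and the variables generally need to be set before `diffusers`/`huggingface_hub` are imported, since they are read at import time):

```python
import os

# Offline mode and telemetry opt-out, mirroring the environment variables above.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

from diffusers import DiffusionPipeline

# With HF_HUB_OFFLINE=1, only previously cached files are used; loading an
# uncached model raises an error instead of hitting the network.
pipe = DiffusionPipeline.from_pretrained("some-org/some-cached-model")  # placeholder model ID
```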

docs/source/en/using-diffusers/loading.md

Lines changed: 17 additions & 0 deletions

@@ -95,6 +95,23 @@ Use the Space below to gauge a pipeline's memory requirements before you downloa
 ></iframe>
 </div>
 
+### Specifying Component-Specific Data Types
+
+You can customize the data types for individual sub-models by passing a dictionary to the `torch_dtype` parameter. This allows you to load different components of a pipeline in different floating point precisions. For instance, if you want to load the transformer with `torch.bfloat16` and all other components with `torch.float16`, you can pass a dictionary mapping:
+
+```python
+from diffusers import HunyuanVideoPipeline
+import torch
+
+pipe = HunyuanVideoPipeline.from_pretrained(
+    "hunyuanvideo-community/HunyuanVideo",
+    torch_dtype={'transformer': torch.bfloat16, 'default': torch.float16},
+)
+print(pipe.transformer.dtype, pipe.vae.dtype)  # (torch.bfloat16, torch.float16)
+```
+
+If a component is not explicitly specified in the dictionary and no `default` is provided, it will be loaded with `torch.float32`.
+
 ### Local pipeline
 
 To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.
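As a small complement to the doc text added above, a sketch (same checkpoint, not part of this diff) of the fallback when no `default` entry is supplied — unspecified components load in `torch.float32`:

```python
import torch
from diffusers import HunyuanVideoPipeline

# Only the transformer dtype is given and there is no "default" entry,
# so every other component falls back to torch.float32.
pipe = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    torch_dtype={"transformer": torch.bfloat16},
)
print(pipe.transformer.dtype, pipe.vae.dtype)  # torch.bfloat16 torch.float32
```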

examples/community/lpw_stable_diffusion_xl.py

Lines changed: 17 additions & 2 deletions

@@ -1773,7 +1773,7 @@ def denoising_value_valid(dnv):
                     f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                     f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                     f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
                     " `pipeline.unet` or your `mask_image` or `image` input."
                 )
         elif num_channels_unet != 4:

@@ -1924,7 +1924,22 @@ def denoising_value_valid(dnv):
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
-            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
 
             # cast back to fp16 if needed
             if needs_upcasting:
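For context, the decode-time change above can be read as a standalone unscale/denormalize step. A minimal sketch (the helper name is illustrative, not part of the pipeline):

```python
import torch

def unscale_latents(latents: torch.Tensor, vae_config) -> torch.Tensor:
    # Mirror of the logic above: denormalize with the VAE's latents_mean/latents_std
    # when both are present in the config, otherwise just divide by the scaling factor.
    has_mean = getattr(vae_config, "latents_mean", None) is not None
    has_std = getattr(vae_config, "latents_std", None) is not None
    if has_mean and has_std:
        mean = torch.tensor(vae_config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
        std = torch.tensor(vae_config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
        # invert the normalization applied when the latents were encoded
        return latents * std / vae_config.scaling_factor + mean
    return latents / vae_config.scaling_factor
```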

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -509,6 +509,7 @@
         "VQDiffusionPipeline",
         "WanImageToVideoPipeline",
         "WanPipeline",
+        "WanVideoToVideoPipeline",
         "WuerstchenCombinedPipeline",
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",

@@ -1062,6 +1063,7 @@
         VQDiffusionPipeline,
         WanImageToVideoPipeline,
         WanPipeline,
+        WanVideoToVideoPipeline,
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,
         WuerstchenPriorPipeline,

src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py

Lines changed: 3 additions & 2 deletions

@@ -105,6 +105,7 @@ def __init__(
         self.width_pad = width_pad
         self.time_pad = time_pad
         self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
+        self.const_padding_conv3d = (0, self.width_pad, self.height_pad)
 
         self.temporal_dim = 2
         self.time_kernel_size = time_kernel_size

@@ -117,6 +118,8 @@ def __init__(
             kernel_size=kernel_size,
             stride=stride,
             dilation=dilation,
+            padding=0 if self.pad_mode == "replicate" else self.const_padding_conv3d,
+            padding_mode="zeros",
         )
 
     def fake_context_parallel_forward(

@@ -137,9 +140,7 @@ def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = Non
         if self.pad_mode == "replicate":
             conv_cache = None
         else:
-            padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
             conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
-            inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
 
         output = self.conv(inputs)
         return output, conv_cache
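The change above folds the constant spatial padding into the convolution itself instead of calling `F.pad` in `forward`. A quick equivalence sketch with made-up sizes (assuming equal height/width padding, and ignoring the causal temporal padding, which is handled separately):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(1, 3, 5, 8, 8)  # (batch, channels, frames, height, width)
pad = 1  # spatial zero-padding on height and width

# Old formulation: zero-pad height/width in forward, then convolve with no padding.
conv_old = nn.Conv3d(3, 4, kernel_size=3, padding=0)
out_old = conv_old(F.pad(x, (pad, pad, pad, pad), mode="constant", value=0))

# New formulation: let Conv3d apply the same constant padding via padding=(depth, height, width).
conv_new = nn.Conv3d(3, 4, kernel_size=3, padding=(0, pad, pad), padding_mode="zeros")
conv_new.load_state_dict(conv_old.state_dict())  # same weights for a fair comparison

print(torch.allclose(out_old, conv_new(x), atol=1e-6))  # True
```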

src/diffusers/models/transformers/latte_transformer_3d.py

Lines changed: 1 addition & 1 deletion

@@ -273,7 +273,7 @@ def forward(
             hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
 
             if i == 0 and num_frame > 1:
-                hidden_states = hidden_states + self.temp_pos_embed
+                hidden_states = hidden_states + self.temp_pos_embed.to(hidden_states.dtype)
 
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 hidden_states = self._gradient_checkpointing_func(
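For context on why the added cast matters: PyTorch type promotion upcasts a half-precision tensor when a float32 buffer is added to it, so without the cast the hidden states would silently become float32. A tiny sketch:

```python
import torch

hidden_states = torch.randn(2, 16, 8, dtype=torch.float16)
temp_pos_embed = torch.randn(1, 16, 8)  # float32 by default

print((hidden_states + temp_pos_embed).dtype)                          # torch.float32 (unwanted upcast)
print((hidden_states + temp_pos_embed.to(hidden_states.dtype)).dtype)  # torch.float16
```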

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -356,7 +356,7 @@
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
     ]
-    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline"]
+    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"]
 try:
     if not is_onnx_available():
         raise OptionalDependencyNotAvailable()

@@ -709,7 +709,7 @@
         UniDiffuserPipeline,
         UniDiffuserTextDecoder,
     )
-    from .wan import WanImageToVideoPipeline, WanPipeline
+    from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
     from .wuerstchen import (
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,

src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py

Lines changed: 7 additions & 5 deletions

@@ -487,19 +487,21 @@ def prepare_latents(
     ) -> torch.Tensor:
         height = height // self.vae_spatial_compression_ratio
         width = width // self.vae_spatial_compression_ratio
-        num_frames = (
-            (num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
-        )
+        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
 
         shape = (batch_size, num_channels_latents, num_frames, height, width)
         mask_shape = (batch_size, 1, num_frames, height, width)
 
         if latents is not None:
-            conditioning_mask = latents.new_zeros(shape)
+            conditioning_mask = latents.new_zeros(mask_shape)
             conditioning_mask[:, :, 0] = 1.0
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
-            )
+            ).squeeze(-1)
+            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
+                raise ValueError(
+                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
+                )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 
         if isinstance(generator, list):

src/diffusers/pipelines/pipeline_loading_utils.py

Lines changed: 12 additions & 2 deletions

@@ -592,6 +592,11 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
             loaded_sub_model = passed_class_obj[name]
 
         else:
+            sub_model_dtype = (
+                torch_dtype.get(name, torch_dtype.get("default", torch.float32))
+                if isinstance(torch_dtype, dict)
+                else torch_dtype
+            )
             loaded_sub_model = _load_empty_model(
                 library_name=library_name,
                 class_name=class_name,

@@ -600,7 +605,7 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
                 is_pipeline_module=is_pipeline_module,
                 pipeline_class=pipeline_class,
                 name=name,
-                torch_dtype=torch_dtype,
+                torch_dtype=sub_model_dtype,
                 cached_folder=kwargs.get("cached_folder", None),
                 force_download=kwargs.get("force_download", None),
                 proxies=kwargs.get("proxies", None),

@@ -616,7 +621,12 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
     # Obtain a sorted dictionary for mapping the model-level components
     # to their sizes.
     module_sizes = {
-        module_name: compute_module_sizes(module, dtype=torch_dtype)[""]
+        module_name: compute_module_sizes(
+            module,
+            dtype=torch_dtype.get(module_name, torch_dtype.get("default", torch.float32))
+            if isinstance(torch_dtype, dict)
+            else torch_dtype,
+        )[""]
         for module_name, module in init_empty_modules.items()
         if isinstance(module, torch.nn.Module)
     }
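The hunks above resolve a per-component dtype when `torch_dtype` is passed as a dict. A standalone sketch of that resolution rule (the helper name is illustrative, not part of the module):

```python
import torch
from typing import Dict, Union

def resolve_sub_model_dtype(torch_dtype: Union[torch.dtype, Dict[str, torch.dtype]], name: str) -> torch.dtype:
    # Same precedence as above: per-component entry, then the "default" entry,
    # then torch.float32; a plain dtype applies to every component.
    if isinstance(torch_dtype, dict):
        return torch_dtype.get(name, torch_dtype.get("default", torch.float32))
    return torch_dtype

print(resolve_sub_model_dtype({"transformer": torch.bfloat16, "default": torch.float16}, "vae"))  # torch.float16
print(resolve_sub_model_dtype({"transformer": torch.bfloat16}, "vae"))                            # torch.float32
```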
