Commit b211eea

Merge branch 'main' into flux_lora_advanced
2 parents: 8bf49c7 + df1d7b0

File tree: 14 files changed, +1341 −31 lines


docs/source/en/api/pipelines/wan.md

Lines changed: 394 additions & 11 deletions (large diff not rendered by default)

docs/source/en/installation.md

Lines changed: 7 additions & 5 deletions
@@ -161,10 +161,10 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne
 
 Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
 
-Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `True` and 🤗 Diffusers will only load previously downloaded files in the cache.
+Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
 
 ```shell
-export HF_HUB_OFFLINE=True
+export HF_HUB_OFFLINE=1
 ```
 
 For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
@@ -179,14 +179,16 @@ Telemetry is only sent when loading models and pipelines from the Hub,
 and it is not collected if you're loading local files.
 
 We understand that not everyone wants to share additional information, and we respect your privacy.
-You can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:
+You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
 
 On Linux/MacOS:
+
 ```bash
-export DISABLE_TELEMETRY=YES
+export HF_HUB_DISABLE_TELEMETRY=1
 ```
 
 On Windows:
+
 ```bash
-set DISABLE_TELEMETRY=YES
+set HF_HUB_DISABLE_TELEMETRY=1
 ```
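The same switches can be set from Python instead of the shell. A minimal sketch, assuming the variables are assigned before `diffusers` (and `huggingface_hub`) are imported, since the values are read at import time; the model id is a placeholder:

```python
import os

# Must happen before importing diffusers/huggingface_hub.
os.environ["HF_HUB_OFFLINE"] = "1"            # load only previously cached files
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # opt out of telemetry

from diffusers import DiffusionPipeline

# Placeholder id: works offline only if this checkpoint is already cached.
pipe = DiffusionPipeline.from_pretrained("some-org/some-cached-model")
```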

examples/community/lpw_stable_diffusion_xl.py

Lines changed: 17 additions & 2 deletions
@@ -1773,7 +1773,7 @@ def denoising_value_valid(dnv):
                         f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                         f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                         f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                        f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                        f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
                         " `pipeline.unet` or your `mask_image` or `image` input."
                     )
             elif num_channels_unet != 4:
@@ -1924,7 +1924,22 @@ def denoising_value_valid(dnv):
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
-            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
 
             # cast back to fp16 if needed
             if needs_upcasting:
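The replacement branch applies the standard per-channel latent denormalization for VAEs whose configs ship `latents_mean`/`latents_std`: latents are mapped back as `z * std / scaling_factor + mean` before decoding, rather than being divided by `scaling_factor` alone. A standalone sketch of the same transform, with hypothetical statistics and an example `scaling_factor`:

```python
import torch

def denormalize_latents(latents, latents_mean, latents_std, scaling_factor):
    """Undo per-channel latent normalization before VAE decoding (sketch)."""
    mean = torch.tensor(latents_mean).view(1, -1, 1, 1).to(latents.device, latents.dtype)
    std = torch.tensor(latents_std).view(1, -1, 1, 1).to(latents.device, latents.dtype)
    return latents * std / scaling_factor + mean

# Hypothetical 4-channel statistics, mirroring the .view(1, 4, 1, 1) above.
z = torch.randn(1, 4, 64, 64)
decoder_input = denormalize_latents(z, [0.0] * 4, [1.0] * 4, scaling_factor=0.13025)
```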

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -509,6 +509,7 @@
         "VQDiffusionPipeline",
         "WanImageToVideoPipeline",
         "WanPipeline",
+        "WanVideoToVideoPipeline",
         "WuerstchenCombinedPipeline",
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
@@ -1062,6 +1063,7 @@
         VQDiffusionPipeline,
         WanImageToVideoPipeline,
         WanPipeline,
+        WanVideoToVideoPipeline,
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,
         WuerstchenPriorPipeline,
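With the name added to both `_import_structure` and the `TYPE_CHECKING` branch, the new pipeline resolves through the package's lazy-import machinery:

```python
# After this change, the pipeline is importable from the package root:
from diffusers import WanVideoToVideoPipeline
```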

src/diffusers/models/transformers/latte_transformer_3d.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def forward(
             hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
 
             if i == 0 and num_frame > 1:
-                hidden_states = hidden_states + self.temp_pos_embed
+                hidden_states = hidden_states + self.temp_pos_embed.to(hidden_states.dtype)
 
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 hidden_states = self._gradient_checkpointing_func(
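The cast matters because registered buffers such as `temp_pos_embed` can remain in float32 while the model runs in half precision, and PyTorch's type promotion would then silently upcast the sum. A minimal repro of the behavior the fix avoids:

```python
import torch

hidden_states = torch.randn(2, 16, 64, dtype=torch.float16)  # fp16 activations
temp_pos_embed = torch.randn(1, 16, 64)                      # fp32 buffer

promoted = hidden_states + temp_pos_embed                       # dtype: torch.float32
fixed = hidden_states + temp_pos_embed.to(hidden_states.dtype)  # dtype: torch.float16
print(promoted.dtype, fixed.dtype)
```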

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -356,7 +356,7 @@
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
     ]
-    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline"]
+    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"]
 try:
     if not is_onnx_available():
         raise OptionalDependencyNotAvailable()
@@ -709,7 +709,7 @@
         UniDiffuserPipeline,
         UniDiffuserTextDecoder,
     )
-    from .wan import WanImageToVideoPipeline, WanPipeline
+    from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
     from .wuerstchen import (
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,

src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py

Lines changed: 7 additions & 5 deletions
@@ -487,19 +487,21 @@ def prepare_latents(
     ) -> torch.Tensor:
         height = height // self.vae_spatial_compression_ratio
         width = width // self.vae_spatial_compression_ratio
-        num_frames = (
-            (num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
-        )
+        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
 
         shape = (batch_size, num_channels_latents, num_frames, height, width)
         mask_shape = (batch_size, 1, num_frames, height, width)
 
         if latents is not None:
-            conditioning_mask = latents.new_zeros(shape)
+            conditioning_mask = latents.new_zeros(mask_shape)
             conditioning_mask[:, :, 0] = 1.0
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
-            )
+            ).squeeze(-1)
+            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
+                raise ValueError(
+                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
+                )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 
         if isinstance(generator, list):
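The fix allocates the conditioning mask with `mask_shape` (a single channel) rather than the full latent `shape`, and squeezes the trailing singleton after packing so the mask aligns with user-provided packed latents of shape `(batch, seq_len, channels)`. A worked shape check under assumed dimensions (patch sizes of 1):

```python
# Assumed latent dimensions, for illustration only.
batch, channels, frames, height, width = 1, 128, 3, 16, 24

seq_len = frames * height * width           # 1152 tokens after packing
latents_shape = (batch, seq_len, channels)  # expected packed latents: (1, 1152, 128)
mask_packed = (batch, seq_len, 1)           # packed mask before .squeeze(-1)
mask_final = (batch, seq_len)               # (1, 1152), matches latents_shape[:2]
assert latents_shape[:2] == mask_final      # the check the new ValueError enforces
```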

src/diffusers/pipelines/wan/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,7 @@
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
-
+    _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
         if not (is_transformers_available() and is_torch_available()):
@@ -35,6 +35,7 @@
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_video2video import WanVideoToVideoPipeline
 
 else:
     import sys
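With the class registered in the `wan` subpackage, the pipelines namespace, and the package root, it can be driven like the sibling Wan pipelines. A rough usage sketch; the checkpoint id and call signature are assumptions based on the other Wan pipelines, not taken from this commit:

```python
import torch

from diffusers import WanVideoToVideoPipeline
from diffusers.utils import export_to_video, load_video

# Assumed checkpoint id and arguments; check the pipeline docstring for the
# actual interface.
pipe = WanVideoToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
)
pipe.to("cuda")

video = load_video("input.mp4")
frames = pipe(video=video, prompt="a corgi surfing a wave", strength=0.7).frames[0]
export_to_video(frames, "output.mp4")
```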

src/diffusers/pipelines/wan/pipeline_wan.py

Lines changed: 7 additions & 0 deletions
@@ -458,6 +458,13 @@ def __call__(
             callback_on_step_end_tensor_inputs,
         )
 
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
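The guard reflects the Wan VAE's temporal compression: a pixel-space video of `num_frames` frames maps to `(num_frames - 1) / r + 1` latent frames, so `num_frames - 1` must be divisible by `r`. A worked example, assuming `vae_scale_factor_temporal == 4`:

```python
r = 4  # assumed vae_scale_factor_temporal

def round_num_frames(num_frames: int, r: int) -> int:
    # Snap to the nearest count of the form k * r + 1, as the pipeline does.
    if num_frames % r != 1:
        num_frames = num_frames // r * r + 1
    return max(num_frames, 1)

print(round_num_frames(81, r))  # 81: already valid, since 80 % 4 == 0
print(round_num_frames(80, r))  # 81: 80 // 4 * 4 + 1
print(round_num_frames(79, r))  # 77: 79 // 4 * 4 + 1
```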

src/diffusers/pipelines/wan/pipeline_wan_i2v.py

Lines changed: 15 additions & 3 deletions
@@ -220,8 +220,13 @@ def _get_t5_prompt_embeds(
 
         return prompt_embeds
 
-    def encode_image(self, image: PipelineImageInput):
-        image = self.image_processor(images=image, return_tensors="pt").to(self.device)
+    def encode_image(
+        self,
+        image: PipelineImageInput,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+        image = self.image_processor(images=image, return_tensors="pt").to(device)
         image_embeds = self.image_encoder(**image, output_hidden_states=True)
         return image_embeds.hidden_states[-2]
 
@@ -554,6 +559,13 @@ def __call__(
             callback_on_step_end_tensor_inputs,
         )
 
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
@@ -587,7 +599,7 @@ def __call__(
         if negative_prompt_embeds is not None:
             negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
 
-        image_embeds = self.encode_image(image)
+        image_embeds = self.encode_image(image, device)
         image_embeds = image_embeds.repeat(batch_size, 1, 1)
         image_embeds = image_embeds.to(transformer_dtype)
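Threading `device` through `encode_image` matters under CPU offloading, where `self.device` can still report `cpu` while `_execution_device` reflects the accelerate hooks. A sketch of the scenario the change addresses; the checkpoint id is an assumption:

```python
import torch

from diffusers import WanImageToVideoPipeline
from diffusers.utils import load_image

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers", torch_dtype=torch.bfloat16  # assumed id
)
pipe.enable_model_cpu_offload()  # pipe.device may report "cpu" from here on

image = load_image("input.jpg")
device = pipe._execution_device  # accounts for the offload hooks, e.g. "cuda"

# Previously encode_image moved inputs to self.device, which can be the CPU
# under offloading; passing the execution device explicitly avoids the mismatch.
image_embeds = pipe.encode_image(image, device)
```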
