Commit b211eea

Merge branch 'main' into flux_lora_advanced
2 parents: 8bf49c7 + df1d7b0

File tree: 14 files changed, +1341 −31 lines


docs/source/en/api/pipelines/wan.md

Lines changed: 394 additions & 11 deletions (large diff not rendered by default)

docs/source/en/installation.md

Lines changed: 7 additions & 5 deletions
@@ -161,10 +161,10 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne
 
 Model weights and files are downloaded from the Hub to a cache which is usually your home directory. You can change the cache location by specifying the `HF_HOME` or `HUGGINFACE_HUB_CACHE` environment variables or configuring the `cache_dir` parameter in methods like [`~DiffusionPipeline.from_pretrained`].
 
-Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `True` and 🤗 Diffusers will only load previously downloaded files in the cache.
+Cached files allow you to run 🤗 Diffusers offline. To prevent 🤗 Diffusers from connecting to the internet, set the `HF_HUB_OFFLINE` environment variable to `1` and 🤗 Diffusers will only load previously downloaded files in the cache.
 
 ```shell
-export HF_HUB_OFFLINE=True
+export HF_HUB_OFFLINE=1
 ```
 
 For more details about managing and cleaning the cache, take a look at the [caching](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) guide.
@@ -179,14 +179,16 @@ Telemetry is only sent when loading models and pipelines from the Hub,
 and it is not collected if you're loading local files.
 
 We understand that not everyone wants to share additional information, and we respect your privacy.
-You can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:
+You can disable telemetry collection by setting the `HF_HUB_DISABLE_TELEMETRY` environment variable from your terminal:
 
 On Linux/MacOS:
+
 ```bash
-export DISABLE_TELEMETRY=YES
+export HF_HUB_DISABLE_TELEMETRY=1
 ```
 
 On Windows:
+
 ```bash
-set DISABLE_TELEMETRY=YES
+set HF_HUB_DISABLE_TELEMETRY=1
 ```
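The same switches can be set from Python instead of the shell. A minimal sketch, assuming the variables are assigned before `diffusers` (and `huggingface_hub`) are imported, since the values are read at import time; the model id is a placeholder:

```python
import os

# Must happen before importing diffusers/huggingface_hub.
os.environ["HF_HUB_OFFLINE"] = "1"            # load only previously cached files
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # opt out of telemetry

from diffusers import DiffusionPipeline

# Placeholder id: works offline only if this checkpoint is already cached.
pipe = DiffusionPipeline.from_pretrained("some-org/some-cached-model")
```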

examples/community/lpw_stable_diffusion_xl.py

Lines changed: 17 additions & 2 deletions
@@ -1773,7 +1773,7 @@ def denoising_value_valid(dnv):
                         f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                         f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                         f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                        f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                        f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
                         " `pipeline.unet` or your `mask_image` or `image` input."
                     )
             elif num_channels_unet != 4:
@@ -1924,7 +1924,22 @@ def denoising_value_valid(dnv):
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
-            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
 
             # cast back to fp16 if needed
             if needs_upcasting:
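The replacement branch applies the standard per-channel latent denormalization for VAEs whose configs ship `latents_mean`/`latents_std`: latents are mapped back as `z * std / scaling_factor + mean` before decoding, rather than being divided by `scaling_factor` alone. A standalone sketch of the same transform, with hypothetical statistics and an example `scaling_factor`:

```python
import torch

def denormalize_latents(latents, latents_mean, latents_std, scaling_factor):
    """Undo per-channel latent normalization before VAE decoding (sketch)."""
    mean = torch.tensor(latents_mean).view(1, -1, 1, 1).to(latents.device, latents.dtype)
    std = torch.tensor(latents_std).view(1, -1, 1, 1).to(latents.device, latents.dtype)
    return latents * std / scaling_factor + mean

# Hypothetical 4-channel statistics, mirroring the .view(1, 4, 1, 1) above.
z = torch.randn(1, 4, 64, 64)
decoder_input = denormalize_latents(z, [0.0] * 4, [1.0] * 4, scaling_factor=0.13025)
```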

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -509,6 +509,7 @@
         "VQDiffusionPipeline",
         "WanImageToVideoPipeline",
         "WanPipeline",
+        "WanVideoToVideoPipeline",
         "WuerstchenCombinedPipeline",
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
@@ -1062,6 +1063,7 @@
         VQDiffusionPipeline,
         WanImageToVideoPipeline,
         WanPipeline,
+        WanVideoToVideoPipeline,
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,
         WuerstchenPriorPipeline,
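With the name added to both `_import_structure` and the `TYPE_CHECKING` branch, the new pipeline resolves through the package's lazy-import machinery:

```python
# After this change, the pipeline is importable from the package root:
from diffusers import WanVideoToVideoPipeline
```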

src/diffusers/models/transformers/latte_transformer_3d.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def forward(
             hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
 
             if i == 0 and num_frame > 1:
-                hidden_states = hidden_states + self.temp_pos_embed
+                hidden_states = hidden_states + self.temp_pos_embed.to(hidden_states.dtype)
 
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 hidden_states = self._gradient_checkpointing_func(
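The cast matters because registered buffers such as `temp_pos_embed` can remain in float32 while the model runs in half precision, and PyTorch's type promotion would then silently upcast the sum. A minimal repro of the behavior the fix avoids:

```python
import torch

hidden_states = torch.randn(2, 16, 64, dtype=torch.float16)  # fp16 activations
temp_pos_embed = torch.randn(1, 16, 64)                      # fp32 buffer

promoted = hidden_states + temp_pos_embed                       # dtype: torch.float32
fixed = hidden_states + temp_pos_embed.to(hidden_states.dtype)  # dtype: torch.float16
print(promoted.dtype, fixed.dtype)
```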

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -356,7 +356,7 @@
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
     ]
-    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline"]
+    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"]
 try:
     if not is_onnx_available():
         raise OptionalDependencyNotAvailable()
@@ -709,7 +709,7 @@
         UniDiffuserPipeline,
         UniDiffuserTextDecoder,
     )
-    from .wan import WanImageToVideoPipeline, WanPipeline
+    from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
     from .wuerstchen import (
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,

src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py

Lines changed: 7 additions & 5 deletions
@@ -487,19 +487,21 @@ def prepare_latents(
     ) -> torch.Tensor:
         height = height // self.vae_spatial_compression_ratio
         width = width // self.vae_spatial_compression_ratio
-        num_frames = (
-            (num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
-        )
+        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
 
         shape = (batch_size, num_channels_latents, num_frames, height, width)
         mask_shape = (batch_size, 1, num_frames, height, width)
 
         if latents is not None:
-            conditioning_mask = latents.new_zeros(shape)
+            conditioning_mask = latents.new_zeros(mask_shape)
             conditioning_mask[:, :, 0] = 1.0
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
-            )
+            ).squeeze(-1)
+            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
+                raise ValueError(
+                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
+                )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 
         if isinstance(generator, list):
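The fix allocates the conditioning mask with `mask_shape` (a single channel) rather than the full latent `shape`, and squeezes the trailing singleton after packing so the mask aligns with user-provided packed latents of shape `(batch, seq_len, channels)`. A worked shape check under assumed dimensions (patch sizes of 1):

```python
# Assumed latent dimensions, for illustration only.
batch, channels, frames, height, width = 1, 128, 3, 16, 24

seq_len = frames * height * width           # 1152 tokens after packing
latents_shape = (batch, seq_len, channels)  # expected packed latents: (1, 1152, 128)
mask_packed = (batch, seq_len, 1)           # packed mask before .squeeze(-1)
mask_final = (batch, seq_len)               # (1, 1152), matches latents_shape[:2]
assert latents_shape[:2] == mask_final      # the check the new ValueError enforces
```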

src/diffusers/pipelines/wan/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,7 @@
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
-
+    _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
         if not (is_transformers_available() and is_torch_available()):
@@ -35,6 +35,7 @@
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_video2video import WanVideoToVideoPipeline
 
 else:
     import sys
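With the class registered in the `wan` subpackage, the pipelines namespace, and the package root, it can be driven like the sibling Wan pipelines. A rough usage sketch; the checkpoint id and call signature are assumptions based on the other Wan pipelines, not taken from this commit:

```python
import torch

from diffusers import WanVideoToVideoPipeline
from diffusers.utils import export_to_video, load_video

# Assumed checkpoint id and arguments; check the pipeline docstring for the
# actual interface.
pipe = WanVideoToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
)
pipe.to("cuda")

video = load_video("input.mp4")
frames = pipe(video=video, prompt="a corgi surfing a wave", strength=0.7).frames[0]
export_to_video(frames, "output.mp4")
```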

src/diffusers/pipelines/wan/pipeline_wan.py

Lines changed: 7 additions & 0 deletions
@@ -458,6 +458,13 @@ def __call__(
             callback_on_step_end_tensor_inputs,
         )
 
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
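The guard reflects the Wan VAE's temporal compression: a pixel-space video of `num_frames` frames maps to `(num_frames - 1) / r + 1` latent frames, so `num_frames - 1` must be divisible by `r`. A worked example, assuming `vae_scale_factor_temporal == 4`:

```python
r = 4  # assumed vae_scale_factor_temporal

def round_num_frames(num_frames: int, r: int) -> int:
    # Snap to the nearest count of the form k * r + 1, as the pipeline does.
    if num_frames % r != 1:
        num_frames = num_frames // r * r + 1
    return max(num_frames, 1)

print(round_num_frames(81, r))  # 81: already valid, since 80 % 4 == 0
print(round_num_frames(80, r))  # 81: 80 // 4 * 4 + 1
print(round_num_frames(79, r))  # 77: 79 // 4 * 4 + 1
```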

src/diffusers/pipelines/wan/pipeline_wan_i2v.py

Lines changed: 15 additions & 3 deletions
@@ -220,8 +220,13 @@ def _get_t5_prompt_embeds(
 
         return prompt_embeds
 
-    def encode_image(self, image: PipelineImageInput):
-        image = self.image_processor(images=image, return_tensors="pt").to(self.device)
+    def encode_image(
+        self,
+        image: PipelineImageInput,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+        image = self.image_processor(images=image, return_tensors="pt").to(device)
         image_embeds = self.image_encoder(**image, output_hidden_states=True)
         return image_embeds.hidden_states[-2]
 
@@ -554,6 +559,13 @@ def __call__(
             callback_on_step_end_tensor_inputs,
         )
 
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
@@ -587,7 +599,7 @@ def __call__(
         if negative_prompt_embeds is not None:
             negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
 
-        image_embeds = self.encode_image(image)
+        image_embeds = self.encode_image(image, device)
         image_embeds = image_embeds.repeat(batch_size, 1, 1)
         image_embeds = image_embeds.to(transformer_dtype)
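Threading `device` through `encode_image` matters under CPU offloading, where `self.device` can still report `cpu` while `_execution_device` reflects the accelerate hooks. A sketch of the scenario the change addresses; the checkpoint id is an assumption:

```python
import torch

from diffusers import WanImageToVideoPipeline
from diffusers.utils import load_image

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers", torch_dtype=torch.bfloat16  # assumed id
)
pipe.enable_model_cpu_offload()  # pipe.device may report "cpu" from here on

image = load_image("input.jpg")
device = pipe._execution_device  # accounts for the offload hooks, e.g. "cuda"

# Previously encode_image moved inputs to self.device, which can be the CPU
# under offloading; passing the execution device explicitly avoids the mismatch.
image_embeds = pipe.encode_image(image, device)
```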
