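update: fix latent preparation in WanVideoToVideoPipeline (channels-first latent shape, normalize encoded latents with the VAE latents_mean/latents_std, noise via scheduler.add_noise, drop the num_frames argument)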

DN6 · DN6 · commit 598ca27dca25 · 2025-03-14T11:20:57.000+01:00
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_video2video.py b/src/diffusers/pipelines/wan/pipeline_wan_video2video.py
@@ -395,11 +395,13 @@ def prepare_latents(
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
 
-        num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
+        num_latent_frames = (
+            (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
+        )
         shape = (
             batch_size,
-            num_frames,
             num_channels_latents,
+            num_latent_frames,
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
@@ -412,10 +414,19 @@ def prepare_latents(
             else:
                 init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
 
-            init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
+            init_latents = torch.cat(init_latents, dim=0).to(dtype)
+
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(device, dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                device, dtype
+            )
+
+            init_latents = (init_latents - latents_mean) * latents_std
 
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            latents = self.scheduler.scale_noise(init_latents, timestep, noise)
+            latents = self.scheduler.add_noise(init_latents, noise, timestep)
         else:
             latents = latents.to(device)
 
@@ -464,7 +475,6 @@ def __call__(
         negative_prompt: Union[str, List[str]] = None,
         height: int = 480,
         width: int = 832,
-        num_frames: int = 81,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 5.0,
@@ -605,8 +615,9 @@ def __call__(
         self._num_timesteps = len(timesteps)
 
         if latents is None:
-            video = self.video_processor.preprocess_video(video, height=height, width=width)
-            video = video.to(device=device, dtype=prompt_embeds.dtype)
+            video = self.video_processor.preprocess_video(video, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
 
         # 5. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels
diff --git a/tests/pipelines/wan/test_wan_video_to_video.py b/tests/pipelines/wan/test_wan_video_to_video.py
@@ -20,14 +20,14 @@
 from PIL import Image
 from transformers import AutoTokenizer, T5EncoderModel
 
-from diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler, WanTransformer3DModel, WanVideoToVideoPipeline
+from diffusers import AutoencoderKLWan, UniPCMultistepScheduler, WanTransformer3DModel, WanVideoToVideoPipeline
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     require_torch_accelerator,
     slow,
 )
 
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..pipeline_params import TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import (
     PipelineTesterMixin,
 )
@@ -39,8 +39,7 @@
 class WanVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = WanVideoToVideoPipeline
     params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    batch_params = frozenset(["video", "prompt", "negative_prompt"])
     image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
     required_optional_params = frozenset(
         [
@@ -66,8 +65,7 @@ def get_dummy_components(self):
         )
 
         torch.manual_seed(0)
-        # TODO: impl FlowDPMSolverMultistepScheduler
-        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+        scheduler = UniPCMultistepScheduler(flow_shift=3.0)
         text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
 
@@ -102,7 +100,7 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        video = [Image.new("RGB", (16, 16))] * 19
+        video = [Image.new("RGB", (16, 16))] * 17
         inputs = {
             "video": video,
             "prompt": "dance monkey",
@@ -112,7 +110,6 @@ def get_dummy_inputs(self, device, seed=0):
             "guidance_scale": 6.0,
             "height": 16,
             "width": 16,
-            "num_frames": 9,
             "max_sequence_length": 16,
             "output_type": "pt",
         }
@@ -130,15 +127,27 @@ def test_inference(self):
         video = pipe(**inputs).frames
         generated_video = video[0]
 
-        self.assertEqual(generated_video.shape, (9, 3, 16, 16))
-        expected_video = torch.randn(9, 3, 16, 16)
+        self.assertEqual(generated_video.shape, (17, 3, 16, 16))
+        expected_video = torch.randn(17, 3, 16, 16)
         max_diff = np.abs(generated_video - expected_video).max()
         self.assertLessEqual(max_diff, 1e10)
 
     @unittest.skip("Test not supported")
     def test_attention_slicing_forward_pass(self):
         pass
 
+    @unittest.skip(
+        "WanVideoToVideoPipeline has to run in mixed precision. Casting the entire pipeline will result in errors"
+    )
+    def test_float16_inference(self):
+        pass
+
+    @unittest.skip(
+        "WanVideoToVideoPipeline has to run in mixed precision. Save/Load the entire pipeline in FP16 will result in errors"
+    )
+    def test_save_load_float16(self):
+        pass
+
 
 @slow
 @require_torch_accelerator