# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

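"""Modular pipeline block that decodes denoised Wan video latents into frames.

It denormalizes the latents using the statistics stored in the VAE config,
decodes them with ``AutoencoderKLWan``, and post-processes the result into the
requested output format (PIL images, torch tensors, or numpy arrays).
"""
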
from typing import Any, List, Tuple, Union

import numpy as np
import PIL.Image
import torch

from ...configuration_utils import FrozenDict
from ...models import AutoencoderKLWan
from ...utils import logging
from ...video_processor import VideoProcessor
from ..modular_pipeline import PipelineBlock, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class WanDecodeStep(PipelineBlock):
    model_name = "wan"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKLWan),
            ComponentSpec(
                "video_processor",
                VideoProcessor,
                # Wan's VAE compresses each frame spatially by a factor of 8.
                config=FrozenDict({"vae_scale_factor": 8}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def description(self) -> str:
        return "Step that decodes the denoised latents into videos"

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("output_type", default="pil"),
        ]

    @property
    def intermediate_inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "latents",
                required=True,
                type_hint=torch.Tensor,
                description="The denoised latents from the denoising step",
            )
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "videos",
                type_hint=Union[List[List[PIL.Image.Image]], List[torch.Tensor], List[np.ndarray]],
                description="The generated videos: per video, a list of PIL.Image.Image frames, a torch.Tensor, or a np.ndarray",
            )
        ]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> Tuple[Any, PipelineState]:
        block_state = self.get_block_state(state)
        vae_dtype = components.vae.dtype

        if block_state.output_type != "latent":
            latents = block_state.latents
            latents_mean = (
                torch.tensor(components.vae.config.latents_mean)
                .view(1, components.vae.config.z_dim, 1, 1, 1)
                .to(latents.device, latents.dtype)
            )
            latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
                1, components.vae.config.z_dim, 1, 1, 1
            ).to(latents.device, latents.dtype)
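            # Note: `latents_std` above holds the *reciprocal* of the stored
            # per-channel std, so dividing by it multiplies by the true std;
            # adding the mean then completes the denormalization.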
            latents = latents / latents_std + latents_mean
            latents = latents.to(vae_dtype)
            videos = components.vae.decode(latents, return_dict=False)[0]
            # Only post-process decoded pixel videos; raw latents pass through as-is.
            block_state.videos = components.video_processor.postprocess_video(
                videos, output_type=block_state.output_type
            )
        else:
            block_state.videos = block_state.latents

        self.set_block_state(state, block_state)

        return components, state
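

# Illustrative usage sketch (not part of the module). The modular pipeline
# runner normally supplies `components` (exposing `vae` and `video_processor`)
# and a `PipelineState` whose `latents` were filled in by an upstream denoise
# block; the accessor on the last line is hypothetical.
#
#   decode_step = WanDecodeStep()
#   components, state = decode_step(components, state)
#   videos = state.get("videos")  # hypothetical accessor for the decoded videos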