# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

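"""Modular pipeline block that decodes denoised Wan video latents into frames.

It denormalizes the latents using the statistics stored in the VAE config,
decodes them with ``AutoencoderKLWan``, and post-processes the result into the
requested output format (PIL images, torch tensors, or numpy arrays).
"""
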
from typing import Any, List, Tuple, Union

import numpy as np
import PIL.Image
import torch

from ...configuration_utils import FrozenDict
from ...models import AutoencoderKLWan
from ...utils import logging
from ...video_processor import VideoProcessor
from ..modular_pipeline import PipelineBlock, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class WanDecodeStep(PipelineBlock):
    model_name = "wan"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKLWan),
            ComponentSpec(
                "video_processor",
                VideoProcessor,
                # Wan's VAE compresses each frame spatially by a factor of 8.
                config=FrozenDict({"vae_scale_factor": 8}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def description(self) -> str:
        return "Step that decodes the denoised latents into videos"

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("output_type", default="pil"),
        ]

    @property
    def intermediate_inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "latents",
                required=True,
                type_hint=torch.Tensor,
                description="The denoised latents from the denoising step",
            )
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "videos",
                type_hint=Union[List[List[PIL.Image.Image]], List[torch.Tensor], List[np.ndarray]],
                description="The generated videos: per video, a list of PIL.Image.Image frames, a torch.Tensor, or a np.ndarray",
            )
        ]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> Tuple[Any, PipelineState]:
        block_state = self.get_block_state(state)
        vae_dtype = components.vae.dtype

        if block_state.output_type != "latent":
            latents = block_state.latents
            latents_mean = (
                torch.tensor(components.vae.config.latents_mean)
                .view(1, components.vae.config.z_dim, 1, 1, 1)
                .to(latents.device, latents.dtype)
            )
            latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
                1, components.vae.config.z_dim, 1, 1, 1
            ).to(latents.device, latents.dtype)
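            # Note: `latents_std` above holds the *reciprocal* of the stored
            # per-channel std, so dividing by it multiplies by the true std;
            # adding the mean then completes the denormalization.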
            latents = latents / latents_std + latents_mean
            latents = latents.to(vae_dtype)
            videos = components.vae.decode(latents, return_dict=False)[0]
            # Only post-process decoded pixel videos; raw latents pass through as-is.
            block_state.videos = components.video_processor.postprocess_video(
                videos, output_type=block_state.output_type
            )
        else:
            block_state.videos = block_state.latents

        self.set_block_state(state, block_state)

        return components, state
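

# Illustrative usage sketch (not part of the module). The modular pipeline
# runner normally supplies `components` (exposing `vae` and `video_processor`)
# and a `PipelineState` whose `latents` were filled in by an upstream denoise
# block; the accessor on the last line is hypothetical.
#
#   decode_step = WanDecodeStep()
#   components, state = decode_step(components, state)
#   videos = state.get("videos")  # hypothetical accessor for the decoded videos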