
Commit b65412f

Merge branch 'main' into issue129
2 parents 714b969 + d8c617c commit b65412f

File tree

14 files changed: +994, -20 lines


docs/source/en/api/pipelines/wan.md

Lines changed: 44 additions & 4 deletions
@@ -133,6 +133,46 @@ output = pipe(
 export_to_video(output, "wan-i2v.mp4", fps=16)
 ```
 
+### Video to Video Generation
+
+```python
+import torch
+from diffusers.utils import load_video, export_to_video
+from diffusers import AutoencoderKLWan, WanVideoToVideoPipeline, UniPCMultistepScheduler
+
+# Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
+vae = AutoencoderKLWan.from_pretrained(
+    model_id, subfolder="vae", torch_dtype=torch.float32
+)
+pipe = WanVideoToVideoPipeline.from_pretrained(
+    model_id, vae=vae, torch_dtype=torch.bfloat16
+)
+flow_shift = 3.0  # 5.0 for 720P, 3.0 for 480P
+pipe.scheduler = UniPCMultistepScheduler.from_config(
+    pipe.scheduler.config, flow_shift=flow_shift
+)
+# change to pipe.to("cuda") if you have sufficient VRAM
+pipe.enable_model_cpu_offload()
+
+prompt = "A robot standing on a mountain top. The sun is setting in the background"
+negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
+video = load_video(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
+)
+output = pipe(
+    video=video,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=480,
+    width=512,
+    guidance_scale=7.0,
+    strength=0.7,
+).frames[0]
+
+export_to_video(output, "wan-v2v.mp4", fps=16)
+```
+
 ## Memory Optimizations for Wan 2.1
 
 Base inference with the large 14B Wan 2.1 models can take up to 35GB of VRAM when generating videos at 720p resolution. We'll outline a few memory optimizations we can apply to reduce the VRAM required to run the model.
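The individual optimizations are elided between hunks; as a minimal sketch of the usual diffusers levers (assuming the Wan VAE supports tiling like other diffusers video VAEs):

```python
import torch
from diffusers import WanPipeline

pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16)

# offload idle submodules to CPU between forward passes instead of pipe.to("cuda")
pipe.enable_model_cpu_offload()
# decode latents in spatial tiles to cap peak VRAM during the VAE pass
pipe.vae.enable_tiling()
```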
@@ -323,7 +363,7 @@ import numpy as np
 from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
 from diffusers.hooks.group_offloading import apply_group_offloading
 from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionMode
+from transformers import UMT5EncoderModel, CLIPVisionModel
 
 model_id = "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers"
 image_encoder = CLIPVisionModel.from_pretrained(
@@ -356,7 +396,7 @@ prompt = (
     "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
     "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
 )
-negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
+negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
 num_frames = 33
 
 output = pipe(
@@ -372,7 +412,7 @@ output = pipe(
 export_to_video(output, "wan-i2v.mp4", fps=16)
 ```
 
-### Using a Custom Scheduler
+## Using a Custom Scheduler
 
 Wan can be used with many different schedulers, each with its own benefits regarding speed and generation quality. By default, Wan uses the `UniPCMultistepScheduler(prediction_type="flow_prediction", use_flow_sigmas=True, flow_shift=3.0)` scheduler. You can use a different scheduler as follows:
 
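The snippet that "as follows" refers to sits outside this hunk; a minimal sketch of such a swap, assuming `FlowMatchEulerDiscreteScheduler` as the replacement:

```python
import torch
from diffusers import FlowMatchEulerDiscreteScheduler, WanPipeline

pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16)

# replace the default UniPC scheduler with a flow-matching Euler scheduler
pipe.scheduler = FlowMatchEulerDiscreteScheduler(shift=5.0)
```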
@@ -403,7 +443,7 @@ transformer = WanTransformer3DModel.from_single_file(ckpt_path, torch_dtype=torc
 pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", transformer=transformer)
 ```
 
-## Recommendations for Inference:
+## Recommendations for Inference
 - Keep `AutoencoderKLWan` in `torch.float32` for better decoding quality.
 - `num_frames` should satisfy the following constraint: `(num_frames - 1) % 4 == 0`
 - For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `3.0` for Wan.
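A quick illustration of the frame-count rule in the list above:

```python
# num_frames must be of the form 4k + 1; valid examples: 33, 49, 81
num_frames = 81
assert (num_frames - 1) % 4 == 0, "num_frames must satisfy (num_frames - 1) % 4 == 0"
```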

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -509,6 +509,7 @@
     "VQDiffusionPipeline",
     "WanImageToVideoPipeline",
     "WanPipeline",
+    "WanVideoToVideoPipeline",
     "WuerstchenCombinedPipeline",
     "WuerstchenDecoderPipeline",
     "WuerstchenPriorPipeline",
@@ -1062,6 +1063,7 @@
     VQDiffusionPipeline,
     WanImageToVideoPipeline,
     WanPipeline,
+    WanVideoToVideoPipeline,
     WuerstchenCombinedPipeline,
     WuerstchenDecoderPipeline,
     WuerstchenPriorPipeline,

src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py

Lines changed: 3 additions & 2 deletions
@@ -105,6 +105,7 @@ def __init__(
         self.width_pad = width_pad
         self.time_pad = time_pad
         self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
+        self.const_padding_conv3d = (0, self.width_pad, self.height_pad)
 
         self.temporal_dim = 2
         self.time_kernel_size = time_kernel_size
@@ -117,6 +118,8 @@ def __init__(
             kernel_size=kernel_size,
             stride=stride,
             dilation=dilation,
+            padding=0 if self.pad_mode == "replicate" else self.const_padding_conv3d,
+            padding_mode="zeros",
         )
 
     def fake_context_parallel_forward(
@@ -137,9 +140,7 @@ def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = Non
         if self.pad_mode == "replicate":
             conv_cache = None
         else:
-            padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
             conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
-            inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
 
         output = self.conv(inputs)
         return output, conv_cache
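These hunks fold the explicit `F.pad` spatial zero-padding into the `Conv3d` itself via its `padding` argument, leaving time unpadded to preserve causality. A standalone sketch of why the two forms agree, using made-up shapes:

```python
import torch
import torch.nn.functional as F

# (batch, channels, frames, height, width) input and a (out, in, t, h, w) kernel
x = torch.randn(1, 4, 5, 16, 16)
weight = torch.randn(8, 4, 1, 3, 3)

# before: zero-pad height/width explicitly, then convolve with padding=0
padded = F.pad(x, (1, 1, 1, 1), mode="constant", value=0)  # (w_left, w_right, h_top, h_bottom)
out_before = F.conv3d(padded, weight, padding=0)

# after: let conv3d apply the same constant zero padding; tuple is (time, height, width)
out_after = F.conv3d(x, weight, padding=(0, 1, 1))

assert torch.allclose(out_before, out_after, atol=1e-5)
```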

src/diffusers/models/modeling_utils.py

Lines changed: 4 additions & 1 deletion
@@ -714,7 +714,10 @@ def save_pretrained(
             if safe_serialization:
                 # At some point we will need to deal better with save_function (used for TPU and other distributed
                 # joyfulness), but for now this is enough.
-                safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"})
+                try:
+                    safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"})
+                except RuntimeError:
+                    safetensors.torch.save_model(model_to_save, filepath, metadata={"format": "pt"})
             else:
                 torch.save(shard, filepath)
 
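`safetensors.torch.save_file` raises `RuntimeError` on state dicts whose tensors share storage (e.g. tied weights), which is the failure the new fallback catches; a minimal sketch of that behavior, with a hypothetical `TiedModel`:

```python
import torch
from safetensors.torch import save_file, save_model

class TiedModel(torch.nn.Module):
    """Two linear layers sharing one weight tensor, as in weight tying."""

    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Linear(8, 8, bias=False)
        self.head = torch.nn.Linear(8, 8, bias=False)
        self.head.weight = self.embed.weight  # shared storage

model = TiedModel()
try:
    # raises RuntimeError: both state-dict entries point at the same memory
    save_file(model.state_dict(), "tied.safetensors", metadata={"format": "pt"})
except RuntimeError:
    # save_model de-duplicates shared tensors before writing
    save_model(model, "tied.safetensors", metadata={"format": "pt"})
```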
src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -356,7 +356,7 @@
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
     ]
-    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline"]
+    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"]
     try:
         if not is_onnx_available():
             raise OptionalDependencyNotAvailable()
@@ -709,7 +709,7 @@
         UniDiffuserPipeline,
         UniDiffuserTextDecoder,
     )
-    from .wan import WanImageToVideoPipeline, WanPipeline
+    from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
     from .wuerstchen import (
         WuerstchenCombinedPipeline,
         WuerstchenDecoderPipeline,

src/diffusers/pipelines/pipeline_loading_utils.py

Lines changed: 12 additions & 2 deletions
@@ -592,6 +592,11 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
             loaded_sub_model = passed_class_obj[name]
 
         else:
+            sub_model_dtype = (
+                torch_dtype.get(name, torch_dtype.get("default", torch.float32))
+                if isinstance(torch_dtype, dict)
+                else torch_dtype
+            )
             loaded_sub_model = _load_empty_model(
                 library_name=library_name,
                 class_name=class_name,
@@ -600,7 +605,7 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
                 is_pipeline_module=is_pipeline_module,
                 pipeline_class=pipeline_class,
                 name=name,
-                torch_dtype=torch_dtype,
+                torch_dtype=sub_model_dtype,
                 cached_folder=kwargs.get("cached_folder", None),
                 force_download=kwargs.get("force_download", None),
                 proxies=kwargs.get("proxies", None),
@@ -616,7 +621,12 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic
     # Obtain a sorted dictionary for mapping the model-level components
     # to their sizes.
     module_sizes = {
-        module_name: compute_module_sizes(module, dtype=torch_dtype)[""]
+        module_name: compute_module_sizes(
+            module,
+            dtype=torch_dtype.get(module_name, torch_dtype.get("default", torch.float32))
+            if isinstance(torch_dtype, dict)
+            else torch_dtype,
+        )[""]
         for module_name, module in init_empty_modules.items()
         if isinstance(module, torch.nn.Module)
     }
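Both hunks apply the same per-component dtype rule, mirrored again in `pipeline_utils.py` below. Factored into a standalone helper (hypothetical name, for illustration):

```python
import torch

def resolve_component_dtype(torch_dtype, name):
    # dict form: per-component dtype with an optional "default" fallback;
    # anything else (a plain torch.dtype or None) passes through unchanged
    if isinstance(torch_dtype, dict):
        return torch_dtype.get(name, torch_dtype.get("default", torch.float32))
    return torch_dtype

assert resolve_component_dtype({"vae": torch.float16}, "vae") is torch.float16
assert resolve_component_dtype({"vae": torch.float16}, "unet") is torch.float32
assert resolve_component_dtype(torch.bfloat16, "unet") is torch.bfloat16
```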

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 13 additions & 5 deletions
@@ -552,9 +552,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                   saved using
                   [`~DiffusionPipeline.save_pretrained`].
                 - A path to a *directory* (for example `./my_pipeline_directory/`) containing a dduf file
-            torch_dtype (`str` or `torch.dtype`, *optional*):
+            torch_dtype (`str` or `torch.dtype` or `dict[str, Union[str, torch.dtype]]`, *optional*):
                 Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the
-                dtype is automatically derived from the model's weights.
+                dtype is automatically derived from the model's weights. To load submodels with different dtypes,
+                pass a `dict` (for example `{'transformer': torch.bfloat16, 'vae': torch.float16}`). Set the default
+                dtype for unspecified components with `default` (for example `{'transformer': torch.bfloat16,
+                'default': torch.float16}`). If a component is not specified and no default is set, `torch.float32`
+                is used.
             custom_pipeline (`str`, *optional*):
 
                 <Tip warning={true}>
@@ -703,7 +706,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         use_onnx = kwargs.pop("use_onnx", None)
         load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
 
-        if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
+        if torch_dtype is not None and not isinstance(torch_dtype, dict) and not isinstance(torch_dtype, torch.dtype):
             torch_dtype = torch.float32
             logger.warning(
                 f"Passed `torch_dtype` {torch_dtype} is not a `torch.dtype`. Defaulting to `torch.float32`."
@@ -950,14 +953,19 @@ def load_module(name, value):
                 loaded_sub_model = passed_class_obj[name]
             else:
                 # load sub model
+                sub_model_dtype = (
+                    torch_dtype.get(name, torch_dtype.get("default", torch.float32))
+                    if isinstance(torch_dtype, dict)
+                    else torch_dtype
+                )
                 loaded_sub_model = load_sub_model(
                     library_name=library_name,
                     class_name=class_name,
                     importable_classes=importable_classes,
                     pipelines=pipelines,
                     is_pipeline_module=is_pipeline_module,
                     pipeline_class=pipeline_class,
-                    torch_dtype=torch_dtype,
+                    torch_dtype=sub_model_dtype,
                     provider=provider,
                     sess_options=sess_options,
                     device_map=current_device_map,
@@ -998,7 +1006,7 @@ def load_module(name, value):
             for module in missing_modules:
                 init_kwargs[module] = passed_class_obj.get(module, None)
         elif len(missing_modules) > 0:
-            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
+            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - set(optional_kwargs)
             raise ValueError(
                 f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
             )
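How the new dict form is used, following the docstring's own example (the model id here is illustrative):

```python
import torch
from diffusers import DiffusionPipeline

# per-component dtype override: dict keys name pipeline components;
# "default" covers everything unspecified (else torch.float32 is used)
pipe = DiffusionPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
    torch_dtype={"transformer": torch.bfloat16, "default": torch.float32},
)
print(pipe.transformer.dtype)  # torch.bfloat16
print(pipe.vae.dtype)          # torch.float32 (covered by "default")
```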

src/diffusers/pipelines/wan/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,7 @@
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
-
+    _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
         if not (is_transformers_available() and is_torch_available()):
@@ -35,6 +35,7 @@
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_video2video import WanVideoToVideoPipeline
 
 else:
     import sys
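Together with the `src/diffusers/__init__.py` and `src/diffusers/pipelines/__init__.py` hunks above, the new pipeline is exposed at every package level; a quick smoke test of that assumption:

```python
# all three import paths should now resolve to the same class
from diffusers import WanVideoToVideoPipeline
from diffusers.pipelines import WanVideoToVideoPipeline as FromPipelines
from diffusers.pipelines.wan import WanVideoToVideoPipeline as FromWan

assert WanVideoToVideoPipeline is FromPipelines is FromWan
```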
