Commit 359151d

[Feature] Add wan2.2 5b i2v (#760)
Co-authored-by: SolitaryThinker <[email protected]>
1 parent ce67cd3 commit 359151d

File tree: 11 files changed (+282 −20 lines)
Lines changed: 41 additions & 0 deletions (new file)
@@ -0,0 +1,41 @@
+from fastvideo import VideoGenerator
+
+OUTPUT_PATH = "video_samples_wan2_2_5B_ti2v"
+def main():
+    # FastVideo will automatically use the optimal default arguments for the
+    # model.
+    # If a local path is provided, FastVideo will make a best effort
+    # attempt to identify the optimal arguments.
+    model_name = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
+    generator = VideoGenerator.from_pretrained(
+        model_name,
+        # FastVideo will automatically handle distributed setup
+        num_gpus=1,
+        use_fsdp_inference=True,
+        dit_cpu_offload=True,
+        vae_cpu_offload=False,
+        text_encoder_cpu_offload=True,
+        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
+        # image_encoder_cpu_offload=False,
+    )
+
+    # I2V is triggered just by passing in an image_path argument
+    prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
+    image_path = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
+    video = generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True, image_path=image_path)
+
+    # Generate another video with a different prompt, without reloading the
+    # model!
+
+    # T2V mode
+    prompt2 = (
+        "A majestic lion strides across the golden savanna, its powerful frame "
+        "glistening under the warm afternoon sun. The tall grass ripples gently in "
+        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
+        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
+        "cinematic.")
+    video2 = generator.generate_video(prompt2, output_path=OUTPUT_PATH, save_video=True)
+
+
+if __name__ == "__main__":
+    main()

fastvideo/configs/pipelines/base.py

Lines changed: 3 additions & 0 deletions
@@ -85,6 +85,9 @@ class PipelineConfig:
     # DMD parameters
     dmd_denoising_steps: list[int] | None = field(default=None)
 
+    # Wan2.2 TI2V parameters
+    ti2v_task: bool = False
+
     # Compilation
     # enable_torch_compile: bool = False
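The flag defaults to False, so existing pipelines are untouched; the TI2V path checks it explicitly in fastvideo/pipelines/stages/denoising.py below. A minimal sketch of toggling it, assuming PipelineConfig can be instantiated with its defaults (its other fields are not shown in this diff):

from fastvideo.configs.pipelines.base import PipelineConfig

# Sketch only: ti2v_task stays False unless a Wan2.2 TI2V config enables it.
cfg = PipelineConfig(ti2v_task=True)
print(cfg.ti2v_task)  # True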

fastvideo/configs/pipelines/registry.py

Lines changed: 11 additions & 8 deletions
@@ -7,11 +7,14 @@
 from fastvideo.configs.pipelines.base import PipelineConfig
 from fastvideo.configs.pipelines.hunyuan import FastHunyuanConfig, HunyuanConfig
 from fastvideo.configs.pipelines.stepvideo import StepVideoT2VConfig
-from fastvideo.configs.pipelines.wan import (FastWan2_1_T2V_480P_Config,
-                                             FastWan2_2_TI2V_5B_Config,
-                                             SelfForcingWanT2V480PConfig,
-                                             WanI2V480PConfig, WanI2V720PConfig,
-                                             WanT2V480PConfig, WanT2V720PConfig)
+
+# isort: off
+from fastvideo.configs.pipelines.wan import (
+    FastWan2_1_T2V_480P_Config, FastWan2_2_TI2V_5B_Config,
+    SelfForcingWanT2V480PConfig, Wan2_2_I2V_A14B_Config, Wan2_2_T2V_A14B_Config,
+    Wan2_2_TI2V_5B_Config, WanI2V480PConfig, WanI2V720PConfig, WanT2V480PConfig,
+    WanT2V720PConfig)
+# isort: on
 from fastvideo.logger import init_logger
 from fastvideo.utils import (maybe_download_model_index,
                              verify_model_config_and_directory)
@@ -32,10 +35,10 @@
     "FastVideo/FastWan2.2-TI2V-5B-Diffusers": FastWan2_2_TI2V_5B_Config,
     "FastVideo/stepvideo-t2v-diffusers": StepVideoT2VConfig,
     "FastVideo/Wan2.1-VSA-T2V-14B-720P-Diffusers": WanT2V720PConfig,
-    "Wan-AI/Wan2.2-TI2V-5B-Diffusers": WanT2V720PConfig,
-    "Wan-AI/Wan2.2-T2V-A14B-Diffusers": WanT2V480PConfig,
-    "Wan-AI/Wan2.2-I2V-A14B-Diffusers": WanI2V480PConfig,
     "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers": SelfForcingWanT2V480PConfig,
+    "Wan-AI/Wan2.2-TI2V-5B-Diffusers": Wan2_2_TI2V_5B_Config,
+    "Wan-AI/Wan2.2-T2V-A14B-Diffusers": Wan2_2_T2V_A14B_Config,
+    "Wan-AI/Wan2.2-I2V-A14B-Diffusers": Wan2_2_I2V_A14B_Config,
     # Add other specific weight variants
 }

fastvideo/configs/sample/registry.py

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,7 @@
 # isort: off
 from fastvideo.configs.sample.wan import (
     FastWanT2V480PConfig,
+    Wan2_1_Fun_1_3B_InP_SamplingParam,
     Wan2_2_I2V_A14B_SamplingParam,
     Wan2_2_T2V_A14B_SamplingParam,
     Wan2_2_TI2V_5B_SamplingParam,
@@ -36,6 +37,8 @@
     "Wan-AI/Wan2.1-T2V-14B-Diffusers": WanT2V_14B_SamplingParam,
     "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers": WanI2V_14B_480P_SamplingParam,
     "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers": WanI2V_14B_720P_SamplingParam,
+    "weizhou03/Wan2.1-Fun-1.3B-InP-Diffusers":
+    Wan2_1_Fun_1_3B_InP_SamplingParam,
 
     # Wan2.2
     "Wan-AI/Wan2.2-TI2V-5B-Diffusers": Wan2_2_TI2V_5B_SamplingParam,

fastvideo/configs/sample/wan.py

Lines changed: 15 additions & 0 deletions
@@ -107,6 +107,21 @@ class FastWanT2V480PConfig(WanT2V_1_3B_SamplingParam):
     fps: int = 16
 
 
+# =============================================
+# ============= Wan2.1 Fun Models =============
+# =============================================
+@dataclass
+class Wan2_1_Fun_1_3B_InP_SamplingParam(SamplingParam):
+    """Sampling parameters for Wan2.1 Fun 1.3B InP model."""
+    height: int = 480
+    width: int = 832
+    num_frames: int = 81
+    fps: int = 16
+    negative_prompt: str | None = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
+    guidance_scale: float = 6.0
+    num_inference_steps: int = 50
+
+
 # =============================================
 # ============= Wan2.2 TI2V Models =============
 # =============================================
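For reference, a small sketch of reading the new defaults, assuming the dataclass (and its SamplingParam base) can be constructed without arguments:

from fastvideo.configs.sample.wan import Wan2_1_Fun_1_3B_InP_SamplingParam

param = Wan2_1_Fun_1_3B_InP_SamplingParam()
# Defaults taken from the dataclass above: 480x832, 81 frames, 16 fps,
# guidance 6.0, 50 inference steps.
print(param.height, param.width, param.num_frames, param.guidance_scale)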

fastvideo/layers/visual_embedding.py

Lines changed: 5 additions & 1 deletion
@@ -86,12 +86,16 @@ def __init__(
                          dtype=dtype)
         self.freq_dtype = freq_dtype
 
-    def forward(self, t: torch.Tensor) -> torch.Tensor:
+    def forward(self,
+                t: torch.Tensor,
+                timestep_seq_len: int | None = None) -> torch.Tensor:
         t_freq = timestep_embedding(t,
                                     self.frequency_embedding_size,
                                     self.max_period,
                                     dtype=self.freq_dtype).to(
                                         self.mlp.fc_in.weight.dtype)
+        if timestep_seq_len is not None:
+            t_freq = t_freq.unflatten(0, (1, timestep_seq_len))
         # t_freq = t_freq.to(self.mlp.fc_in.weight.dtype)
         t_emb = self.mlp(t_freq)
         return t_emb
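For intuition, a standalone sketch of the new timestep_seq_len branch: when the TI2V path passes a flattened per-token timestep vector, the frequency embeddings are reshaped back to (1, seq_len, dim) before the MLP. Shapes below are illustrative only:

import torch

seq_len, dim = 120, 256
t_freq = torch.randn(seq_len, dim)          # embeddings of a flattened (1 * seq_len) timestep vector
t_freq = t_freq.unflatten(0, (1, seq_len))  # -> (1, seq_len, dim): one embedding per latent token
print(t_freq.shape)                         # torch.Size([1, 120, 256])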

fastvideo/models/dits/wanvideo.py

Lines changed: 44 additions & 8 deletions
@@ -81,8 +81,9 @@ def forward(
         timestep: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         encoder_hidden_states_image: torch.Tensor | None = None,
+        timestep_seq_len: int | None = None,
     ):
-        temb = self.time_embedder(timestep)
+        temb = self.time_embedder(timestep, timestep_seq_len)
         timestep_proj = self.time_modulation(temb)
 
         encoder_hidden_states = self.text_embedder(encoder_hidden_states)
@@ -319,9 +320,24 @@ def forward(
         bs, seq_length, _ = hidden_states.shape
         orig_dtype = hidden_states.dtype
         # assert orig_dtype != torch.float32
-        e = self.scale_shift_table + temb.float()
-        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = e.chunk(
-            6, dim=1)
+
+        if temb.dim() == 4:
+            # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table.unsqueeze(0) + temb.float()
+            ).chunk(6, dim=2)
+            # batch_size, seq_len, 1, inner_dim
+            shift_msa = shift_msa.squeeze(2)
+            scale_msa = scale_msa.squeeze(2)
+            gate_msa = gate_msa.squeeze(2)
+            c_shift_msa = c_shift_msa.squeeze(2)
+            c_scale_msa = c_scale_msa.squeeze(2)
+            c_gate_msa = c_gate_msa.squeeze(2)
+        else:
+            # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B)
+            e = self.scale_shift_table + temb.float()
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = e.chunk(
+                6, dim=1)
         assert shift_msa.dtype == torch.float32
 
         # 1. Self-attention
@@ -649,9 +665,21 @@ def forward(self,
         hidden_states = self.patch_embedding(hidden_states)
         hidden_states = hidden_states.flatten(2).transpose(1, 2)
 
+        # timestep shape: batch_size, or batch_size, seq_len (wan 2.2 ti2v)
+        if timestep.dim() == 2:
+            ts_seq_len = timestep.shape[1]
+            timestep = timestep.flatten()  # batch_size * seq_len
+        else:
+            ts_seq_len = None
+
         temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
-            timestep, encoder_hidden_states, encoder_hidden_states_image)
-        timestep_proj = timestep_proj.unflatten(1, (6, -1))
+            timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len)
+        if ts_seq_len is not None:
+            # batch_size, seq_len, 6, inner_dim
+            timestep_proj = timestep_proj.unflatten(2, (6, -1))
+        else:
+            # batch_size, 6, inner_dim
+            timestep_proj = timestep_proj.unflatten(1, (6, -1))
 
         if encoder_hidden_states_image is not None:
             encoder_hidden_states = torch.concat(
@@ -688,8 +716,15 @@ def forward(self,
         if enable_teacache:
             self.maybe_cache_states(hidden_states, original_hidden_states)
         # 5. Output norm, projection & unpatchify
-        shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2,
-                                                                          dim=1)
+        if temb.dim() == 3:
+            # batch_size, seq_len, inner_dim (wan 2.2 ti2v)
+            shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
+            shift = shift.squeeze(2)
+            scale = scale.squeeze(2)
+        else:
+            # batch_size, inner_dim
+            shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
+
         hidden_states = self.norm_out(hidden_states, shift, scale)
         hidden_states = self.proj_out(hidden_states)
 
@@ -793,3 +828,4 @@ def retrieve_cached_states(self,
             return hidden_states + self.previous_residual_even
         else:
             return hidden_states + self.previous_residual_odd
+
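For intuition, a standalone sketch of the new 4-D temb branch: with per-token modulation, the scale_shift_table (assumed shape (1, 6, inner_dim)) broadcasts over a (batch, seq_len, 6, inner_dim) temb, and each chunk is squeezed back to (batch, seq_len, inner_dim). Shapes are illustrative only:

import torch

batch, seq_len, inner_dim = 1, 120, 64
scale_shift_table = torch.randn(1, 6, inner_dim)      # learned per-layer table (assumed shape)
temb = torch.randn(batch, seq_len, 6, inner_dim)      # per-token modulation (wan2.2 ti2v case)

chunks = (scale_shift_table.unsqueeze(0) + temb.float()).chunk(6, dim=2)
shift_msa = chunks[0].squeeze(2)                      # (batch, seq_len, inner_dim)
print(shift_msa.shape)                                # torch.Size([1, 120, 64])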

fastvideo/pipelines/basic/wan/wan_pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
                            transformer=self.get_module("transformer"),
                            transformer_2=self.get_module("transformer_2", None),
                            scheduler=self.get_module("scheduler"),
+                           vae=self.get_module("vae"),
                            pipeline=self))
 
         self.add_stage(stage_name="decoding_stage",

fastvideo/pipelines/stages/denoising.py

Lines changed: 62 additions & 3 deletions
@@ -4,6 +4,7 @@
 """
 
 import inspect
+import math
 import weakref
 from collections.abc import Iterable
 from typing import Any
@@ -30,7 +31,7 @@
 from fastvideo.pipelines.stages.validators import StageValidators as V
 from fastvideo.pipelines.stages.validators import VerificationResult
 from fastvideo.platforms import AttentionBackendEnum
-from fastvideo.utils import dict_to_3d_list
+from fastvideo.utils import dict_to_3d_list, masks_like
 
 try:
     from fastvideo.attention.backends.sliding_tile_attn import (
@@ -61,11 +62,13 @@ def __init__(self,
                  transformer,
                  scheduler,
                  pipeline=None,
-                 transformer_2=None) -> None:
+                 transformer_2=None,
+                 vae=None) -> None:
         super().__init__()
         self.transformer = transformer
         self.transformer_2 = transformer_2
         self.scheduler = scheduler
+        self.vae = vae
         self.pipeline = weakref.ref(pipeline) if pipeline else None
         attn_head_size = self.transformer.hidden_size // self.transformer.num_attention_heads
         self.attn_backend = get_attn_backend(
@@ -194,6 +197,44 @@ def forward(
             boundary_timestep = fastvideo_args.boundary_ratio * self.scheduler.num_train_timesteps
         else:
             boundary_timestep = None
+        latent_model_input = latents.to(target_dtype)
+        assert latent_model_input.shape[0] == 1, "only support batch size 1"
+
+        if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
+            # TI2V directly replaces the first frame of the latent with
+            # the image latent instead of appending along the channel dim
+            assert batch.image_latent is None, "TI2V task should not have image latents"
+            assert self.vae is not None, "VAE is not provided for TI2V task"
+            z = self.vae.encode(batch.pil_image).mean.float()
+            if (hasattr(self.vae, "shift_factor")
+                    and self.vae.shift_factor is not None):
+                if isinstance(self.vae.shift_factor, torch.Tensor):
+                    z -= self.vae.shift_factor.to(z.device, z.dtype)
+                else:
+                    z -= self.vae.shift_factor
+
+            if isinstance(self.vae.scaling_factor, torch.Tensor):
+                z = z * self.vae.scaling_factor.to(z.device, z.dtype)
+            else:
+                z = z * self.vae.scaling_factor
+
+            latent_model_input = latent_model_input.squeeze(0)
+            _, mask2 = masks_like([latent_model_input], zero=True)
+
+            latent_model_input = (1. -
+                                  mask2[0]) * z + mask2[0] * latent_model_input
+            # latent_model_input = latent_model_input.unsqueeze(0)
+            latent_model_input = latent_model_input.to(get_local_torch_device())
+            latents = latent_model_input
+            F = batch.num_frames
+            temporal_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_temporal
+            spatial_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_spatial
+            patch_size = fastvideo_args.pipeline_config.dit_config.arch_config.patch_size
+            seq_len = ((F - 1) // temporal_scale +
+                       1) * (batch.height // spatial_scale) * (
+                           batch.width // spatial_scale) // (patch_size[1] *
+                                                             patch_size[2])
+            seq_len = int(math.ceil(seq_len / sp_world_size)) * sp_world_size
 
         # Run denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -218,19 +259,32 @@ def forward(
                     self.transformer.to('cpu')
                     current_model = self.transformer_2
                     current_guidance_scale = batch.guidance_scale_2
+                    assert current_model is not None, "current_model is None"
 
                 # Expand latents for I2V
                 latent_model_input = latents.to(target_dtype)
                 if batch.image_latent is not None:
+                    assert not fastvideo_args.pipeline_config.ti2v_task, "image latents should not be provided for TI2V task"
                     latent_model_input = torch.cat(
                         [latent_model_input, batch.image_latent],
                         dim=1).to(target_dtype)
+                if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
+                    timestep = torch.stack([t]).to(get_local_torch_device())
+                    temp_ts = (mask2[0][0][:, ::2, ::2] * timestep).flatten()
+                    temp_ts = torch.cat([
+                        temp_ts,
+                        temp_ts.new_ones(seq_len - temp_ts.size(0)) * timestep
+                    ])
+                    timestep = temp_ts.unsqueeze(0)
+                    t_expand = timestep.repeat(latent_model_input.shape[0], 1)
+                else:
+                    t_expand = t.repeat(latent_model_input.shape[0])
+
                 assert torch.isnan(latent_model_input).sum() == 0
                 latent_model_input = self.scheduler.scale_model_input(
                     latent_model_input, t)
 
                 # Prepare inputs for transformer
-                t_expand = t.repeat(latent_model_input.shape[0])
                 guidance_expand = (
                     torch.tensor(
                         [fastvideo_args.pipeline_config.embedded_cfg_scale] *
@@ -330,6 +384,11 @@ def forward(
                     latents,
                     **extra_step_kwargs,
                     return_dict=False)[0]
+                if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
+                    latents = latents.squeeze(0)
+                    latents = (1. - mask2[0]) * z + mask2[0] * latents
+                    # latents = latents.unsqueeze(0)
+
                 # Update progress bar
                 if i == len(timesteps) - 1 or (
                     (i + 1) > num_warmup_steps and
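For intuition, a standalone sketch of the seq_len computation used above to pad the per-token timestep vector: latent frames come from the VAE temporal stride, spatial tokens from the VAE spatial stride and the DiT patch size, and the total is rounded up to a multiple of the sequence-parallel world size. The numbers below are illustrative assumptions, not values read from the model config:

import math

num_frames, height, width = 121, 704, 1280    # illustrative request
temporal_scale, spatial_scale = 4, 16          # assumed VAE compression factors
patch_size = (1, 2, 2)                         # assumed DiT patch size (t, h, w)
sp_world_size = 2                              # sequence-parallel world size

latent_frames = (num_frames - 1) // temporal_scale + 1
seq_len = latent_frames * (height // spatial_scale) * (width // spatial_scale) \
    // (patch_size[1] * patch_size[2])
seq_len = int(math.ceil(seq_len / sp_world_size)) * sp_world_size
print(latent_frames, seq_len)                  # 31 27280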
