Commit ce67cd3

[Feat] Support Self-Forcing's Causal Inference for Wan2.1 T2V 1.3B (#766)
Co-authored-by: SolitaryThinker <[email protected]>
1 parent 7c554e5 commit ce67cd3

File tree

19 files changed (+1412, -30 lines)
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import os
+import time
+from fastvideo import VideoGenerator, SamplingParam
+
+OUTPUT_PATH = "video_samples_causal"
+def main():
+    # FastVideo will automatically use the optimal default arguments for the
+    # model.
+    # If a local path is provided, FastVideo will make a best effort
+    # attempt to identify the optimal arguments.
+    model_name = "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers"
+    generator = VideoGenerator.from_pretrained(
+        model_name,
+        # FastVideo will automatically handle distributed setup
+        num_gpus=1,
+        use_fsdp_inference=True,
+        text_encoder_cpu_offload=False,
+        dit_cpu_offload=False,
+    )
+
+    sampling_param = SamplingParam.from_pretrained(model_name)
+
+    prompt = (
+        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
+        "wide with interest. The playful yet serene atmosphere is complemented by soft "
+        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
+    )
+    video = generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True, sampling_param=sampling_param)
+
+if __name__ == "__main__":
+    main()
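
The script above relies entirely on the model's registered defaults. If you want to adjust generation settings, the SamplingParam object returned by SamplingParam.from_pretrained can be modified before it is passed to generate_video. A minimal sketch, assuming the object exposes mutable fields such as num_frames and seed (those attribute names are an assumption; check the SamplingParam dataclass for the exact fields):

from fastvideo import VideoGenerator, SamplingParam

model_name = "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers"
generator = VideoGenerator.from_pretrained(model_name, num_gpus=1)

sampling_param = SamplingParam.from_pretrained(model_name)
# Assumed field names; verify against the SamplingParam definition.
sampling_param.num_frames = 81   # total output frames
sampling_param.seed = 42         # fixed seed for reproducible runs

generator.generate_video(
    "A raccoon peering through sunflowers, warm natural light.",
    output_path="video_samples_causal",
    save_video=True,
    sampling_param=sampling_param,
)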

fastvideo/configs/models/dits/wanvideo.py

Lines changed: 6 additions & 0 deletions
@@ -92,6 +92,12 @@ class WanVideoArchConfig(DiTArchConfig):
     pos_embed_seq_len: int | None = None
     exclude_lora_layers: list[str] = field(default_factory=lambda: ["embedder"])
 
+    # Causal Wan
+    local_attn_size: int = -1  # Window size for temporal local attention (-1 indicates global attention)
+    sink_size: int = 0  # Size of the attention sink; the first `sink_size` frames are kept unchanged when rolling the KV cache
+    num_frames_per_block: int = 3
+    sliding_window_num_frames: int = 21
+
     def __post_init__(self):
         super().__post_init__()
         self.out_channels = self.out_channels or self.in_channels
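
The four new fields describe the causal attention pattern: local_attn_size sets the temporal local-attention window (-1 means global attention), sink_size pins the first frames as an attention sink that is not evicted when the KV cache rolls, num_frames_per_block = 3 suggests frames are generated three latent frames per block, and sliding_window_num_frames = 21 bounds the attended history. Purely as an illustration of the rolling-cache-with-sink idea (a toy, hypothetical helper, not the implementation added elsewhere in this commit):

from collections import deque

def roll_kv_cache(sink: list, cache: deque, new_frames: list,
                  sink_size: int = 0, window: int = 21) -> None:
    # Toy illustration: pin the first `sink_size` frames and evict the oldest
    # remaining frames so the attended history never exceeds `window` frames.
    for frame_kv in new_frames:
        if len(sink) < sink_size:
            sink.append(frame_kv)      # attention-sink frames are never evicted
        else:
            cache.append(frame_kv)
        while len(sink) + len(cache) > window:
            cache.popleft()            # drop the oldest non-sink frame

With the committed defaults (sink_size = 0, sliding_window_num_frames = 21, num_frames_per_block = 3), new_frames would arrive three latent frames at a time and the attended history would stay within 21 frames.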

fastvideo/configs/pipelines/registry.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from fastvideo.configs.pipelines.stepvideo import StepVideoT2VConfig
 from fastvideo.configs.pipelines.wan import (FastWan2_1_T2V_480P_Config,
                                              FastWan2_2_TI2V_5B_Config,
+                                             SelfForcingWanT2V480PConfig,
                                              WanI2V480PConfig, WanI2V720PConfig,
                                              WanT2V480PConfig, WanT2V720PConfig)
 from fastvideo.logger import init_logger
@@ -34,6 +35,7 @@
     "Wan-AI/Wan2.2-TI2V-5B-Diffusers": WanT2V720PConfig,
     "Wan-AI/Wan2.2-T2V-A14B-Diffusers": WanT2V480PConfig,
     "Wan-AI/Wan2.2-I2V-A14B-Diffusers": WanI2V480PConfig,
+    "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers": SelfForcingWanT2V480PConfig,
     # Add other specific weight variants
 }

fastvideo/configs/pipelines/wan.py

Lines changed: 10 additions & 0 deletions
@@ -138,3 +138,13 @@ class Wan2_2_T2V_A14B_Config(WanT2V480PConfig):
 @dataclass
 class Wan2_2_I2V_A14B_Config(WanT2V480PConfig):
     pass
+
+
+# =============================================
+# ============= Causal Self-Forcing =============
+# =============================================
+@dataclass
+class SelfForcingWanT2V480PConfig(WanT2V480PConfig):
+    is_causal: bool = True
+    dmd_denoising_steps: list[int] | None = field(
+        default_factory=lambda: [1000, 750, 500, 250])
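
The new pipeline config flags the model as causal and pins dmd_denoising_steps to [1000, 750, 500, 250], which reads as a fixed four-step schedule for the DMD-distilled checkpoint rather than a full scheduler sweep. A hedged sketch of how such a fixed timestep list is typically consumed (a generic few-step loop, not FastVideo's actual denoising stage):

import torch

def run_fixed_schedule(denoise_fn, latents: torch.Tensor,
                       steps=(1000, 750, 500, 250)) -> torch.Tensor:
    # Generic few-step loop: one denoiser call per fixed timestep.
    for t in steps:
        timestep = torch.full((latents.shape[0],), t,
                              device=latents.device, dtype=torch.long)
        latents = denoise_fn(latents, timestep)
    return latents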

fastvideo/configs/sample/registry.py

Lines changed: 15 additions & 2 deletions
@@ -17,6 +17,7 @@
     WanI2V_14B_720P_SamplingParam,
     WanT2V_1_3B_SamplingParam,
     WanT2V_14B_SamplingParam,
+    SelfForcingWanT2V480PConfig,
 )
 # isort: on
 from fastvideo.logger import init_logger
@@ -28,17 +29,29 @@
 SAMPLING_PARAM_REGISTRY: dict[str, Any] = {
     "FastVideo/FastHunyuan-diffusers": FastHunyuanSamplingParam,
     "hunyuanvideo-community/HunyuanVideo": HunyuanSamplingParam,
+    "FastVideo/stepvideo-t2v-diffusers": StepVideoT2VSamplingParam,
+
+    # Wan2.1
     "Wan-AI/Wan2.1-T2V-1.3B-Diffusers": WanT2V_1_3B_SamplingParam,
     "Wan-AI/Wan2.1-T2V-14B-Diffusers": WanT2V_14B_SamplingParam,
     "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers": WanI2V_14B_480P_SamplingParam,
     "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers": WanI2V_14B_720P_SamplingParam,
-    "FastVideo/stepvideo-t2v-diffusers": StepVideoT2VSamplingParam,
-    "FastVideo/FastWan2.1-T2V-1.3B-Diffusers": FastWanT2V480PConfig,
+
+    # Wan2.2
     "Wan-AI/Wan2.2-TI2V-5B-Diffusers": Wan2_2_TI2V_5B_SamplingParam,
     "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers":
     Wan2_2_TI2V_5B_SamplingParam,
     "Wan-AI/Wan2.2-T2V-A14B-Diffusers": Wan2_2_T2V_A14B_SamplingParam,
     "Wan-AI/Wan2.2-I2V-A14B-Diffusers": Wan2_2_I2V_A14B_SamplingParam,
+
+    # FastWan2.1
+    "FastVideo/FastWan2.1-T2V-1.3B-Diffusers": FastWanT2V480PConfig,
+
+    # FastWan2.2
+    "FastVideo/FastWan2.2-TI2V-5B-Diffusers": Wan2_2_TI2V_5B_SamplingParam,
+
+    # Causal Self-Forcing Wan2.1
+    "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers": SelfForcingWanT2V480PConfig,
     # Add other specific weight variants
 }
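
With this registry entry in place, SamplingParam.from_pretrained("wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers") should resolve to SelfForcingWanT2V480PConfig, which (per fastvideo/configs/sample/wan.py below) currently just inherits the Wan2.1 1.3B sampling defaults:

from fastvideo import SamplingParam

# Resolved via SAMPLING_PARAM_REGISTRY; inherits WanT2V_1_3B_SamplingParam defaults.
param = SamplingParam.from_pretrained("wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers")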

fastvideo/configs/sample/wan.py

Lines changed: 8 additions & 0 deletions
@@ -141,3 +141,11 @@ class Wan2_2_I2V_A14B_SamplingParam(Wan2_2_Base_SamplingParam):
     guidance_scale_2: float = 3.5
     num_inference_steps: int = 40
     fps: int = 16
+
+
+# =============================================
+# ============= Causal Self-Forcing =============
+# =============================================
+@dataclass
+class SelfForcingWanT2V480PConfig(WanT2V_1_3B_SamplingParam):
+    pass

fastvideo/layers/rotary_embedding.py

Lines changed: 9 additions & 0 deletions
@@ -29,6 +29,9 @@
 
 from fastvideo.distributed.parallel_state import get_sp_group
 from fastvideo.layers.custom_op import CustomOp
+from fastvideo.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -267,6 +270,7 @@ def get_nd_rotary_pos_embed(
     sp_rank: int = 0,
     sp_world_size: int = 1,
     dtype: torch.dtype = torch.float32,
+    start_frame: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.
@@ -292,6 +296,9 @@
     full_grid = get_meshgrid_nd(
         start, *args, dim=len(rope_dim_list))  # [3, W, H, D] / [2, W, H]
 
+    if start_frame > 0:
+        full_grid[0] += start_frame
+
     # Shard the grid if using sequence parallelism (sp_world_size > 1)
     assert shard_dim < len(
         rope_dim_list
@@ -370,6 +377,7 @@ def get_rotary_pos_embed(
     interpolation_factor=1.0,
     shard_dim: int = 0,
     dtype: torch.dtype = torch.float32,
+    start_frame: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Generate rotary positional embeddings for the given sizes.
@@ -413,6 +421,7 @@
         sp_rank=sp_rank,
         sp_world_size=sp_world_size,
         dtype=dtype,
+        start_frame=start_frame,
     )
     return freqs_cos, freqs_sin
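
The new start_frame argument offsets the temporal axis of the RoPE grid (full_grid[0] += start_frame), presumably so that each causally generated block of frames continues the absolute frame positions of the frames already held in the KV cache instead of restarting at position 0. A minimal 1-D sketch of the idea, independent of get_nd_rotary_pos_embed's actual internals:

import torch

def temporal_rope_freqs(num_frames: int, dim: int, start_frame: int = 0,
                        theta: float = 10000.0):
    # cos/sin tables for frame positions start_frame .. start_frame + num_frames - 1
    positions = torch.arange(start_frame, start_frame + num_frames,
                             dtype=torch.float32)
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    angles = torch.outer(positions, inv_freq)   # [num_frames, dim // 2]
    return angles.cos(), angles.sin()

# Block 0 covers frames 0-2; the next block continues at frame 3 with the same code.
cos0, sin0 = temporal_rope_freqs(3, 64, start_frame=0)
cos1, sin1 = temporal_rope_freqs(3, 64, start_frame=3)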
