Skip to content

Commit 66b8b85

Browse files
authored
[LoRA] Support V1 LoRA inference (#451)
1 parent 6684872 commit 66b8b85

35 files changed

+720
-121
lines changed

examples/inference/basic/basic.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# from fastvideo.v1.configs.sample import SamplingParam
44

5-
5+
OUTPUT_PATH = "video_samples"
66
def main():
77
# FastVideo will automatically use the optimal default arguments for the
88
# model.
@@ -11,7 +11,9 @@ def main():
1111
generator = VideoGenerator.from_pretrained(
1212
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
1313
# if num_gpus > 1, FastVideo will automatically handle distributed setup
14-
num_gpus=1,
14+
num_gpus=2,
15+
use_fsdp_inference=True,
16+
use_cpu_offload=False
1517
)
1618

1719
# sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
@@ -23,7 +25,7 @@ def main():
2325
"wide with interest. The playful yet serene atmosphere is complemented by soft "
2426
"natural light filtering through the petals. Mid-shot, warm and cheerful tones."
2527
)
26-
video = generator.generate_video(prompt)
28+
video = generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True)
2729
# video = generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")
2830

2931
# Generate another video with a different prompt, without reloading the
@@ -34,7 +36,7 @@ def main():
3436
"the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
3537
"embodying the raw energy of the wild. Low angle, steady tracking shot, "
3638
"cinematic.")
37-
video2 = generator.generate_video(prompt2)
39+
video2 = generator.generate_video(prompt2, output_path=OUTPUT_PATH, save_video=True)
3840

3941

4042
if __name__ == "__main__":

examples/inference/basic/default_args.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from fastvideo import VideoGenerator
2-
2+
from fastvideo.v1.configs.pipelines.base import PipelineConfig
33

44
def main():
55

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from fastvideo import VideoGenerator
2+
from fastvideo.v1.configs.sample import SamplingParam
3+
4+
OUTPUT_PATH = "./lora"
5+
def main():
6+
# Initialize VideoGenerator with the Wan model
7+
generator = VideoGenerator.from_pretrained(
8+
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
9+
num_gpus=2,
10+
lora_path="benjamin-paine/steamboat-willie-1.3b",
11+
lora_nickname="steamboat"
12+
)
13+
kwargs = {
14+
"height": 480,
15+
"width": 832,
16+
"num_frames": 81,
17+
"guidance_scale": 5.0,
18+
"num_inference_steps": 32,
19+
}
20+
# Generate video with LoRA style
21+
prompt = "steamboat willie style, golden era animation, close-up of a short fluffy monster kneeling beside a melting red candle. the mood is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image."
22+
negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
23+
24+
video = generator.generate_video(
25+
prompt,
26+
# sampling_param=sampling_param,
27+
output_path=OUTPUT_PATH,
28+
save_video=True,
29+
negative_prompt=negative_prompt,
30+
**kwargs
31+
)
32+
33+
generator.set_lora_adapter(lora_nickname="flat_color", lora_path="motimalu/wan-flat-color-1.3b-v2")
34+
prompt = "flat color, no lineart, blending, negative space, artist:[john kafka|ponsuke kaikai|hara id 21|yoneyama mai|fuzichoco], 1girl, sakura miko, pink hair, cowboy shot, white shirt, floral print, off shoulder, outdoors, cherry blossom, tree shade, wariza, looking up, falling petals, half-closed eyes, white sky, clouds, live2d animation, upper body, high quality cinematic video of a woman sitting under a sakura tree. Dreamy and lonely, the camera close-ups on the face of the woman as she turns towards the viewer. The Camera is steady, This is a cowboy shot. The animation is smooth and fluid."
35+
negative_prompt = "bad quality video,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
36+
video = generator.generate_video(
37+
prompt,
38+
output_path=OUTPUT_PATH,
39+
save_video=True,
40+
negative_prompt=negative_prompt,
41+
**kwargs
42+
)
43+
44+
if __name__ == "__main__":
45+
main()

fastvideo/v1/configs/models/dits/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass, field
2-
from typing import Any, Optional, Tuple
2+
from typing import Any, List, Optional, Tuple
33

44
from fastvideo.v1.configs.models.base import ArchConfig, ModelConfig
55
from fastvideo.v1.layers.quantization import QuantizationConfig
@@ -11,6 +11,7 @@ class DiTArchConfig(ArchConfig):
1111
_fsdp_shard_conditions: list = field(default_factory=list)
1212
_compile_conditions: list = field(default_factory=list)
1313
_param_names_mapping: dict = field(default_factory=dict)
14+
_lora_param_names_mapping: dict = field(default_factory=dict)
1415
_supported_attention_backends: Tuple[_Backend,
1516
...] = (_Backend.SLIDING_TILE_ATTN,
1617
_Backend.SAGE_ATTN,
@@ -20,6 +21,7 @@ class DiTArchConfig(ArchConfig):
2021
hidden_size: int = 0
2122
num_attention_heads: int = 0
2223
num_channels_latents: int = 0
24+
exclude_lora_layers: List[str] = field(default_factory=list)
2325

2426
def __post_init__(self) -> None:
2527
if not self._compile_conditions:

fastvideo/v1/configs/models/dits/hunyuanvideo.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass, field
2-
from typing import Optional, Tuple
2+
from typing import List, Optional, Tuple
33

44
import torch
55

@@ -163,6 +163,8 @@ class HunyuanVideoArchConfig(DiTArchConfig):
163163
pooled_projection_dim: int = 768
164164
rope_theta: int = 256
165165
qk_norm: str = "rms_norm"
166+
exclude_lora_layers: List[str] = field(
167+
default_factory=lambda: ["img_in", "txt_in", "time_in", "vector_in"])
166168

167169
def __post_init__(self):
168170
super().__post_init__()

fastvideo/v1/configs/models/dits/stepvideo.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class StepVideoArchConfig(DiTArchConfig):
5151
default_factory=lambda: [6144, 1024])
5252
attention_type: Optional[str] = "torch"
5353
use_additional_conditions: Optional[bool] = False
54+
exclude_lora_layers: List[str] = field(default_factory=lambda: [])
5455

5556
def __post_init__(self):
5657
self.hidden_size = self.num_attention_heads * self.attention_head_dim

fastvideo/v1/configs/models/dits/wanvideo.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass, field
2-
from typing import Optional, Tuple
2+
from typing import List, Optional, Tuple
33

44
from fastvideo.v1.configs.models.dits.base import DiTArchConfig, DiTConfig
55

@@ -51,6 +51,23 @@ class WanVideoArchConfig(DiTArchConfig):
5151
r"blocks\.(\d+)\.norm2\.(.*)$":
5252
r"blocks.\1.self_attn_residual_norm.norm.\2",
5353
})
54+
# Some LoRA adapters use the original official layer names instead of hf layer names,
55+
# so apply this before the param_names_mapping
56+
_lora_param_names_mapping: dict = field(
57+
default_factory=lambda: {
58+
r"^blocks\.(\d+)\.self_attn\.q\.(.*)$": r"blocks.\1.attn1.to_q.\2",
59+
r"^blocks\.(\d+)\.self_attn\.k\.(.*)$": r"blocks.\1.attn1.to_k.\2",
60+
r"^blocks\.(\d+)\.self_attn\.v\.(.*)$": r"blocks.\1.attn1.to_v.\2",
61+
r"^blocks\.(\d+)\.self_attn\.o\.(.*)$":
62+
r"blocks.\1.attn1.to_out.0.\2",
63+
r"^blocks\.(\d+)\.cross_attn\.q\.(.*)$": r"blocks.\1.attn2.to_q.\2",
64+
r"^blocks\.(\d+)\.cross_attn\.k\.(.*)$": r"blocks.\1.attn2.to_k.\2",
65+
r"^blocks\.(\d+)\.cross_attn\.v\.(.*)$": r"blocks.\1.attn2.to_v.\2",
66+
r"^blocks\.(\d+)\.cross_attn\.o\.(.*)$":
67+
r"blocks.\1.attn2.to_out.0.\2",
68+
r"^blocks\.(\d+)\.ffn\.0\.(.*)$": r"blocks.\1.ffn.fc_in.\2",
69+
r"^blocks\.(\d+)\.ffn\.2\.(.*)$": r"blocks.\1.ffn.fc_out.\2",
70+
})
5471

5572
patch_size: Tuple[int, int, int] = (1, 2, 2)
5673
text_len = 512
@@ -68,6 +85,7 @@ class WanVideoArchConfig(DiTArchConfig):
6885
image_dim: Optional[int] = None
6986
added_kv_proj_dim: Optional[int] = None
7087
rope_max_seq_len: int = 1024
88+
exclude_lora_layers: List[str] = field(default_factory=lambda: ["embedder"])
7189

7290
def __post_init__(self):
7391
super().__post_init__()

fastvideo/v1/configs/pipelines/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ class PipelineConfig:
2727
# Video generation parameters
2828
embedded_cfg_scale: float = 6.0
2929
flow_shift: Optional[float] = None
30-
use_cpu_offload: bool = False
3130
disable_autocast: bool = False
3231

3332
# Model configuration

fastvideo/v1/configs/pipelines/hunyuan.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,6 @@ class HunyuanConfig(PipelineConfig):
6868
embedded_cfg_scale: int = 6
6969
flow_shift: int = 7
7070

71-
# Video parameters
72-
use_cpu_offload: bool = True
73-
7471
# Text encoding stage
7572
text_encoder_configs: Tuple[EncoderConfig, ...] = field(
7673
default_factory=lambda: (LlamaConfig(), CLIPTextConfig()))

fastvideo/v1/configs/pipelines/stepvideo.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,6 @@ class StepVideoT2VConfig(PipelineConfig):
1818
vae_tiling: bool = False
1919
vae_sp: bool = False
2020

21-
# Video parameters
22-
use_cpu_offload: bool = True
23-
2421
# Denoising stage
2522
flow_shift: int = 13
2623
timesteps_scale: bool = False

0 commit comments

Comments (0)