
Commit abed76f

Merge pull request #77 from SkyworkAI/dev
Dev branch: support video extension and start/end frame control
2 parents 12c9c20 + be4ac62, commit abed76f

4 files changed (+348, -33 lines)

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -16,4 +16,5 @@ scripts/.gradio/*
 # *.csv
 *.jsonl
 out/*
-model/
+model/
+run.sh

README.md

Lines changed: 48 additions & 0 deletions
@@ -13,6 +13,7 @@ Welcome to the **SkyReels V2** repository! Here, you'll find the model weights a
 
 
 ## 🔥🔥🔥 News!!
+* May 16, 2025: 🔥 We release the inference code for [video extension](#ve) and [start/end frame control](#se) in the diffusion forcing model.
 * Apr 24, 2025: 🔥 We release the 720P models, [SkyReels-V2-DF-14B-720P](https://huggingface.co/Skywork/SkyReels-V2-DF-14B-720P) and [SkyReels-V2-I2V-14B-720P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-720P). The former facilitates infinite-length autoregressive video generation, and the latter focuses on Image2Video synthesis.
 * Apr 21, 2025: 👋 We release the inference code and model weights of [SkyReels-V2](https://huggingface.co/collections/Skywork/skyreels-v2-6801b1b93df627d441d0d0d9) Series Models and the video captioning model [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1).
 * Apr 3, 2025: 🔥 We also release [SkyReels-A2](https://github.com/SkyworkAI/SkyReels-A2). This is an open-sourced controllable video generation framework capable of assembling arbitrary visual elements.
@@ -222,6 +223,51 @@ python3 generate_video_df.py \
 > - `--addnoise_condition` is used to help smooth long video generation by adding some noise to the clean condition. Too much noise can cause inconsistency as well. 20 is a recommended value; you may try larger ones, but it is recommended not to exceed 50.
 > - Generating a 540P video using the 1.3B model requires approximately 14.7GB peak VRAM, while the same resolution video using the 14B model demands around 51.2GB peak VRAM.
 
+- **<span id="ve">Video Extension</span>**
+```shell
+model_id=Skywork/SkyReels-V2-DF-14B-540P
+# video extension
+python3 generate_video_df.py \
+  --model_id ${model_id} \
+  --resolution 540P \
+  --ar_step 0 \
+  --base_num_frames 97 \
+  --num_frames 120 \
+  --overlap_history 17 \
+  --prompt ${prompt} \
+  --addnoise_condition 20 \
+  --offload \
+  --use_ret_steps \
+  --teacache \
+  --teacache_thresh 0.3 \
+  --video_path ${video_path}
+```
+> **Note**:
+> - When performing video extension, you need to pass the `--video_path ${video_path}` parameter to specify the video to be extended.
+
+- **<span id="se">Start/End Frame Control</span>**
+```shell
+model_id=Skywork/SkyReels-V2-DF-14B-540P
+# start/end frame control
+python3 generate_video_df.py \
+  --model_id ${model_id} \
+  --resolution 540P \
+  --ar_step 0 \
+  --base_num_frames 97 \
+  --num_frames 97 \
+  --overlap_history 17 \
+  --prompt ${prompt} \
+  --addnoise_condition 20 \
+  --offload \
+  --use_ret_steps \
+  --teacache \
+  --teacache_thresh 0.3 \
+  --image ${image} \
+  --end_image ${end_image}
+```
+> **Note**:
+> - When controlling the start and end frames, you need to pass the `--image ${image}` parameter to control the generation of the start frame and the `--end_image ${end_image}` parameter to control the generation of the end frame.
+
 - **Text To Video & Image To Video**
 
 ```shell
@@ -288,6 +334,8 @@ Below are the key parameters you can customize for video generation:
 | --overlap_history | 17 | Number of frames to overlap for smooth transitions in long videos |
 | --addnoise_condition | 20 | Improves consistency in long video generation |
 | --causal_block_size | 5 | Recommended when using asynchronous inference (--ar_step > 0) |
+| --video_path | | Path to input video for video extension |
+| --end_image | | Path to input image for end frame control |
 
 #### Multi-GPU inference using xDiT USP

generate_video_df.py

Lines changed: 66 additions & 32 deletions
@@ -11,16 +11,27 @@
 from skyreels_v2_infer import DiffusionForcingPipeline
 from skyreels_v2_infer.modules import download_model
 from skyreels_v2_infer.pipelines import PromptEnhancer
-from skyreels_v2_infer.pipelines import resizecrop
+from skyreels_v2_infer.pipelines.image2video_pipeline import resizecrop
+from moviepy.editor import VideoFileClip
+
+
+def get_video_num_frames_moviepy(video_path):
+    with VideoFileClip(video_path) as clip:
+        num_frames = 0
+        for _ in clip.iter_frames():
+            num_frames += 1
+        return clip.size, num_frames
 
-if __name__ == "__main__":
 
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--outdir", type=str, default="diffusion_forcing")
     parser.add_argument("--model_id", type=str, default="Skywork/SkyReels-V2-DF-1.3B-540P")
     parser.add_argument("--resolution", type=str, choices=["540P", "720P"])
     parser.add_argument("--num_frames", type=int, default=97)
     parser.add_argument("--image", type=str, default=None)
+    parser.add_argument("--end_image", type=str, default=None)
+    parser.add_argument("--video_path", type=str, default='')
     parser.add_argument("--ar_step", type=int, default=0)
     parser.add_argument("--causal_attention", action="store_true")
     parser.add_argument("--causal_block_size", type=int, default=1)
@@ -45,13 +56,11 @@
         "--teacache_thresh",
         type=float,
         default=0.2,
-        help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup",
-    )
+        help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
     parser.add_argument(
         "--use_ret_steps",
         action="store_true",
-        help="Using Retention Steps will result in faster generation speed and better generation quality.",
-    )
+        help="Using Retention Steps will result in faster generation speed and better generation quality.")
     args = parser.parse_args()
 
     args.model_id = download_model(args.model_id)
@@ -85,22 +94,14 @@
 
     guidance_scale = args.guidance_scale
     shift = args.shift
-    if args.image:
-        args.image = load_image(args.image)
-        image_width, image_height = args.image.size
-        if image_height > image_width:
-            height, width = width, height
-        args.image = resizecrop(args.image, height, width)
-    image = args.image.convert("RGB") if args.image else None
+
     negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
 
     save_dir = os.path.join("result", args.outdir)
     os.makedirs(save_dir, exist_ok=True)
     local_rank = 0
     if args.use_usp:
-        assert (
-            not args.prompt_enhancer
-        ), "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
+        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
         from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
         import torch.distributed as dist
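
The image preprocessing removed here reappears further down, inside the new start/end-frame branch. Its orientation handling is easy to check in isolation; a minimal sketch (the 544x960 target is only an illustrative stand-in for what `--resolution` would produce, and the blank PIL image stands in for a real input):

```python
from PIL import Image

height, width = 544, 960             # illustrative target only; the script sets these from --resolution
img = Image.new("RGB", (720, 1280))  # portrait stand-in: width < height

image_width, image_height = img.size
if image_height > image_width:
    # Same swap as in the script: a portrait input flips the target to portrait as well.
    height, width = width, height

print(height, width)                 # 960 544
```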

@@ -138,32 +139,31 @@
 
     if args.causal_attention:
         pipe.transformer.set_ar_attention(args.causal_block_size)
-
+
     if args.teacache:
         if args.ar_step > 0:
-            num_steps = (
-                args.inference_steps
-                + (((args.base_num_frames - 1) // 4 + 1) // args.causal_block_size - 1) * args.ar_step
-            )
-            print("num_steps:", num_steps)
+            num_steps = args.inference_steps + (((args.base_num_frames - 1) // 4 + 1) // args.causal_block_size - 1) * args.ar_step
+            print('num_steps:', num_steps)
         else:
             num_steps = args.inference_steps
-        pipe.transformer.initialize_teacache(
-            enable_teacache=True,
-            num_steps=num_steps,
-            teacache_thresh=args.teacache_thresh,
-            use_ret_steps=args.use_ret_steps,
-            ckpt_dir=args.model_id,
-        )
+        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=num_steps,
+                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
+                                             ckpt_dir=args.model_id)
 
     print(f"prompt:{prompt_input}")
     print(f"guidance_scale:{guidance_scale}")
 
-    with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad():
-        video_frames = pipe(
+    if os.path.exists(args.video_path):
+        (v_width, v_height), input_num_frames = get_video_num_frames_moviepy(args.video_path)
+        assert input_num_frames >= args.overlap_history, "The input video is too short."
+
+        if v_height > v_width:
+            height, width = width, height
+
+        video_frames = pipe.extend_video(
             prompt=prompt_input,
             negative_prompt=negative_prompt,
-            image=image,
+            prefix_video_path=args.video_path,
             height=height,
             width=width,
             num_frames=num_frames,
@@ -178,6 +178,40 @@
             causal_block_size=args.causal_block_size,
             fps=fps,
         )[0]
+    else:
+        if args.image:
+            args.image = load_image(args.image)
+            image_width, image_height = args.image.size
+            if image_height > image_width:
+                height, width = width, height
+            args.image = resizecrop(args.image, height, width)
+        if args.end_image:
+            args.end_image = load_image(args.end_image)
+            args.end_image = resizecrop(args.end_image, height, width)
+
+        image = args.image.convert("RGB") if args.image else None
+        end_image = args.end_image.convert("RGB") if args.end_image else None
+
+        with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad():
+            video_frames = pipe(
+                prompt=prompt_input,
+                negative_prompt=negative_prompt,
+                image=image,
+                end_image=end_image,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                num_inference_steps=args.inference_steps,
+                shift=shift,
+                guidance_scale=guidance_scale,
+                generator=torch.Generator(device="cuda").manual_seed(args.seed),
+                overlap_history=args.overlap_history,
+                addnoise_condition=args.addnoise_condition,
+                base_num_frames=args.base_num_frames,
+                ar_step=args.ar_step,
+                causal_block_size=args.causal_block_size,
+                fps=fps,
+            )[0]
 
     if local_rank == 0:
         current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
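
The teacache step count collapsed into a one-liner in the hunk above is easier to follow with concrete numbers. A quick sanity check with illustrative settings (30 inference steps and ar_step 5 are example values, not ones fixed by this commit; 97 is the script's base_num_frames default and 5 is the causal_block_size recommended for asynchronous inference):

```python
inference_steps = 30       # example value
base_num_frames = 97       # script default
causal_block_size = 5      # recommended when --ar_step > 0
ar_step = 5                # example asynchronous step count

latent_frames = (base_num_frames - 1) // 4 + 1        # (97 - 1) // 4 + 1 = 25
blocks = latent_frames // causal_block_size           # 25 // 5 = 5
num_steps = inference_steps + (blocks - 1) * ar_step  # 30 + 4 * 5 = 50
print(num_steps)                                      # 50
```

With the default synchronous setting (`--ar_step 0`), the else branch applies and `num_steps` stays equal to `--inference_steps`.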
