From 08f8ca49ff76395aadddd4832e42dea4189b80ab Mon Sep 17 00:00:00 2001 From: chaojie Date: Tue, 22 Apr 2025 03:44:06 +0800 Subject: [PATCH 001/117] add prompt travel python3 generate_video_df.py --model_id ${model_id} --resolution 540P --ar_step 0 --base_num_frames 97 --num_frames 177 --overlap_history 17 --addnoise_condition 20 --offload --prompt 'A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.' 'A woman flies into space' --- generate_video_df.py | 3 ++- .../pipelines/diffusion_forcing_pipeline.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index b1bbd9c..9d192da 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -35,6 +35,7 @@ parser.add_argument("--seed", type=int, default=-1) parser.add_argument( "--prompt", + nargs="+", type=str, default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.", ) @@ -143,6 +144,6 @@ if local_rank == 0: current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" + video_out_file = f"{args.prompt[0][:100].replace('/','')}_{args.seed}_{current_time}.mp4" output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index c6f39ec..3d4d6ce 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -183,7 +183,7 @@ def generate_timestep_matrix( @torch.no_grad() def __call__( self, - prompt: Union[str, List[str]], + prompt, negative_prompt: Union[str, List[str]] = "", image: PipelineImageInput = None, height: int = 480, @@ -213,7 +213,14 @@ def __call__( prefix_video, predix_video_latent_length = self.encode_image(image, height, width, num_frames) self.text_encoder.to(self.device) - prompt_embeds = self.text_encoder.encode(prompt).to(self.transformer.dtype) + prompt_embeds_list = [] + if type(prompt) is list: + for prompt_iter in prompt: + prompt_embeds_list.append(self.text_encoder.encode(prompt_iter).to(self.transformer.dtype)) + else: + prompt_embeds_list.append(self.text_encoder.encode(prompt).to(self.transformer.dtype)) + prompt_embeds = prompt_embeds_list[0] + if self.do_classifier_free_guidance: negative_prompt_embeds = self.text_encoder.encode(negative_prompt).to(self.transformer.dtype) if self.offload: @@ -317,6 +324,9 @@ def __call__( print(f"n_iter:{n_iter}") output_video = None for i in range(n_iter): + if type(prompt) is list: + if len(prompt) > i: + prompt_embeds = prompt_embeds_list[i] if output_video is not None: # i !=0 prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device) prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)] From 1382b4cc7c753b24131071a5013e6d31dd1fb883 Mon Sep 17 00:00:00 2001 From: "fles@qq.com" Date: Tue, 22 Apr 2025 11:01:29 +0800 Subject: [PATCH 002/117] add --video reference --- generate_video_df.py | 6 
++++- .../pipelines/diffusion_forcing_pipeline.py | 26 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index 9d192da..19a6b65 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -6,7 +6,7 @@ import imageio import torch -from diffusers.utils import load_image +from diffusers.utils import load_image, load_video from skyreels_v2_infer import DiffusionForcingPipeline from skyreels_v2_infer.modules import download_model @@ -20,6 +20,7 @@ parser.add_argument("--resolution", type=str, choices=["540P", "720P"]) parser.add_argument("--num_frames", type=int, default=97) parser.add_argument("--image", type=str, default=None) + parser.add_argument("--video", type=str, default=None) parser.add_argument("--ar_step", type=int, default=0) parser.add_argument("--causal_attention", action="store_true") parser.add_argument("--causal_block_size", type=int, default=1) @@ -74,6 +75,8 @@ guidance_scale = args.guidance_scale shift = args.shift image = load_image(args.image).convert("RGB") if args.image else None + video = load_video(args.video) if args.video else None + video = video[-17:] negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" save_dir = os.path.join("result", args.outdir) @@ -127,6 +130,7 @@ prompt=prompt_input, negative_prompt=negative_prompt, image=image, + video=video, height=height, width=width, num_frames=num_frames, diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 3d4d6ce..e48be51 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -95,6 +95,29 @@ def encode_image( predix_video_latent_length = prefix_video[0].shape[1] return prefix_video, predix_video_latent_length + def encode_video( + self, video: List[PipelineImageInput], height: int, width: int, num_frames: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + + # prefix_video + prefix_video = [] + for image in video: + prefix_video.append(image.convert("RGB").resize((width, height))) + prefix_video = np.array(prefix_video).transpose(3, 0, 1, 2) + prefix_video = torch.tensor(prefix_video) # .to(image_embeds.dtype).unsqueeze(1) + if prefix_video.dtype == torch.uint8: + prefix_video = (prefix_video.float() / (255.0 / 2.0)) - 1.0 + prefix_video = prefix_video.to(self.device) + prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)] + print(prefix_video[0].shape) + causal_block_size = self.transformer.num_frame_per_block + if prefix_video[0].shape[1] % causal_block_size != 0: + truncate_len = prefix_video[0].shape[1] % causal_block_size + print("the length of prefix video is truncated for the casual block size alignment.") + prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len] + predix_video_latent_length = prefix_video[0].shape[1] + return prefix_video, predix_video_latent_length + def prepare_latents( self, shape: Tuple[int], @@ -186,6 +209,7 @@ def __call__( prompt, negative_prompt: Union[str, List[str]] = "", image: PipelineImageInput = None, + video: List[PipelineImageInput] = None, height: int = 480, width: int = 832, num_frames: int = 97, @@ -211,6 +235,8 @@ def __call__( predix_video_latent_length = 0 if image: prefix_video, predix_video_latent_length = self.encode_image(image, height, width, num_frames) + elif 
video: + prefix_video, predix_video_latent_length = self.encode_video(video, height, width, num_frames) self.text_encoder.to(self.device) prompt_embeds_list = [] From 38214637df960d06d986bf319cd75af5a99f8ab8 Mon Sep 17 00:00:00 2001 From: "fles@qq.com" Date: Tue, 22 Apr 2025 14:23:35 +0800 Subject: [PATCH 003/117] fix image size compatibility when Diffusion Forcing --- generate_video_df.py | 27 ++++++++++++++++--- .../pipelines/diffusion_forcing_pipeline.py | 5 +--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 19a6b65..5cbf509 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -11,6 +11,7 @@ from skyreels_v2_infer import DiffusionForcingPipeline from skyreels_v2_infer.modules import download_model from skyreels_v2_infer.pipelines import PromptEnhancer +from skyreels_v2_infer.pipelines import resizecrop if __name__ == "__main__": @@ -74,9 +75,29 @@ guidance_scale = args.guidance_scale shift = args.shift - image = load_image(args.image).convert("RGB") if args.image else None - video = load_video(args.video) if args.video else None - video = video[-17:] + if args.image: + args.image = load_image(args.image) + image_width, image_height = args.image.size + if image_height > image_width: + height, width = width, height + args.image = resizecrop(args.image, height, width) + image = args.image.convert("RGB") if args.image else None + + video = [] + if args.video: + args.video = load_video(args.video) + arg_width = width + arg_height = height + for img in args.video: + image_width, image_height = img.size + if image_height > image_width: + height, width = arg_width, arg_height + img = resizecrop(img, height, width) + video.append(img.convert("RGB").resize((width, height))) + video = video[-17:] + else: + video = None + negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" save_dir = os.path.join("result", args.outdir) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index e48be51..8cff0d5 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -100,10 +100,7 @@ def encode_video( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # prefix_video - prefix_video = [] - for image in video: - prefix_video.append(image.convert("RGB").resize((width, height))) - prefix_video = np.array(prefix_video).transpose(3, 0, 1, 2) + prefix_video = np.array(video).transpose(3, 0, 1, 2) prefix_video = torch.tensor(prefix_video) # .to(image_embeds.dtype).unsqueeze(1) if prefix_video.dtype == torch.uint8: prefix_video = (prefix_video.float() / (255.0 / 2.0)) - 1.0 From ef5c736dc6f030b4f22df580a97d8639c5f63638 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 15:48:31 -0700 Subject: [PATCH 004/117] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 3c25b67..aeecf7a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +## Changes from pftq: +- Added seed synchronization code to allow random seed with multi-GPU. +- Added batch_size parameter to allow multiple videos to generate without reloading the model. +

SkyReels Logo

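[annotation] The seed-synchronization bullet above is implemented in patch 009 below. A minimal sketch of the approach, assuming `torch.distributed` is already initialized with the NCCL backend (the helper name is illustrative, not from the patch):

```python
import random
import torch
import torch.distributed as dist

def synchronize_seed(seed: int = -1) -> int:
    """Rank 0 picks (or keeps) the seed; every rank returns the same value."""
    if dist.get_rank() == 0 and seed == -1:
        seed = int(random.randrange(4294967294))  # same range the scripts use
    seed_tensor = torch.tensor(seed, dtype=torch.int64, device="cuda")
    dist.broadcast(seed_tensor, src=0)  # non-zero ranks are overwritten here
    return seed_tensor.item()
```

Without the broadcast, each rank would roll its own random seed, and the sequence-parallel workers would denoise from inconsistent noise.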
From f260b156d32bbfc6b03ab0af83328802dee35ffc Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 15:57:08 -0700 Subject: [PATCH 005/117] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index aeecf7a..7248b87 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. - Added batch_size parameter to allow multiple videos to generate without reloading the model. +- Friendlier filenames with date, seed, cfg, steps, and other details in front. +

SkyReels Logo From 9edec53702ea49d15a0617dc9a5159fb578618a7 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 16:42:27 -0700 Subject: [PATCH 006/117] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7248b87..cbbb438 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. - Added batch_size parameter to allow multiple videos to generate without reloading the model. +- Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Friendlier filenames with date, seed, cfg, steps, and other details in front.


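[annotation] The aspect-ratio flag summarized above gets its implementation in patch 009 and a divisibility fix in patches 035-037. A condensed sketch of the target-size computation, assuming the initial width/height come from the chosen resolution (patch 037 later raises the divisor to 16 in generate_video.py):

```python
def fit_target_size(img_width: int, img_height: int,
                    width: int, height: int, divisibility: int = 8):
    """Scale the target box to the source aspect ratio, keeping sides divisible."""
    if img_height > img_width:               # portrait input: swap the box first
        height, width = width, height
        width = int(height / img_height * img_width)
    else:                                    # landscape input: derive the height
        height = int(width / img_width * img_height)
    width -= width % divisibility            # avoids the tensor-shape edge case
    height -= height % divisibility          # that patch 035 fixes
    return width, height
```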
From 6dc830715556c5141bf348ad48e626c9831a7eab Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 16:43:41 -0700 Subject: [PATCH 007/117] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index cbbb438..1743709 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. +- Added image broadcast/synchronization to avoid potential sync issues in multi-GPU. - Added batch_size parameter to allow multiple videos to generate without reloading the model. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Friendlier filenames with date, seed, cfg, steps, and other details in front. From 9fc4d1a826cebffaec0bff58b417b3fb87c0c476 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 17:10:43 -0700 Subject: [PATCH 008/117] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1743709..a833ec7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. - Added image broadcast/synchronization to avoid potential sync issues in multi-GPU. -- Added batch_size parameter to allow multiple videos to generate without reloading the model. +- Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Friendlier filenames with date, seed, cfg, steps, and other details in front.
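[annotation] The image broadcast called out above lands in patch 009. Reduced to its core, and assuming rank 0 has already resized the image to exactly (height, width), it amounts to:

```python
import numpy as np
import torch
import torch.distributed as dist
from PIL import Image

def broadcast_image(image, height: int, width: int, local_rank: int) -> Image.Image:
    """Ship rank 0's pixels to every rank as a (height, width, 3) uint8 tensor."""
    if local_rank == 0:
        data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda")
    else:
        data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
    dist.broadcast(data, src=0)
    return Image.fromarray(data.cpu().numpy())
```

The shape assumption is why the patch broadcasts height and width before the pixels: if the receiving buffers disagreed with rank 0's actual image size, the broadcast would corrupt the data or hang.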
From 98dae5c6832ae61932cf3179d8ffe2b9ff913f74 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 19:22:10 -0700 Subject: [PATCH 009/117] Update generate_video_df.py Added batch mode, added option to keep original aspect ratio, synchronized seeds on multi-gpu. --- generate_video_df.py | 187 ++++++++++++++++++++++++++++++------------- 1 file changed, 131 insertions(+), 56 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 4bc3d55..cddc85f 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -3,10 +3,11 @@ import os import random import time - -import imageio import torch from diffusers.utils import load_image +import imageio +from PIL import Image #20250422 pftq: Added for image resizing and cropping +import numpy as np #20250422 pftq: Added for seed synchronization from skyreels_v2_infer import DiffusionForcingPipeline from skyreels_v2_infer.modules import download_model @@ -32,22 +33,42 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--seed", type=int, default=-1) parser.add_argument( "--prompt", type=str, default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.", ) parser.add_argument("--prompt_enhancer", action="store_true") + + parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video + parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing + args = parser.parse_args() args.model_id = download_model(args.model_id) print("model_id:", args.model_id) - assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" - if args.seed is None: - random.seed(time.time()) - args.seed = int(random.randrange(4294967294)) + #20250422 pftq: unneeded with seed synchronization code + #assert (args.use_usp and args.seed != -1) or (not args.use_usp), "usp mode requires a valid seed" + + local_rank = 0 + if args.use_usp: + from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment + import torch.distributed as dist + + dist.init_process_group("nccl") + local_rank = dist.get_rank() + torch.cuda.set_device(dist.get_rank()) + device = "cuda" + + init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) + + initialize_model_parallel( + sequence_parallel_degree=dist.get_world_size(), + ring_degree=1, + ulysses_degree=dist.get_world_size(), + ) if args.resolution == "540P": height = 544 @@ -64,37 +85,52 @@ if num_frames > args.base_num_frames: assert ( args.overlap_history is not None - ), 'You are supposed to specify the "overlap_history" to support the long video generation. 17 and 37 are recommanded to set.' + ), 'You are supposed to specify the "overlap_history" to support the long video generation. 17 and 37 are recommended to set.' if args.addnoise_condition > 60: print( - f'You have set "addnoise_condition" as {args.addnoise_condition}. The value is too large which can cause inconsistency in long video generation. The value is recommanded to set 20.' + f'You have set "addnoise_condition" as {args.addnoise_condition}. 
The value is too large which can cause inconsistency in long video generation. The value is recommended to set 20.' ) guidance_scale = args.guidance_scale shift = args.shift - image = load_image(args.image).convert("RGB") if args.image else None + #image = load_image(args.image).convert("RGB") if args.image else None + + #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization + image = None + if args.image: + if local_rank == 0: + try: + image = load_image(args.image).convert("RGB") + + # 20250422 pftq: option to preserve image aspect ratio + if args.preserve_image_aspect_ratio: + img_width, img_height = image.size + height = int(width / img_width * img_height) + except Exception as e: + raise ValueError(f"Failed to load or process image: {e}") + + if args.use_usp: + # Broadcast image to other ranks + image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None + if local_rank == 0: + dist.broadcast(image_data, src=0) + else: + image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda") + dist.broadcast(image_data, src=0) + image = Image.fromarray(image_data.cpu().numpy()) + + #20250422 pftq: Broadcast height and width to ensure consistency + height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda") + width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda") + dist.broadcast(height_tensor, src=0) + dist.broadcast(width_tensor, src=0) + height = height_tensor.item() + width = width_tensor.item() + negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" save_dir = os.path.join("result", args.outdir) os.makedirs(save_dir, exist_ok=True) - local_rank = 0 - if args.use_usp: - assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." 
- from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment - import torch.distributed as dist - - dist.init_process_group("nccl") - local_rank = dist.get_rank() - torch.cuda.set_device(dist.get_rank()) - device = "cuda" - - init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) - - initialize_model_parallel( - sequence_parallel_degree=dist.get_world_size(), - ring_degree=1, - ulysses_degree=dist.get_world_size(), - ) prompt_input = args.prompt if args.prompt_enhancer and args.image is None: @@ -118,31 +154,70 @@ if args.causal_attention: pipe.transformer.set_ar_attention(args.causal_block_size) - print(f"prompt:{prompt_input}") - print(f"guidance_scale:{guidance_scale}") - - with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): - video_frames = pipe( - prompt=prompt_input, - negative_prompt=negative_prompt, - image=image, - height=height, - width=width, - num_frames=num_frames, - num_inference_steps=args.inference_steps, - shift=shift, - guidance_scale=guidance_scale, - generator=torch.Generator(device="cuda").manual_seed(args.seed), - overlap_history=args.overlap_history, - addnoise_condition=args.addnoise_condition, - base_num_frames=args.base_num_frames, - ar_step=args.ar_step, - causal_block_size=args.causal_block_size, - fps=fps, - )[0] - - if local_rank == 0: - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" - output_path = os.path.join(save_dir, video_out_file) - imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) + for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size + if local_rank == 0: + print(f"prompt:{prompt_input}") + print(f"guidance_scale:{guidance_scale}") + print(f"Generating video {idx+1} of {args.batch_size}") + + #20250422 pftq: Synchronize seed across all ranks + if args.use_usp: + try: + #20250422 pftq: Synchronize ranks before seed broadcasting + dist.barrier() + + #20250422 pftq: Always broadcast seed to ensure consistency + if local_rank == 0: + if args.seed == -1 or idx > 0: + args.seed = int(random.randrange(4294967294)) + seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") + dist.broadcast(seed_tensor, src=0) + args.seed = seed_tensor.item() + + #20250422 pftq: Synchronize ranks after seed broadcasting + dist.barrier() + except Exception as e: + print(f"[Rank {local_rank}] Seed broadcasting error: {e}") + dist.destroy_process_group() + raise + + else: + #20250422 pftq: Single GPU seed initialization + if args.seed == -1 or idx > 0: + args.seed = int(random.randrange(4294967294)) + + #20250422 pftq: Set seeds for reproducibility + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): + video_frames = pipe( + prompt=prompt_input, + negative_prompt=negative_prompt, + image=image, + height=height, + width=width, + num_frames=num_frames, + num_inference_steps=args.inference_steps, + shift=shift, + guidance_scale=guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(args.seed), + overlap_history=args.overlap_history, + addnoise_condition=args.addnoise_condition, + base_num_frames=args.base_num_frames, + ar_step=args.ar_step, + causal_block_size=args.causal_block_size, + fps=fps, + )[0] + + if local_rank == 0: + 
current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" + + # 20250422 pftq: more useful filename + video_out_file = f"{current_time}_cfg{guidance_scale}_steps{args.inference_steps}_seed{args.seed}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + + output_path = os.path.join(save_dir, video_out_file) + imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) From 36d42bf77f30d7447e89ba5a73b4a49003b9a869 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 19:23:16 -0700 Subject: [PATCH 010/117] Update README.md --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index a833ec7..de8e31b 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,27 @@ - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Friendlier filenames with date, seed, cfg, steps, and other details in front. + +Example prompt (multi-GPU): +``` +model_id=Skywork/SkyReels-V2-DF-14B-540P +torchrun --nproc_per_node=2 generate_video_df.py \ + --model_id ${model_id} \ + --resolution 540P \ + --ar_step 0 \ + --base_num_frames 97 \ + --num_frames 257 \ + --overlap_history 17 \ + --inference_steps 50 \ + --guidance_scale 6 \ + --batch_size 10 \ + --preserve_image_aspect_ratio \ + --image "image.jpg" \ + --prompt "" \ + --addnoise_condition 20 \ + --use_usp \ + --offload +```

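[annotation] For the frame counts in the example above, the diffusion-forcing pipeline slides a base_num_frames window with overlap_history frames of overlap; the n_iter formula appears in the diffusion_forcing_pipeline.py hunks later in this series. A sketch of the arithmetic, where the 4x temporal VAE compression is an assumption about the model rather than something this diff states:

```python
def num_windows(num_frames: int = 257, base_num_frames: int = 97,
                overlap_history: int = 17) -> int:
    latent_length = (num_frames - 1) // 4 + 1        # assumed 4x compression
    base = (base_num_frames - 1) // 4 + 1
    overlap = (overlap_history - 1) // 4 + 1
    # n_iter formula quoted from diffusion_forcing_pipeline.py
    return 1 + (latent_length - base - 1) // (base - overlap) + 1

print(num_windows())  # 3 generation windows for the 257-frame example above
```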
From 4a3c4b25f7513967fcaa13f4ca45394f0625a987 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 19:24:57 -0700 Subject: [PATCH 011/117] Update README.md --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index de8e31b..b9bbf18 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,26 @@ torchrun --nproc_per_node=2 generate_video_df.py \ --use_usp \ --offload ``` + +Single GPU: +``` +model_id=Skywork/SkyReels-V2-DF-14B-540P +python3 generate_video_df.py \ + --model_id ${model_id} \ + --resolution 540P \ + --ar_step 0 \ + --base_num_frames 97 \ + --num_frames 257 \ + --overlap_history 17 \ + --inference_steps 50 \ + --guidance_scale 6 \ + --batch_size 10 \ + --preserve_image_aspect_ratio \ + --image "image.jpg" \ + --prompt "" \ + --addnoise_condition 20 \ + --offload +```


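[annotation] A minimal sketch of what --batch_size does in both scripts: the pipeline is constructed once (the slow step the README bullet mentions) and then reused, with the seed re-rolled for every video after the first. Here `pipe` stands in for the already-built pipeline object:

```python
import random
import torch

def run_batch(pipe, batch_size: int, seed: int = -1, **pipe_kwargs):
    videos = []
    for idx in range(batch_size):
        if seed == -1 or idx > 0:   # a user-fixed seed applies to video #1 only
            seed = int(random.randrange(4294967294))
        generator = torch.Generator(device="cuda").manual_seed(seed)
        videos.append(pipe(generator=generator, **pipe_kwargs)[0])
    return videos
```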
From 3dced695eb881828504461bf34fe75a24cae92f2 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 20:29:40 -0700 Subject: [PATCH 012/117] Update README.md --- README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/README.md b/README.md index b9bbf18..57d448a 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,34 @@ - Added image broadcast/synchronization to avoid potential sync issues in multi-GPU. - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. +- Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. +Easy install instructions for those using Runpod like me: +``` +#create once on new pod +export HF_HOME=/workspace/ +export TZ=America/Los_Angeles +python -m venv venv +git clone https://github.com/pftq/SkyReels-V2_Improvements +mv SkyReels-V2_Improvements SkyReels-V2 +cd /workspace/SkyReels-V2 +source /workspace/venv/bin/activate +pip install torch==2.5.1 +pip install --upgrade wheel setuptools +pip install packaging +pip install -r requirements.txt --no-build-isolation +pip install "huggingface_hub[cli]" +huggingface-cli download Skywork/SkyReels-V2-DF-14B-540P --local-dir ./SkyReels-V2-DF-14B-540P +deactivate + +#always run at the start to use persisting drive +export HF_HOME=/workspace/ +export TZ=America/Los_Angeles +source /workspace/venv/bin/activate +cd /workspace/SkyReels-V2 +``` + Example prompt (multi-GPU): ``` model_id=Skywork/SkyReels-V2-DF-14B-540P @@ -21,6 +47,7 @@ torchrun --nproc_per_node=2 generate_video_df.py \ --preserve_image_aspect_ratio \ --image "image.jpg" \ --prompt "" \ + --negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ --addnoise_condition 20 \ --use_usp \ --offload @@ -42,9 +69,13 @@ python3 generate_video_df.py \ --preserve_image_aspect_ratio \ --image "image.jpg" \ --prompt "" \ + --negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ --addnoise_condition 20 \ --offload ``` + +Change "DF" to "I2V' or "T2V" accordingly if you don't want to use the infinite length version of the model. +


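[annotation] The 'Change "DF" to "I2V" or "T2V"' note above swaps model families by editing the repo id. The naming pattern it implies, with the variant strings taken on trust from that sentence rather than from a published model list:

```python
def model_id(variant: str = "DF", size: str = "14B", res: str = "540P") -> str:
    # "DF" is the diffusion-forcing (infinite-length) family used throughout;
    # "I2V" and "T2V" are assumed to follow the same pattern
    return f"Skywork/SkyReels-V2-{variant}-{size}-{res}"
```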
From a8352b86d9eba77bb931fe64d8a0cf6bc1b184a9 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 20:30:44 -0700 Subject: [PATCH 013/117] Exposed negative_prompt option --- generate_video_df.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index cddc85f..1411ccc 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -43,6 +43,7 @@ parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing + parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt args = parser.parse_args() @@ -127,7 +128,7 @@ height = height_tensor.item() width = width_tensor.item() - negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt save_dir = os.path.join("result", args.outdir) os.makedirs(save_dir, exist_ok=True) From 214e79d08340bd5141afd6f6fcba1379e1553ed6 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 20:31:55 -0700 Subject: [PATCH 014/117] Added batch mode, added option to keep original aspect ratio, synchronized randomized seeds on multi-gpu, exposed negative_prompt option. 
--- generate_video.py | 175 +++++++++++++++++++++++++++++++++------------- 1 file changed, 125 insertions(+), 50 deletions(-) diff --git a/generate_video.py b/generate_video.py index 2e09350..eb9a109 100644 --- a/generate_video.py +++ b/generate_video.py @@ -8,10 +8,12 @@ import torch from diffusers.utils import load_image +from PIL import Image #20250422 pftq: Added for image resizing and cropping +import numpy as np #20250422 pftq: Added for seed synchronization + from skyreels_v2_infer.modules import download_model from skyreels_v2_infer.pipelines import Image2VideoPipeline from skyreels_v2_infer.pipelines import PromptEnhancer -from skyreels_v2_infer.pipelines import resizecrop from skyreels_v2_infer.pipelines import Text2VideoPipeline MODEL_ID_CONFIG = { @@ -41,37 +43,28 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--seed", type=int, default=-1) parser.add_argument( "--prompt", type=str, default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.", ) parser.add_argument("--prompt_enhancer", action="store_true") + + parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video + parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing + parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt + args = parser.parse_args() args.model_id = download_model(args.model_id) print("model_id:", args.model_id) - assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" - if args.seed is None: - random.seed(time.time()) - args.seed = int(random.randrange(4294967294)) - - if args.resolution == "540P": - height = 544 - width = 960 - elif args.resolution == "720P": - height = 720 - width = 1280 - else: - raise ValueError(f"Invalid resolution: {args.resolution}") + #20250422 pftq: unneeded with seed synchronization code + #assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" - image = load_image(args.image).convert("RGB") if args.image else None - negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" local_rank = 0 if args.use_usp: - assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." 
from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment import torch.distributed as dist @@ -88,6 +81,51 @@ ulysses_degree=dist.get_world_size(), ) + if args.resolution == "540P": + height = 544 + width = 960 + elif args.resolution == "720P": + height = 720 + width = 1280 + else: + raise ValueError(f"Invalid resolution: {args.resolution}") + + #image = load_image(args.image).convert("RGB") if args.image else None + + #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization + image = None + if args.image: + if local_rank == 0: + try: + image = load_image(args.image).convert("RGB") + + # 20250422 pftq: option to preserve image aspect ratio + if args.preserve_image_aspect_ratio: + img_width, img_height = image.size + height = int(width / img_width * img_height) + except Exception as e: + raise ValueError(f"Failed to load or process image: {e}") + + if args.use_usp: + # Broadcast image to other ranks + image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None + if local_rank == 0: + dist.broadcast(image_data, src=0) + else: + image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda") + dist.broadcast(image_data, src=0) + image = Image.fromarray(image_data.cpu().numpy()) + + #20250422 pftq: Broadcast height and width to ensure consistency + height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda") + width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda") + dist.broadcast(height_tensor, src=0) + dist.broadcast(width_tensor, src=0) + height = height_tensor.item() + width = width_tensor.item() + + negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt + prompt_input = args.prompt if args.prompt_enhancer and args.image is None: print(f"init prompt enhancer") @@ -110,41 +148,78 @@ pipe = Image2VideoPipeline( model_path=args.model_id, dit_path=args.model_id, use_usp=args.use_usp, offload=args.offload ) - args.image = load_image(args.image) - image_width, image_height = args.image.size - if image_height > image_width: - height, width = width, height - args.image = resizecrop(args.image, height, width) prompt_input = args.prompt if args.prompt_enhancer and image is not None: prompt_input = prompt_enhancer(prompt_input) print(f"enhanced prompt: {prompt_input}") - kwargs = { - "prompt": prompt_input, - "negative_prompt": negative_prompt, - "num_frames": args.num_frames, - "num_inference_steps": args.inference_steps, - "guidance_scale": args.guidance_scale, - "shift": args.shift, - "generator": torch.Generator(device="cuda").manual_seed(args.seed), - "height": height, - "width": width, - } - - if image is not None: - kwargs["image"] = args.image.convert("RGB") - - save_dir = os.path.join("result", args.outdir) - os.makedirs(save_dir, exist_ok=True) - - with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): - print(f"infer kwargs:{kwargs}") - video_frames = pipe(**kwargs)[0] - - if local_rank == 0: - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" - output_path = os.path.join(save_dir, video_out_file) - imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) + for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size + if local_rank == 0: + print(f"Generating video {idx+1} of {args.batch_size}") 
+ + #20250422 pftq: Synchronize seed across all ranks + if args.use_usp: + try: + #20250422 pftq: Synchronize ranks before seed broadcasting + dist.barrier() + + #20250422 pftq: Always broadcast seed to ensure consistency + if local_rank == 0: + if args.seed == -1 or idx > 0: + args.seed = int(random.randrange(4294967294)) + seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") + dist.broadcast(seed_tensor, src=0) + args.seed = seed_tensor.item() + + #20250422 pftq: Synchronize ranks after seed broadcasting + dist.barrier() + except Exception as e: + print(f"[Rank {local_rank}] Seed broadcasting error: {e}") + dist.destroy_process_group() + raise + + else: + #20250422 pftq: Single GPU seed initialization + if args.seed == -1 or idx > 0: + args.seed = int(random.randrange(4294967294)) + + #20250422 pftq: Set seeds for reproducibility + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + kwargs = { + "prompt": prompt_input, + "negative_prompt": negative_prompt, + "num_frames": args.num_frames, + "num_inference_steps": args.inference_steps, + "guidance_scale": args.guidance_scale, + "shift": args.shift, + "generator": torch.Generator(device="cuda").manual_seed(args.seed), + "height": height, + "width": width, + } + + if image is not None: + #kwargs["image"] = load_image(args.image).convert("RGB") + # 20250422 pftq: redundant reloading of the image + kwargs["image"] = image + + save_dir = os.path.join("result", args.outdir) + os.makedirs(save_dir, exist_ok=True) + + with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): + print(f"infer kwargs:{kwargs}") + video_frames = pipe(**kwargs)[0] + + if local_rank == 0: + current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" + + # 20250422 pftq: more useful filename + video_out_file = f"{current_time}_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + + output_path = os.path.join(save_dir, video_out_file) + imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) From cdf25f65d44c98b1f444e144cfdd0cbbfaaddead Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 20:42:00 -0700 Subject: [PATCH 015/117] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 57d448a..a929dc1 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,6 @@ pip install torch==2.5.1 pip install --upgrade wheel setuptools pip install packaging pip install -r requirements.txt --no-build-isolation -pip install "huggingface_hub[cli]" -huggingface-cli download Skywork/SkyReels-V2-DF-14B-540P --local-dir ./SkyReels-V2-DF-14B-540P deactivate #always run at the start to use persisting drive From 7b135c0b7db1360a815396bdce3ca9b716da5c0c Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:33:11 -0700 Subject: [PATCH 016/117] Fixed CuSolver issues --- generate_video.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/generate_video.py b/generate_video.py index eb9a109..c1d09c2 100644 --- a/generate_video.py +++ b/generate_video.py @@ -154,6 +154,10 @@ prompt_input = prompt_enhancer(prompt_input) print(f"enhanced prompt: {prompt_input}") + + #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues + torch.backends.cuda.preferred_linalg_library("default") # or try "magma" 
if available + for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size if local_rank == 0: print(f"Generating video {idx+1} of {args.batch_size}") From 357f013916b27d50b643416d41db183e82884d91 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:33:27 -0700 Subject: [PATCH 017/117] Fixed CuSolver issues --- generate_video_df.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/generate_video_df.py b/generate_video_df.py index 1411ccc..0b55286 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -155,6 +155,10 @@ if args.causal_attention: pipe.transformer.set_ar_attention(args.causal_block_size) + + #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues + torch.backends.cuda.preferred_linalg_library("default") # or try "magma" if available + for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size if local_rank == 0: print(f"prompt:{prompt_input}") From 037012f1b58bb0940e4a397c849fe8a047fe61d7 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:46:45 -0700 Subject: [PATCH 018/117] Update generate_video.py --- generate_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video.py b/generate_video.py index c1d09c2..3ad38e2 100644 --- a/generate_video.py +++ b/generate_video.py @@ -170,7 +170,7 @@ #20250422 pftq: Always broadcast seed to ensure consistency if local_rank == 0: - if args.seed == -1 or idx > 0: + if args.seed == -1 or args.seed is None or idx > 0: args.seed = int(random.randrange(4294967294)) seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") dist.broadcast(seed_tensor, src=0) From dfc531de324418d69dd2056198ce808831e797ed Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:47:01 -0700 Subject: [PATCH 019/117] Update generate_video_df.py --- generate_video_df.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index 0b55286..f53a503 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -173,7 +173,7 @@ #20250422 pftq: Always broadcast seed to ensure consistency if local_rank == 0: - if args.seed == -1 or idx > 0: + if args.seed == -1 or args.seed is None or idx > 0: args.seed = int(random.randrange(4294967294)) seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") dist.broadcast(seed_tensor, src=0) From 74ac5dc0cfb7c5704d2c098949ab21eca5608540 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:49:53 -0700 Subject: [PATCH 020/117] Update generate_video_df.py --- generate_video_df.py | 1 + 1 file changed, 1 insertion(+) diff --git a/generate_video_df.py b/generate_video_df.py index f53a503..e858e6d 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -55,6 +55,7 @@ local_rank = 0 if args.use_usp: + assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." 
from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment import torch.distributed as dist From 1497cb0fec000ac02248c48ac985c9eb06706a66 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:50:11 -0700 Subject: [PATCH 021/117] Update generate_video.py --- generate_video.py | 1 + 1 file changed, 1 insertion(+) diff --git a/generate_video.py b/generate_video.py index 3ad38e2..fdb9145 100644 --- a/generate_video.py +++ b/generate_video.py @@ -65,6 +65,7 @@ local_rank = 0 if args.use_usp: + assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment import torch.distributed as dist From d7ad88ee2b5242b28f27f76d1e92304b53c393fd Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:57:24 -0700 Subject: [PATCH 022/117] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a929dc1..06852c6 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ - Added image broadcast/synchronization to avoid potential sync issues in multi-GPU. - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. +- Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). - Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. 
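[annotation] The resize-crop fix above relies on `resizecrop`, which is imported from skyreels_v2_infer.pipelines; its body never appears in this series. As a sketch only, a typical resize-then-center-crop helper of the same shape would be:

```python
from PIL import Image

def resize_crop(img: Image.Image, height: int, width: int) -> Image.Image:
    """Scale to cover the height x width box, then center-crop to it exactly."""
    scale = max(width / img.width, height / img.height)
    img = img.resize((round(img.width * scale), round(img.height * scale)))
    left = (img.width - width) // 2
    top = (img.height - height) // 2
    return img.crop((left, top, left + width, top + height))
```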
From 5b46048401fddf05d9fd34fc51467ecab2d417b5 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:57:46 -0700 Subject: [PATCH 023/117] Update generate_video.py --- generate_video.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/generate_video.py b/generate_video.py index fdb9145..d86c064 100644 --- a/generate_video.py +++ b/generate_video.py @@ -14,6 +14,7 @@ from skyreels_v2_infer.modules import download_model from skyreels_v2_infer.pipelines import Image2VideoPipeline from skyreels_v2_infer.pipelines import PromptEnhancer +from skyreels_v2_infer.pipelines import resizecrop from skyreels_v2_infer.pipelines import Text2VideoPipeline MODEL_ID_CONFIG = { @@ -104,6 +105,11 @@ if args.preserve_image_aspect_ratio: img_width, img_height = image.size height = int(width / img_width * img_height) + else: + image_width, image_height = image.size + if image_height > image_width: + height, width = width, height + image = resizecrop(image, height, width) except Exception as e: raise ValueError(f"Failed to load or process image: {e}") From 0898476d17b238ea8b9a0fc1126403445934df80 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 21:57:56 -0700 Subject: [PATCH 024/117] Update generate_video_df.py --- generate_video_df.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/generate_video_df.py b/generate_video_df.py index e858e6d..d61d722 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -11,6 +11,7 @@ from skyreels_v2_infer import DiffusionForcingPipeline from skyreels_v2_infer.modules import download_model +from skyreels_v2_infer.pipelines import resizecrop from skyreels_v2_infer.pipelines import PromptEnhancer if __name__ == "__main__": @@ -108,6 +109,11 @@ if args.preserve_image_aspect_ratio: img_width, img_height = image.size height = int(width / img_width * img_height) + else: + image_width, image_height = image.size + if image_height > image_width: + height, width = width, height + image = resizecrop(image, height, width) except Exception as e: raise ValueError(f"Failed to load or process image: {e}") From c284670584d031b2e7873b3787982e4fa7be4460 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 22:04:31 -0700 Subject: [PATCH 025/117] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 06852c6..a3a5df9 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,8 @@ cd /workspace/SkyReels-V2 Example prompt (multi-GPU): ``` model_id=Skywork/SkyReels-V2-DF-14B-540P -torchrun --nproc_per_node=2 generate_video_df.py \ +gpu_count=2 +torchrun --nproc_per_node=${gpu_count} generate_video_df.py \ --model_id ${model_id} \ --resolution 540P \ --ar_step 0 \ From 7a7d65d1357d14e0f68a1e51a6dc917e2b0b902c Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 22:30:26 -0700 Subject: [PATCH 026/117] Update generate_video_df.py --- generate_video_df.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index d61d722..5d01029 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -229,7 +229,10 @@ #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" # 20250422 pftq: more useful filename - video_out_file = f"{current_time}_cfg{guidance_scale}_steps{args.inference_steps}_seed{args.seed}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + gpucount = "" + if args.use_usp and dist.get_world_size(): + gpucount = "_"+dist.get_world_size()+"xGPU" + video_out_file = 
f"{current_time}_skyreels2df_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) From dd5dde34a1a68c588c424cc2a39474c891ffc7c3 Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 22:30:36 -0700 Subject: [PATCH 027/117] Update generate_video.py --- generate_video.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/generate_video.py b/generate_video.py index d86c064..e9e6b0d 100644 --- a/generate_video.py +++ b/generate_video.py @@ -230,7 +230,10 @@ #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" # 20250422 pftq: more useful filename - video_out_file = f"{current_time}_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + gpucount = "" + if args.use_usp and dist.get_world_size(): + gpucount = "_"+dist.get_world_size()+"xGPU" + video_out_file = f"{current_time}_skyreels2_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) From 9433d9c357d89b2b646541da085a6ee5db9d3d3e Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 23:48:08 -0700 Subject: [PATCH 028/117] Update generate_video.py --- generate_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video.py b/generate_video.py index e9e6b0d..5d2de0e 100644 --- a/generate_video.py +++ b/generate_video.py @@ -232,7 +232,7 @@ # 20250422 pftq: more useful filename gpucount = "" if args.use_usp and dist.get_world_size(): - gpucount = "_"+dist.get_world_size()+"xGPU" + gpucount = "_"+str(dist.get_world_size())+"xGPU" video_out_file = f"{current_time}_skyreels2_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" output_path = os.path.join(save_dir, video_out_file) From 2f8c320565bb135fff2b804d9a8d0aa7e61f594a Mon Sep 17 00:00:00 2001 From: pftq Date: Tue, 22 Apr 2025 23:48:20 -0700 Subject: [PATCH 029/117] Update generate_video_df.py --- generate_video_df.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index 5d01029..32a6575 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -231,7 +231,7 @@ # 20250422 pftq: more useful filename gpucount = "" if args.use_usp and dist.get_world_size(): - gpucount = "_"+dist.get_world_size()+"xGPU" + gpucount = "_"+str(dist.get_world_size())+"xGPU" video_out_file = f"{current_time}_skyreels2df_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" output_path = os.path.join(save_dir, video_out_file) From 46d836966c6f85596e70b7c1f6bd5b76b571c0aa Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 00:36:45 -0700 Subject: [PATCH 030/117] Update generate_video.py --- generate_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video.py b/generate_video.py index 5d2de0e..7eac127 100644 --- a/generate_video.py +++ b/generate_video.py @@ -233,7 +233,7 @@ gpucount = "" if args.use_usp and dist.get_world_size(): gpucount = 
"_"+str(dist.get_world_size())+"xGPU" - video_out_file = f"{current_time}_skyreels2_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + video_out_file = f"{current_time}_skyreels2_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) From b5f346b2a225bc7441b695d5d86995952a7a0cfa Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 00:36:57 -0700 Subject: [PATCH 031/117] Update generate_video_df.py --- generate_video_df.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index 32a6575..e0fb91e 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -232,7 +232,7 @@ gpucount = "" if args.use_usp and dist.get_world_size(): gpucount = "_"+str(dist.get_world_size())+"xGPU" - video_out_file = f"{current_time}_skyreels2df_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + video_out_file = f"{current_time}_skyreels2df_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) From 2e1717147b2ade9469c0630975fed8b0acf8de9c Mon Sep 17 00:00:00 2001 From: "fles@qq.com" Date: Wed, 23 Apr 2025 16:44:28 +0800 Subject: [PATCH 032/117] add mid video output --- generate_video_df.py | 9 ++++++-- .../pipelines/diffusion_forcing_pipeline.py | 23 +++++++++++++++---- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index b76d509..4b3e282 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -146,6 +146,10 @@ print(f"prompt:{prompt_input}") print(f"guidance_scale:{guidance_scale}") + output_path = "" + current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + video_out_file = f"{args.prompt[0][:100].replace('/','')}_{args.seed}_{current_time}.mp4" + with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): video_frames = pipe( prompt=prompt_input, @@ -165,10 +169,11 @@ ar_step=args.ar_step, causal_block_size=args.causal_block_size, fps=fps, + local_rank=local_rank, + save_dir=save_dir, + video_out_file=video_out_file, )[0] if local_rank == 0: - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - video_out_file = f"{args.prompt[0][:100].replace('/','')}_{args.seed}_{current_time}.mp4" output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 8cff0d5..34e6274 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -5,6 +5,7 @@ from typing import Tuple from typing import Union +import imageio import numpy as np import torch from diffusers.image_processor import PipelineImageInput @@ -220,6 +221,9 @@ def __call__( ar_step: int = 5, causal_block_size: 
int = None, fps: int = 24, + local_rank: int = 0, + save_dir: str = "", + video_out_file: str = "", ): latent_height = height // 8 latent_width = width // 8 @@ -346,10 +350,10 @@ def __call__( n_iter = 1 + (latent_length - base_num_frames - 1) // (base_num_frames - overlap_history_frames) + 1 print(f"n_iter:{n_iter}") output_video = None - for i in range(n_iter): + for i_n_iter in range(n_iter): if type(prompt) is list: - if len(prompt) > i: - prompt_embeds = prompt_embeds_list[i] + if len(prompt) > i_n_iter: + prompt_embeds = prompt_embeds_list[i_n_iter] if output_video is not None: # i !=0 prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device) prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)] @@ -443,7 +447,18 @@ def __call__( self.transformer.cpu() torch.cuda.empty_cache() x0 = latents[0].unsqueeze(0) - videos = [self.vae.decode(x0)[0]] + mid_output_video = self.vae.decode(x0) + videos = [mid_output_video[0]] + if local_rank == 0: + mid_output_video = (mid_output_video / 2 + 0.5).clamp(0, 1) + mid_output_video = [video for video in mid_output_video] + mid_output_video = [video.permute(1, 2, 3, 0) * 255 for video in mid_output_video] + mid_output_video = [video.cpu().numpy().astype(np.uint8) for video in mid_output_video] + + mid_video_out_file = f"mid_{i_n_iter}_{video_out_file}" + mid_output_path = os.path.join(save_dir, mid_video_out_file) + imageio.mimwrite(mid_output_path, mid_output_video[0], fps=fps, quality=8, output_params=["-loglevel", "error"]) + if output_video is None: output_video = videos[0].clamp(-1, 1).cpu() # c, f, h, w else: From bebb91be6aaf70bd3deefd177a8be2889c814d4e Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 02:57:15 -0700 Subject: [PATCH 033/117] Update generate_video_df.py --- generate_video_df.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/generate_video_df.py b/generate_video_df.py index e0fb91e..7780cad 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -150,6 +150,8 @@ gc.collect() torch.cuda.empty_cache() + # 20250423 pftq: needs to be fixed, 20-min load times on multi-GPU caused by contention + print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) pipe = DiffusionForcingPipeline( args.model_id, dit_path=args.model_id, @@ -158,6 +160,7 @@ use_usp=args.use_usp, offload=args.offload, ) + print("Finished initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) if args.causal_attention: pipe.transformer.set_ar_attention(args.causal_block_size) From ae4796e3c26b7e86594d3261717ba4dc38a6614d Mon Sep 17 00:00:00 2001 From: "fles@qq.com" Date: Wed, 23 Apr 2025 19:11:05 +0800 Subject: [PATCH 034/117] fix bug --- skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 34e6274..6fd429b 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -354,7 +354,7 @@ def __call__( if type(prompt) is list: if len(prompt) > i_n_iter: prompt_embeds = prompt_embeds_list[i_n_iter] - if output_video is not None: # i !=0 + if output_video is not None: # i_n_iter !=0 prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device) prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)] if prefix_video[0].shape[1] % causal_block_size != 0: @@ 
-362,10 +362,10 @@ def __call__( print("the length of prefix video is truncated for the casual block size alignment.") prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len] predix_video_latent_length = prefix_video[0].shape[1] - finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames + finished_frame_num = i_n_iter * (base_num_frames - overlap_history_frames) + overlap_history_frames left_frame_num = latent_length - finished_frame_num base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames) - else: # i == 0 + else: # i_n_iter == 0 base_num_frames_iter = base_num_frames latent_shape = [16, base_num_frames_iter, latent_height, latent_width] latents = self.prepare_latents( From 3206feb0bab1a0084f2afdc914f044dad146bea9 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 04:56:10 -0700 Subject: [PATCH 035/117] Fixed edge case for tensor issue if dimensions not divisible by 8. --- generate_video_df.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 7780cad..4a45a7a 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -108,7 +108,19 @@ # 20250422 pftq: option to preserve image aspect ratio if args.preserve_image_aspect_ratio: img_width, img_height = image.size - height = int(width / img_width * img_height) + if img_height > img_width: + height, width = width, height + width = int(height / img_height * img_width) + else: + height = int(width / img_width * img_height) + + divisibility=8 + if width%divisibility!=0: + width = width - (width%divisibility) + if height%divisibility!=0: + height = height - (height%divisibility) + + image = resizecrop(image, height, width) else: image_width, image_height = image.size if image_height > image_width: @@ -118,6 +130,14 @@ raise ValueError(f"Failed to load or process image: {e}") if args.use_usp: + #20250422 pftq: Broadcast height and width to ensure consistency + height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda") + width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda") + dist.broadcast(height_tensor, src=0) + dist.broadcast(width_tensor, src=0) + height = height_tensor.item() + width = width_tensor.item() + # Broadcast image to other ranks image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None if local_rank == 0: @@ -127,14 +147,8 @@ dist.broadcast(image_data, src=0) image = Image.fromarray(image_data.cpu().numpy()) - #20250422 pftq: Broadcast height and width to ensure consistency - height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda") - width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda") - dist.broadcast(height_tensor, src=0) - dist.broadcast(width_tensor, src=0) - height = height_tensor.item() - width = width_tensor.item() - + print(f"Rank {local_rank}: {width}x{height}") + negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt save_dir = os.path.join("result", args.outdir) From d7a0878a2452c6a81845a1854817e9d514095e32 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 04:57:04 -0700 Subject: [PATCH 036/117] Fixed edge case for tensor issue if dimensions not divisible by 8. 
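
For reference, a minimal standalone sketch of the resizing logic this patch and 035 apply, assuming PIL and a resizecrop-style flow like the repo's; the helper name `fit_to_divisible` and the `divisibility` default are illustrative, not code from these patches:

```python
from PIL import Image

def fit_to_divisible(image: Image.Image, height: int, width: int, divisibility: int = 8):
    """Match the (height, width) target box to the image's orientation and aspect
    ratio, then round each side down to a multiple of `divisibility` so downstream
    tensor reshapes divide evenly."""
    img_width, img_height = image.size
    if img_height > img_width:
        # portrait input: swap the landscape defaults before rescaling
        height, width = width, height
        width = int(height / img_height * img_width)
    else:
        # landscape input: keep the target width, rescale the height
        height = int(width / img_width * img_height)
    # round down, never up, so the frame is cropped rather than padded
    width -= width % divisibility
    height -= height % divisibility
    return height, width
```

Rounding down keeps the output inside the source image, so the subsequent resizecrop trims a few pixels instead of padding.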
---
 generate_video.py | 37 +++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/generate_video.py b/generate_video.py
index 7eac127..3ece5b3 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -94,17 +94,31 @@
     #image = load_image(args.image).convert("RGB") if args.image else None
 
+    #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
     image = None
-    if args.image:
+    if args.image: 
         if local_rank == 0:
             try:
+                print("Loading image...")
                 image = load_image(args.image).convert("RGB")
 
                 # 20250422 pftq: option to preserve image aspect ratio
                 if args.preserve_image_aspect_ratio:
                     img_width, img_height = image.size
-                    height = int(width / img_width * img_height)
+                    if img_height > img_width:
+                        height, width = width, height
+                        width = int(height / img_height * img_width)
+                    else:
+                        height = int(width / img_width * img_height)
+
+                    divisibility=8
+                    if width%divisibility!=0:
+                        width = width - (width%divisibility)
+                    if height%divisibility!=0:
+                        height = height - (height%divisibility)
+
+                    image = resizecrop(image, height, width)
                 else:
                     image_width, image_height = image.size
                     if image_height > image_width:
@@ -114,22 +128,26 @@
             raise ValueError(f"Failed to load or process image: {e}")
 
     if args.use_usp:
+        #20250422 pftq: Broadcast height and width to ensure consistency
+        height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
+        width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
+        dist.broadcast(height_tensor, src=0)
+        dist.broadcast(width_tensor, src=0)
+        height = height_tensor.item()
+        width = width_tensor.item()
+
         # Broadcast image to other ranks
         image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None
         if local_rank == 0:
+            print(f"Broadcasting image from rank {local_rank}...")
             dist.broadcast(image_data, src=0)
         else:
+            print(f"Receiving image from rank {local_rank}...")
             image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
             dist.broadcast(image_data, src=0)
             image = Image.fromarray(image_data.cpu().numpy())
 
-        #20250422 pftq: Broadcast height and width to ensure consistency
-        height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
-        width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
-        dist.broadcast(height_tensor, src=0)
-        dist.broadcast(width_tensor, src=0)
-        height = height_tensor.item()
-        width = width_tensor.item()
+        print(f"Rank {local_rank}: {width}x{height}")
 
     negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
 
@@ -195,7 +213,6 @@
     if args.seed == -1 or idx > 0:
         args.seed = int(random.randrange(4294967294))
 
-    #20250422 pftq: Set seeds for reproducibility
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)

From 02cad9970f928beb59708398650b1d3fa3649091 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 05:16:04 -0700
Subject: [PATCH 037/117] Update generate_video.py

---
 generate_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generate_video.py b/generate_video.py
index 3ece5b3..3b6e945 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -112,7 +112,7 @@
                 else:
                     height = int(width / img_width * img_height)
 
-                    divisibility=8
+                    divisibility=16
                     if width%divisibility!=0:
                         width = width - (width%divisibility)
                     if height%divisibility!=0:

From dc694bb0a33b7fd6f793ef08267300a77b9822ed Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 05:16:18 -0700
Subject: [PATCH 038/117] Update generate_video_df.py

---
 generate_video_df.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generate_video_df.py b/generate_video_df.py
index 4a45a7a..4931b66 100644
--- a/generate_video_df.py
+++ b/generate_video_df.py
@@ -114,7 +114,7 @@
             else:
                 height = int(width / img_width * img_height)
 
-            divisibility=8
+            divisibility=16
             if width%divisibility!=0:
                 width = width - (width%divisibility)
             if height%divisibility!=0:

From 55610c227128787c78b37173a6290431772af755 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 05:21:54 -0700
Subject: [PATCH 039/117] Update generate_video_df.py

---
 generate_video_df.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/generate_video_df.py b/generate_video_df.py
index 4931b66..251ebf6 100644
--- a/generate_video_df.py
+++ b/generate_video_df.py
@@ -100,6 +100,8 @@
     #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
     image = None
+    if args.use_usp:
+        dist.barrier()
     if args.image:
         if local_rank == 0:
             try:
@@ -130,6 +132,7 @@
             raise ValueError(f"Failed to load or process image: {e}")
 
     if args.use_usp:
+        dist.barrier()
         #20250422 pftq: Broadcast height and width to ensure consistency
         height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
         width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
@@ -146,6 +149,7 @@
             image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
             dist.broadcast(image_data, src=0)
             image = Image.fromarray(image_data.cpu().numpy())
+        dist.barrier()
 
     print(f"Rank {local_rank}: {width}x{height}")

From 4368aae10f09236322c88041c79f65a2d9535f55 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 05:22:55 -0700
Subject: [PATCH 040/117] Update generate_video.py

---
 generate_video.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/generate_video.py b/generate_video.py
index 3b6e945..a9347df 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -97,6 +97,8 @@
     #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
     image = None
+    if args.use_usp:
+        dist.barrier()
     if args.image:
         if local_rank == 0:
             try:
@@ -128,6 +130,7 @@
             raise ValueError(f"Failed to load or process image: {e}")
 
     if args.use_usp:
+        dist.barrier()
         #20250422 pftq: Broadcast height and width to ensure consistency
         height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
         width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
@@ -146,6 +149,7 @@
             image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
             dist.broadcast(image_data, src=0)
             image = Image.fromarray(image_data.cpu().numpy())
+        dist.barrier()
 
     print(f"Rank {local_rank}: {width}x{height}")

From 4d521074547136c6c0bd1b90403cf1b766854677 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 06:37:25 -0700
Subject: [PATCH 041/117] Update generate_video_df.py

---
 generate_video_df.py | 118 ++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 70 deletions(-)

diff --git a/generate_video_df.py b/generate_video_df.py
index 251ebf6..216a5d6 100644
--- a/generate_video_df.py
+++ b/generate_video_df.py
@@ -54,25 +54,6 @@
     #20250422 pftq: unneeded with seed synchronization code
     #assert (args.use_usp and args.seed != -1) or (not args.use_usp), "usp mode requires a valid seed"
 
-    local_rank = 0
-    if args.use_usp:
-        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
-        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
-        import torch.distributed as dist
-
-        dist.init_process_group("nccl")
-        local_rank = dist.get_rank()
-        torch.cuda.set_device(dist.get_rank())
-        device = "cuda"
-
-        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
-
-        initialize_model_parallel(
-            sequence_parallel_degree=dist.get_world_size(),
-            ring_degree=1,
-            ulysses_degree=dist.get_world_size(),
-        )
-
     if args.resolution == "540P":
         height = 544
         width = 960
@@ -98,66 +79,62 @@
     shift = args.shift
     #image = load_image(args.image).convert("RGB") if args.image else None
 
-    #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
+    #20250422 pftq: Add error handling for image loading, aspect ratio preservation
     image = None
-    if args.use_usp:
-        dist.barrier()
     if args.image:
-        if local_rank == 0:
-            try:
-                image = load_image(args.image).convert("RGB")
-
-                # 20250422 pftq: option to preserve image aspect ratio
-                if args.preserve_image_aspect_ratio:
-                    img_width, img_height = image.size
-                    if img_height > img_width:
-                        height, width = width, height
-                        width = int(height / img_height * img_width)
-                    else:
-                        height = int(width / img_width * img_height)
+        try:
+            image = load_image(args.image).convert("RGB")
+
+            # 20250422 pftq: option to preserve image aspect ratio
+            if args.preserve_image_aspect_ratio:
+                img_width, img_height = image.size
+                if img_height > img_width:
+                    height, width = width, height
+                    width = int(height / img_height * img_width)
+                else:
+                    height = int(width / img_width * img_height)
 
-                    divisibility=16
-                    if width%divisibility!=0:
-                        width = width - (width%divisibility)
-                    if height%divisibility!=0:
-                        height = height - (height%divisibility)
+                divisibility=16
+                if width%divisibility!=0:
+                    width = width - (width%divisibility)
+                if height%divisibility!=0:
+                    height = height - (height%divisibility)
 
-                    image = resizecrop(image, height, width)
-                else:
-                    image_width, image_height = image.size
-                    if image_height > image_width:
-                        height, width = width, height
-                    image = resizecrop(image, height, width)
-            except Exception as e:
-                raise ValueError(f"Failed to load or process image: {e}")
-
-    if args.use_usp:
-        dist.barrier()
-        #20250422 pftq: Broadcast height and width to ensure consistency
-        height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
-        width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
-        dist.broadcast(height_tensor, src=0)
-        dist.broadcast(width_tensor, src=0)
-        height = height_tensor.item()
-        width = width_tensor.item()
-
-        # Broadcast image to other ranks
-        image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None
-        if local_rank == 0:
-            dist.broadcast(image_data, src=0)
+                image = resizecrop(image, height, width)
         else:
-            image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
-            dist.broadcast(image_data, src=0)
-            image = Image.fromarray(image_data.cpu().numpy())
-        dist.barrier()
-
-    print(f"Rank {local_rank}: {width}x{height}")
+            image_width, image_height = image.size
+            if image_height > image_width:
+                height, width = width, height
+            image = resizecrop(image, height, width)
+        except Exception as e:
+            raise ValueError(f"Failed to load or process image: {e}")
+
+    print(f"{width}x{height} | Image: "+str(image!=None))
 
     negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
 
     save_dir = os.path.join("result", args.outdir)
     os.makedirs(save_dir, exist_ok=True)
 
+    local_rank = 0
+    if args.use_usp:
+        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
+        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
+        import torch.distributed as dist
+
+        dist.init_process_group("nccl")
+        local_rank = dist.get_rank()
+        torch.cuda.set_device(dist.get_rank())
+        device = "cuda"
+
+        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
+
+        initialize_model_parallel(
+            sequence_parallel_degree=dist.get_world_size(),
+            ring_degree=1,
+            ulysses_degree=dist.get_world_size(),
+        )
+
     prompt_input = args.prompt
     if args.prompt_enhancer and args.image is None:
         print(f"init prompt enhancer")
@@ -186,11 +163,12 @@
     #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues
     torch.backends.cuda.preferred_linalg_library("default")  # or try "magma" if available
+
+    print(f"Rank {local_rank} prompt:{prompt_input}")
+    print(f"Rank {local_rank} guidance_scale:{guidance_scale}")
 
     for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size
         if local_rank == 0:
-            print(f"prompt:{prompt_input}")
-            print(f"guidance_scale:{guidance_scale}")
             print(f"Generating video {idx+1} of {args.batch_size}")
 
         #20250422 pftq: Synchronize seed across all ranks

From af8889b8ae004bc6885eb3fbf7aec8a669caa26b Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 06:37:38 -0700
Subject: [PATCH 042/117] Update generate_video.py

---
 generate_video.py | 115 +++++++++++++++++++---------------------------
 1 file changed, 46 insertions(+), 69 deletions(-)

diff --git a/generate_video.py b/generate_video.py
index a9347df..30ab503 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -64,25 +64,6 @@
     #20250422 pftq: unneeded with seed synchronization code
     #assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed"
 
-    local_rank = 0
-    if args.use_usp:
-        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
-        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
-        import torch.distributed as dist
-
-        dist.init_process_group("nccl")
-        local_rank = dist.get_rank()
-        torch.cuda.set_device(dist.get_rank())
-        device = "cuda"
-
-        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
-
-        initialize_model_parallel(
-            sequence_parallel_degree=dist.get_world_size(),
-            ring_degree=1,
-            ulysses_degree=dist.get_world_size(),
-        )
-
     if args.resolution == "540P":
         height = 544
         width = 960
@@ -95,66 +76,62 @@
 
     #image = load_image(args.image).convert("RGB") if args.image else None
 
-    #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
+    #20250422 pftq: Add error handling for image loading, aspect ratio preservation
     image = None
-    if args.use_usp:
-        dist.barrier()
     if args.image:
-        if local_rank == 0:
-            try:
-                print("Loading image...")
-                image = load_image(args.image).convert("RGB")
-
-                # 20250422 pftq: option to preserve image aspect ratio
-                if args.preserve_image_aspect_ratio:
-                    img_width, img_height = image.size
-                    if img_height > img_width:
-                        height, width = width, height
-                        width = int(height / img_height * img_width)
-                    else:
-                        height = int(width / img_width * img_height)
+        try:
+            print("Loading image...")
+            image = load_image(args.image).convert("RGB")
+
+            # 20250422 pftq: option to preserve image aspect ratio
+            if args.preserve_image_aspect_ratio:
+                img_width, img_height = image.size
+                if img_height > img_width:
+                    height, width = width, height
+                    width = int(height / img_height * img_width)
+                else:
+                    height = int(width / img_width * img_height)
 
-                    divisibility=16
-                    if width%divisibility!=0:
-                        width = width - (width%divisibility)
-                    if height%divisibility!=0:
-                        height = height - (height%divisibility)
+                divisibility=16
+                if width%divisibility!=0:
+                    width = width - (width%divisibility)
+                if height%divisibility!=0:
+                    height = height - (height%divisibility)
 
-                    image = resizecrop(image, height, width)
-                else:
-                    image_width, image_height = image.size
-                    if image_height > image_width:
-                        height, width = width, height
-                    image = resizecrop(image, height, width)
-            except Exception as e:
-                raise ValueError(f"Failed to load or process image: {e}")
-
-    if args.use_usp:
-        dist.barrier()
-        #20250422 pftq: Broadcast height and width to ensure consistency
-        height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
-        width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
-        dist.broadcast(height_tensor, src=0)
-        dist.broadcast(width_tensor, src=0)
-        height = height_tensor.item()
-        width = width_tensor.item()
-
-        # Broadcast image to other ranks
-        image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None
-        if local_rank == 0:
-            print(f"Broadcasting image from rank {local_rank}...")
-            dist.broadcast(image_data, src=0)
+                image = resizecrop(image, height, width)
         else:
-            print(f"Receiving image from rank {local_rank}...")
-            image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
-            dist.broadcast(image_data, src=0)
-            image = Image.fromarray(image_data.cpu().numpy())
-        dist.barrier()
+            image_width, image_height = image.size
+            if image_height > image_width:
+                height, width = width, height
+            image = resizecrop(image, height, width)
+        except Exception as e:
+            raise ValueError(f"Failed to load or process image: {e}")
 
-    print(f"Rank {local_rank}: {width}x{height}")
+    print(f"{width}x{height} | Image: "+str(image!=None))
 
     negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
 
+    local_rank = 0
+    if args.use_usp:
+        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
+        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
+        import torch.distributed as dist
+
+        dist.init_process_group("nccl")
+        local_rank = dist.get_rank()
+        torch.cuda.set_device(dist.get_rank())
+        device = "cuda"
+
+        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
+
+        initialize_model_parallel(
+            sequence_parallel_degree=dist.get_world_size(),
+            ring_degree=1,
+            ulysses_degree=dist.get_world_size(),
+        )
+
     prompt_input = args.prompt
     if args.prompt_enhancer and args.image is None:
         print(f"init prompt enhancer")

From d1c2b761c0ca1eb944a08c44628807bc29330981 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 06:38:19 -0700
Subject: [PATCH 043/117] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index a3a5df9..22bfa52 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,5 @@
 ## Changes from pftq:
 - Added seed synchronization code to allow random seed with multi-GPU.
-- Added image broadcast/synchronization to avoid potential sync issues in multi-GPU.
 - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time.
 - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio.
 - Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code).

From 6c4017768006d39273d2b07ad2a07d0e89c77aae Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 06:57:17 -0700
Subject: [PATCH 044/117] Match TeaCache update

---
 skyreels_v2_infer/modules/transformer.py | 106 ++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 3 deletions(-)

diff --git a/skyreels_v2_infer/modules/transformer.py b/skyreels_v2_infer/modules/transformer.py
index 6f41558..3bef5ce 100644
--- a/skyreels_v2_infer/modules/transformer.py
+++ b/skyreels_v2_infer/modules/transformer.py
@@ -1,6 +1,6 @@
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import math
-
+import numpy as np
 import torch
 import torch.amp as amp
 import torch.nn as nn
@@ -484,6 +484,7 @@ def __init__(
         self.num_frame_per_block = 1
         self.flag_causal_attention = False
         self.block_mask = None
+        self.enable_teacache = False
 
         # embeddings
         self.patch_embedding = nn.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size)
@@ -574,6 +575,50 @@ def attention_mask(b, h, q_idx, kv_idx):
 
         return block_mask
 
+    def initialize_teacache(self, enable_teacache=True, num_steps=25, teacache_thresh=0.15, use_ret_steps=False, ckpt_dir=''):
+        self.enable_teacache = enable_teacache
+        print('using teacache')
+        self.cnt = 0
+        self.num_steps = num_steps
+        self.teacache_thresh = teacache_thresh
+        self.accumulated_rel_l1_distance_even = 0
+        self.accumulated_rel_l1_distance_odd = 0
+        self.previous_e0_even = None
+        self.previous_e0_odd = None
+        self.previous_residual_even = None
+        self.previous_residual_odd = None
+        self.use_ref_steps = use_ret_steps
+        if "I2V" in ckpt_dir:
+            if use_ret_steps:
+                if '540P' in ckpt_dir:
+                    self.coefficients = [ 2.57151496e+05, -3.54229917e+04,  1.40286849e+03, -1.35890334e+01, 1.32517977e-01]
+                if '720P' in ckpt_dir:
+                    self.coefficients = [ 8.10705460e+03,  2.13393892e+03, -3.72934672e+02,  1.66203073e+01, -4.17769401e-02]
+                self.ret_steps = 5*2
+                self.cutoff_steps = num_steps*2
+            else:
+                if '540P' in ckpt_dir:
+                    self.coefficients = [-3.02331670e+02,  2.23948934e+02, -5.25463970e+01,  5.87348440e+00, -2.01973289e-01]
+                if '720P' in ckpt_dir:
+                    self.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
+                self.ret_steps = 1*2
+                self.cutoff_steps = num_steps*2 - 2
+        else:
+            if use_ret_steps:
+                if '1.3B' in ckpt_dir:
+                    self.coefficients = [-5.21862437e+04,  9.23041404e+03, -5.28275948e+02,  1.36987616e+01, -4.99875664e-02]
+                if '14B' in ckpt_dir:
+                    self.coefficients = [-3.03318725e+05,  4.90537029e+04, -2.65530556e+03,  5.87365115e+01, -3.15583525e-01]
+                self.ret_steps = 5*2
+                self.cutoff_steps = num_steps*2
+            else:
+                if '1.3B' in ckpt_dir:
+                    self.coefficients = [2.39676752e+03, -1.31110545e+03,  2.01331979e+02, -8.29855975e+00, 1.37887774e-01]
+                if '14B' in ckpt_dir:
+                    self.coefficients = [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
+                self.ret_steps = 1*2
+                self.cutoff_steps = num_steps*2 - 2
+
     def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
         r"""
         Forward pass through the diffusion model
@@ -664,13 +709,68 @@ def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
         # arguments
         kwargs = dict(e=e0, grid_sizes=grid_sizes, freqs=self.freqs, context=context, block_mask=self.block_mask)
 
-        for block in self.blocks:
-            x = block(x, **kwargs)
+        if self.enable_teacache:
+            modulated_inp = e0 if self.use_ref_steps else e
+            # teacache
+            if self.cnt%2==0: # even -> conditon
+                self.is_even = True
+                if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
+                    should_calc_even = True
+                    self.accumulated_rel_l1_distance_even = 0
+                else:
+                    rescale_func = np.poly1d(self.coefficients)
+                    self.accumulated_rel_l1_distance_even += rescale_func(((modulated_inp-self.previous_e0_even).abs().mean() / self.previous_e0_even.abs().mean()).cpu().item())
+                    if self.accumulated_rel_l1_distance_even < self.teacache_thresh:
+                        should_calc_even = False
+                    else:
+                        should_calc_even = True
+                        self.accumulated_rel_l1_distance_even = 0
+                self.previous_e0_even = modulated_inp.clone()
+
+            else: # odd -> unconditon
+                self.is_even = False
+                if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
+                    should_calc_odd = True
+                    self.accumulated_rel_l1_distance_odd = 0
+                else:
+                    rescale_func = np.poly1d(self.coefficients)
+                    self.accumulated_rel_l1_distance_odd += rescale_func(((modulated_inp-self.previous_e0_odd).abs().mean() / self.previous_e0_odd.abs().mean()).cpu().item())
+                    if self.accumulated_rel_l1_distance_odd < self.teacache_thresh:
+                        should_calc_odd = False
+                    else:
+                        should_calc_odd = True
+                        self.accumulated_rel_l1_distance_odd = 0
+                self.previous_e0_odd = modulated_inp.clone()
+
+        if self.enable_teacache:
+            if self.is_even:
+                if not should_calc_even:
+                    x += self.previous_residual_even
+                else:
+                    ori_x = x.clone()
+                    for block in self.blocks:
+                        x = block(x, **kwargs)
+                    self.previous_residual_even = x - ori_x
+            else:
+                if not should_calc_odd:
+                    x += self.previous_residual_odd
+                else:
+                    ori_x = x.clone()
+                    for block in self.blocks:
+                        x = block(x, **kwargs)
+                    self.previous_residual_odd = x - ori_x
+
+        else:
+            for block in self.blocks:
+                x = block(x, **kwargs)
 
         x = self.head(x, e)
 
         # unpatchify
         x = self.unpatchify(x, grid_sizes)
+        self.cnt += 1
+        if self.cnt >= self.num_steps:
+            self.cnt = 0
         return x.float()
 
     def unpatchify(self, x, grid_sizes):

From f0dbdd2f30b7c2d95c0006303b1c46d867f6678e Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 06:57:45 -0700
Subject: [PATCH 045/117] Match TeaCache update

---
 skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py
index c6f39ec..b27f1d8 100644
--- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py
+++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py
@@ -328,6 +328,9 @@ def __call__(
                 finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames
                 left_frame_num = latent_length - finished_frame_num
                 base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames)
+                if ar_step > 0 and self.transformer.enable_teacache:
+                    num_steps = num_inference_steps + ((base_num_frames_iter - overlap_history_frames) // causal_block_size - 1) * ar_step
+                    self.transformer.num_steps = num_steps
             else: # i == 0
                 base_num_frames_iter = base_num_frames
             latent_shape = [16, base_num_frames_iter, latent_height, latent_width]

From 70dfc8c23ea62bcf978e6f0c3b88b75c4776c73d Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 06:59:11 -0700
Subject: [PATCH 046/117] Match TeaCache update

---
 generate_video_df.py | 150 ++++++++++++++++++++++++++++---------------
 1 file changed, 100 insertions(+), 50 deletions(-)

diff --git a/generate_video_df.py b/generate_video_df.py
index 216a5d6..1e30e63 100644
--- a/generate_video_df.py
+++ b/generate_video_df.py
@@ -41,6 +41,16 @@
         default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.",
     )
     parser.add_argument("--prompt_enhancer", action="store_true")
+    parser.add_argument("--teacache", action="store_true")
+    parser.add_argument(
+        "--teacache_thresh",
+        type=float,
+        default=0.2,
+        help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
+    parser.add_argument(
+        "--use_ret_steps",
+        action="store_true",
+        help="Using Retention Steps will result in faster generation speed and better generation quality.")
     parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video
     parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing
@@ -54,6 +64,25 @@
     #20250422 pftq: unneeded with seed synchronization code
     #assert (args.use_usp and args.seed != -1) or (not args.use_usp), "usp mode requires a valid seed"
 
+    local_rank = 0
+    if args.use_usp:
+        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
+        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
+        import torch.distributed as dist
+
+        dist.init_process_group("nccl")
+        local_rank = dist.get_rank()
+        torch.cuda.set_device(dist.get_rank())
+        device = "cuda"
+
+        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
+
+        initialize_model_parallel(
+            sequence_parallel_degree=dist.get_world_size(),
+            ring_degree=1,
+            ulysses_degree=dist.get_world_size(),
+        )
+
     if args.resolution == "540P":
         height = 544
         width = 960
@@ -79,62 +108,75 @@
     shift = args.shift
     #image = load_image(args.image).convert("RGB") if args.image else None
 
-    #20250422 pftq: Add error handling for image loading, aspect ratio preservation
+    #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
     image = None
+    if args.use_usp:
+        dist.barrier()
     if args.image:
-        try:
-            image = load_image(args.image).convert("RGB")
-
-            # 20250422 pftq: option to preserve image aspect ratio
-            if args.preserve_image_aspect_ratio:
-                img_width, img_height = image.size
-                if img_height > img_width:
-                    height, width = width, height
-                    width = int(height / img_height * img_width)
+        if local_rank == 0:
+            try:
+                image = load_image(args.image).convert("RGB")
+
+                # 20250422 pftq: option to preserve image aspect ratio
+                if args.preserve_image_aspect_ratio:
+                    img_width, img_height = image.size
+                    if img_height > img_width:
+                        height, width = width, height
+                        width = int(height / img_height * img_width)
+                    else:
+                        height = int(width / img_width * img_height)
+
+                    divisibility=16
+                    if width%divisibility!=0:
+                        width = width - (width%divisibility)
+                    if height%divisibility!=0:
+                        height = height - (height%divisibility)
+
+                    image = resizecrop(image, height, width)
                 else:
-                    height = int(width / img_width * img_height)
-
-                divisibility=16
-                if width%divisibility!=0:
-                    width = width - (width%divisibility)
-                if height%divisibility!=0:
-                    height = height - (height%divisibility)
-
-                image = resizecrop(image, height, width)
-        else:
-            image_width, image_height = image.size
-            if image_height > image_width:
-                height, width = width, height
-            image = resizecrop(image, height, width)
-        except Exception as e:
-            raise ValueError(f"Failed to load or process image: {e}")
-
-    print(f"{width}x{height} | Image: "+str(image!=None))
+                    image_width, image_height = image.size
+                    if image_height > image_width:
+                        height, width = width, height
+                    image = resizecrop(image, height, width)
+            except Exception as e:
+                raise ValueError(f"Failed to load or process image: {e}")
+
+    if args.use_usp:
+        dist.barrier()
+        try:
+            #20250422 pftq: Broadcast height and width to ensure consistency
+            height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
+            width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
+            dist.broadcast(height_tensor, src=0)
+            dist.broadcast(width_tensor, src=0)
+            height = height_tensor.item()
+            width = width_tensor.item()
+
+            # Broadcast image to other ranks
+            image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None
+            if local_rank == 0:
+                print(f"Broadcasting image from rank {local_rank}...")
+                dist.broadcast(image_data, src=0)
+            else:
+                print(f"Receiving image from rank {local_rank}...")
+                image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
+                dist.broadcast(image_data, src=0)
+                image = Image.fromarray(image_data.cpu().numpy())
+            dist.barrier()
+
+        except Exception as e:
+            print(f"[Rank {local_rank}] Image broadcasting error: {e}")
+            if args.use_usp:
+                dist.destroy_process_group()
+            raise
+
+    print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None))
 
     negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
 
     save_dir = os.path.join("result", args.outdir)
     os.makedirs(save_dir, exist_ok=True)
 
-    local_rank = 0
-    if args.use_usp:
-        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
-        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
-        import torch.distributed as dist
-
-        dist.init_process_group("nccl")
-        local_rank = dist.get_rank()
-        torch.cuda.set_device(dist.get_rank())
-        device = "cuda"
-
-        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
-
-        initialize_model_parallel(
-            sequence_parallel_degree=dist.get_world_size(),
-            ring_degree=1,
-            ulysses_degree=dist.get_world_size(),
-        )
-
     prompt_input = args.prompt
     if args.prompt_enhancer and args.image is None:
         print(f"init prompt enhancer")
@@ -160,15 +202,23 @@
     if args.causal_attention:
         pipe.transformer.set_ar_attention(args.causal_block_size)
 
+    if args.teacache:
+        if args.ar_step > 0:
+            num_steps = args.inference_steps + (((args.base_num_frames - 1)//4 + 1) // args.causal_block_size - 1) * args.ar_step
+            print('num_steps:', num_steps)
+        else:
+            num_steps = args.inference_steps
+        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=num_steps,
+                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
+                                             ckpt_dir=args.model_id)
+
     #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues
     torch.backends.cuda.preferred_linalg_library("default")  # or try "magma" if available
-
-    print(f"Rank {local_rank} prompt:{prompt_input}")
-    print(f"Rank {local_rank} guidance_scale:{guidance_scale}")
 
     for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size
         if local_rank == 0:
+            print(f"prompt:{prompt_input}")
+            print(f"guidance_scale:{guidance_scale}")
             print(f"Generating video {idx+1} of {args.batch_size}")
 
         #20250422 pftq: Synchronize seed across all ranks

From 19a2dc1d2050defb286bf9b0cb96fe8dc58acd5b Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 07:00:48 -0700
Subject: [PATCH 047/117] Match TeaCache update

---
 generate_video.py | 130 +++++++++++++++++++++++++++++-----------------
 1 file changed, 82 insertions(+), 48 deletions(-)

diff --git a/generate_video.py b/generate_video.py
index 30ab503..9bc9fa3 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -51,6 +51,16 @@
         default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.",
     )
     parser.add_argument("--prompt_enhancer", action="store_true")
+    parser.add_argument("--teacache", action="store_true")
+    parser.add_argument(
+        "--teacache_thresh",
+        type=float,
+        default=0.2,
+        help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
+    parser.add_argument(
+        "--use_ret_steps",
+        action="store_true",
+        help="Using Retention Steps will result in faster generation speed and better generation quality.")
     parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video
     parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing
@@ -64,6 +74,25 @@
     #20250422 pftq: unneeded with seed synchronization code
     #assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed"
 
+    local_rank = 0
+    if args.use_usp:
+        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
+        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
+        import torch.distributed as dist
+
+        dist.init_process_group("nccl")
+        local_rank = dist.get_rank()
+        torch.cuda.set_device(dist.get_rank())
+        device = "cuda"
+
+        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
+
+        initialize_model_parallel(
+            sequence_parallel_degree=dist.get_world_size(),
+            ring_degree=1,
+            ulysses_degree=dist.get_world_size(),
+        )
+
     if args.resolution == "540P":
         height = 544
         width = 960
@@ -76,62 +105,67 @@
 
     #image = load_image(args.image).convert("RGB") if args.image else None
 
-    #20250422 pftq: Add error handling for image loading, aspect ratio preservation
+    #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
     image = None
-    if args.use_usp:
-        dist.barrier()
     if args.image:
-        try:
-            print("Loading image...")
-            image = load_image(args.image).convert("RGB")
-
-            # 20250422 pftq: option to preserve image aspect ratio
-            if args.preserve_image_aspect_ratio:
-                img_width, img_height = image.size
-                if img_height > img_width:
-                    height, width = width, height
-                    width = int(height / img_height * img_width)
+        if local_rank == 0:
+            try:
+                print("Loading image...")
+                image = load_image(args.image).convert("RGB")
+
+                # 20250422 pftq: option to preserve image aspect ratio
+                if args.preserve_image_aspect_ratio:
+                    img_width, img_height = image.size
+                    if img_height > img_width:
+                        height, width = width, height
+                        width = int(height / img_height * img_width)
+                    else:
+                        height = int(width / img_width * img_height)
+
+                    divisibility=8
+                    if width%divisibility!=0:
+                        width = width + (width%divisibility)
+                    if height%divisibility!=0:
+                        height = height + (height%divisibility)
+
+                    image = resizecrop(image, height, width)
                 else:
-                    height = int(width / img_width * img_height)
-
-                divisibility=16
-                if width%divisibility!=0:
-                    width = width - (width%divisibility)
-                if height%divisibility!=0:
-                    height = height - (height%divisibility)
-
-                image = resizecrop(image, height, width)
+                    image_width, image_height = image.size
+                    if image_height > image_width:
+                        height, width = width, height
+                    image = resizecrop(image, height, width)
+            except Exception as e:
+                raise ValueError(f"Failed to load or process image: {e}")
+
+        if args.use_usp:
+            # Broadcast image to other ranks
+            image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None
+            if local_rank == 0:
+                print(f"Broadcasting image from rank {local_rank}...")
+                dist.broadcast(image_data, src=0)
             else:
-                image_width, image_height = image.size
-                if image_height > image_width:
-                    height, width = width, height
-                image = resizecrop(image, height, width)
-        except Exception as e:
-            raise ValueError(f"Failed to load or process image: {e}")
-
-    print(f"{width}x{height} | Image: "+str(image!=None))
-
-    negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
-
-    local_rank = 0
-    if args.use_usp:
-        assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter."
-        from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment
-        import torch.distributed as dist
+                print(f"Receiving image from rank {local_rank}...")
+                image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
+                dist.broadcast(image_data, src=0)
+                image = Image.fromarray(image_data.cpu().numpy())
 
-        dist.init_process_group("nccl")
-        local_rank = dist.get_rank()
-        torch.cuda.set_device(dist.get_rank())
-        device = "cuda"
+            #20250422 pftq: Broadcast height and width to ensure consistency
+            height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
+            width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
+            dist.broadcast(height_tensor, src=0)
+            dist.broadcast(width_tensor, src=0)
+            height = height_tensor.item()
+            width = width_tensor.item()
 
-        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
+    print(f"Rank {local_rank}: {width}x{height}")
 
-        initialize_model_parallel(
-            sequence_parallel_degree=dist.get_world_size(),
-            ring_degree=1,
-            ulysses_degree=dist.get_world_size(),
-        )
+    negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
 
+    if args.teacache:
+        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=args.inference_steps,
+                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
+                                             ckpt_dir=args.model_id)
+
     prompt_input = args.prompt
     if args.prompt_enhancer and args.image is None:
         print(f"init prompt enhancer")

From 225fdf80d8ae0d919d3382794e98a7e2f64504d6 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 07:03:18 -0700
Subject: [PATCH 048/117] Match TeaCache update

---
 README.md | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 22bfa52..76e6a6a 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,6 @@ python3 generate_video_df.py \
 Change "DF" to "I2V' or "T2V" accordingly if you don't want to use the infinite length version of the model.


-

SkyReels Logo

@@ -88,7 +87,7 @@ Change "DF" to "I2V' or "T2V" accordingly if you don't want to use the infinite

 ---
-Welcome to the SkyReels V2 repository! Here, you'll find the model weights and inference code for our infinite-lenght film genetative models
+Welcome to the SkyReels V2 repository! Here, you'll find the model weights and inference code for our infinite-length film generative models
 
 ## 🔥🔥🔥 News!!
 
@@ -270,13 +269,16 @@ python3 generate_video_df.py \
   --overlap_history 17 \
   --prompt "A graceful white swan with a curved neck and delicate feathers swimming in a serene lake at dawn, its reflection perfectly mirrored in the still water as mist rises from the surface, with the swan occasionally dipping its head into the water to feed." \
   --addnoise_condition 20 \
-  --offload
+  --offload \
+  --teacache \
+  --use_ret_steps \
+  --teacache_thresh 0.3
 ```
 
 asynchronous generation for 30s video
 ```shell
 model_id=Skywork/SkyReels-V2-DF-14B-540P
-# synchronous inference
+# asynchronous inference
 python3 generate_video_df.py \
   --model_id ${model_id} \
   --resolution 540P \
@@ -293,7 +295,7 @@ python3 generate_video_df.py \
 > **Note**:
 > - If you want to run the **image-to-video (I2V)** task, add `--image ${image_path}` to your command and it is also better to use **text-to-video (T2V)**-like prompt which includes some descriptions of the first-frame image.
 > - For long video generation, you can just switch the `--num_frames`, e.g., `--num_frames 257` for 10s video, `--num_frames 377` for 15s video, `--num_frames 737` for 30s video, `--num_frames 1457` for 60s video. The number is not strictly aligned with the logical frame number for specified time duration, but it is aligned with some training parameters, which means it may perform better. When you use asynchronous inference with causal_block_size > 1, the `--num_frames` should be carefully set.
-> - You can use `--ar_step 5` to enable asynchronous inference. When asynchronous inference, `--causal_block_size 5` is recommanded while it is not supposed to be set for synchronous generation. REMEMBER that the frame latent number inputted into the model in every iteration, e.g., base frame latent number (e.g., (97-1)//4+1=25 for base_num_frames=97) and (e.g., (237-97-(97-17)x1+17-1)//4+1=20 for base_num_frames=97, num_frames=237, overlap_history=17) for the last iteration, MUST be divided by causal_block_size. If you find it too hard to calculate and set proper values, just use our recommanded setting above :). Asynchronous inference will take more steps to diffuse the whole sequence which means it will be SLOWER than synchronous mode. In our experiments, asynchronous inference may improve the instruction following and visual consistent performance.
+> - You can use `--ar_step 5` to enable asynchronous inference. When asynchronous inference, `--causal_block_size 5` is recommended while it is not supposed to be set for synchronous generation. REMEMBER that the frame latent number inputted into the model in every iteration, e.g., base frame latent number (e.g., (97-1)//4+1=25 for base_num_frames=97) and (e.g., (237-97-(97-17)x1+17-1)//4+1=20 for base_num_frames=97, num_frames=237, overlap_history=17) for the last iteration, MUST be divided by causal_block_size. If you find it too hard to calculate and set proper values, just use our recommended setting above :). Asynchronous inference will take more steps to diffuse the whole sequence which means it will be SLOWER than synchronous mode. In our experiments, asynchronous inference may improve the instruction following and visual consistent performance.
 > - To reduce peak VRAM, just lower the `--base_num_frames`, e.g., to 77 or 57, while keeping the same generative length `--num_frames` you want to generate. This may slightly reduce video quality, and it should not be set too small.
 > - `--addnoise_condition` is used to help smooth the long video generation by adding some noise to the clean condition. Too large noise can cause the inconsistency as well. 20 is a recommended value, and you may try larger ones, but it is recommended to not exceed 50.
 > - Generating a 540P video using the 1.3B model requires approximately 14.7GB peak VRAM, while the same resolution video using the 14B model demands around 51.2GB peak VRAM.
@@ -311,16 +313,19 @@ python3 generate_video.py \
   --shift 8.0 \
   --fps 24 \
   --prompt "A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface." \
-  --offload
+  --offload \
+  --teacache \
+  --use_ret_steps \
+  --teacache_thresh 0.3
 ```
 
 > **Note**:
-> - When using an **image-to-video (I2V)** model, you must provide an input image using the `--image ${image_path}` parameter. The `--guidance_scale 5.0` and `--shift 3.0` is recommanded for I2V model.
+> - When using an **image-to-video (I2V)** model, you must provide an input image using the `--image ${image_path}` parameter. The `--guidance_scale 5.0` and `--shift 3.0` is recommended for I2V model.
 > - Generating a 540P video using the 1.3B model requires approximately 14.7GB peak VRAM, while the same resolution video using the 14B model demands around 43.4GB peak VRAM.
 
 - **Prompt Enhancer**
 
-The prompt enhancer is implemented based on Qwen2.5-32B-Instruct and is utilized via the `--prompt_enhancer` parameter. It works ideally for short prompts, while for long prompts, it might generate an excessively lengthy prompt that could lead to over-saturation in the generative video. Note the peak memory of GPU is 64G+ if use `--prompt_enhancer`. If you want obtain the enhanced prompt separately, you can also run the prompt_dehancer script separately for testing. The steps are as follows:
+The prompt enhancer is implemented based on Qwen2.5-32B-Instruct and is utilized via the `--prompt_enhancer` parameter. It works ideally for short prompts, while for long prompts, it might generate an excessively lengthy prompt that could lead to over-saturation in the generative video. Note the peak memory of GPU is 64G+ if you use `--prompt_enhancer`. If you want to obtain the enhanced prompt separately, you can also run the prompt_enhancer script separately for testing. The steps are as follows:
 
 ```shell
 cd skyreels_v2_infer/pipelines
@@ -348,7 +353,10 @@ Below are the key parameters you can customize for video generation:
 | --offload | True | Offloads model components to CPU to reduce VRAM usage (recommended) |
 | --use_usp | True | Enables multi-GPU acceleration with xDiT USP |
 | --outdir | ./video_out | Directory where generated videos will be saved |
-| --prompt_enhancer | True | expand the prompt into a more detailed description |
+| --prompt_enhancer | True | Expand the prompt into a more detailed description |
+| --teacache | False | Enables teacache for faster inference |
+| --teacache_thresh | 0.2 | Higher speedup will cause to worse quality |
+| --use_ret_steps | False | Retention Steps for teacache |
 
 **Diffusion Forcing Additional Parameters**
 | Parameter | Recommended Value | Description |
@@ -396,7 +404,7 @@ torchrun --nproc_per_node=2 generate_video.py \
   --seed 42
 ```
 > **Note**:
-> - When using an **image-to-video (I2V)** model, you must provide an input image using the `--image ${image_path}` parameter. The `--guidance_scale 5.0` and `--shift 3.0` is recommanded for I2V model.
+> - When using an **image-to-video (I2V)** model, you must provide an input image using the `--image ${image_path}` parameter. The `--guidance_scale 5.0` and `--shift 3.0` is recommended for I2V model.
 
 ## Contents
 
@@ -406,7 +414,7 @@ torchrun --nproc_per_node=2 generate_video.py \
   - [Video Captioner](#video-captioner)
   - [Reinforcement Learning](#reinforcement-learning)
   - [Diffusion Forcing](#diffusion-forcing)
-  - [Hight-Quality Supervised Fine-Tuning(SFT)](#high-quality-supervised-fine-tuning-sft)
+  - [High-Quality Supervised Fine-Tuning(SFT)](#high-quality-supervised-fine-tuning-sft)
 - [Performance](#performance)
 - [Acknowledgements](#acknowledgements)
 - [Citation](#citation)
@@ -554,7 +562,7 @@ Inspired by the previous success in LLM, we propose to enhance the performance o
 - the generative model does not handle well with large, deformable motions.
 - the generated videos may violate the physical law.
 
-To avoid the degradation in other metrics, such as text alignment and video quality, we ensure the preference data pairs have comparable text alignment and video quality, while only the motion quality varies. This requirement poses greater challenges in obtaining preference annotations due to the inherently higher costs of human annotation. To address this challenge, we propose a semi-automatic pipeline that strategically combines automatically generated motion pairsand human annotation results. This hybrid approach not only enhances the data scale but also improves alignment with human preferences through curated quality control. Leveraging this enhanced dataset, we first train a specialized reward model to capture the generic motion quality differences between paired samples. This learned reward function subsequently guides the sample selection process for Direct Preference Optimization (DPO), enhancing the motionquality of the generative model.
+To avoid the degradation in other metrics, such as text alignment and video quality, we ensure the preference data pairs have comparable text alignment and video quality, while only the motion quality varies. This requirement poses greater challenges in obtaining preference annotations due to the inherently higher costs of human annotation. To address this challenge, we propose a semi-automatic pipeline that strategically combines automatically generated motion pairs and human annotation results. This hybrid approach not only enhances the data scale but also improves alignment with human preferences through curated quality control. Leveraging this enhanced dataset, we first train a specialized reward model to capture the generic motion quality differences between paired samples. This learned reward function subsequently guides the sample selection process for Direct Preference Optimization (DPO), enhancing the motion quality of the generative model.
 
 #### Diffusion Forcing

From 008f1761bdb43449294ddb86aa5e3eb77f390352 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 08:30:52 -0700
Subject: [PATCH 049/117] Reverted Teacache Update

---
 generate_video_df.py | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/generate_video_df.py b/generate_video_df.py
index 1e30e63..89e1116 100644
--- a/generate_video_df.py
+++ b/generate_video_df.py
@@ -41,16 +41,6 @@
         default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.",
     )
     parser.add_argument("--prompt_enhancer", action="store_true")
-    parser.add_argument("--teacache", action="store_true")
-    parser.add_argument(
-        "--teacache_thresh",
-        type=float,
-        default=0.2,
-        help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
-    parser.add_argument(
-        "--use_ret_steps",
-        action="store_true",
-        help="Using Retention Steps will result in faster generation speed and better generation quality.")
     parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video
     parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing
@@ -202,16 +192,6 @@
     if args.causal_attention:
         pipe.transformer.set_ar_attention(args.causal_block_size)
 
-    if args.teacache:
-        if args.ar_step > 0:
-            num_steps = args.inference_steps + (((args.base_num_frames - 1)//4 + 1) // args.causal_block_size - 1) * args.ar_step
-            print('num_steps:', num_steps)
-        else:
-            num_steps = args.inference_steps
-        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=num_steps,
-                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
-                                             ckpt_dir=args.model_id)
-
     #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues
     torch.backends.cuda.preferred_linalg_library("default")  # or try "magma" if available

From be6734085286751ed73ec6d36490371c4c6165d0 Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 08:31:23 -0700
Subject: [PATCH 050/117] Reverted Teacache Update

Due to broken code here https://github.com/SkyworkAI/SkyReels-V2/issues/36
---
 generate_video.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/generate_video.py b/generate_video.py
index 9bc9fa3..6df92f9 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -51,16 +51,6 @@
         default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.",
     )
     parser.add_argument("--prompt_enhancer", action="store_true")
-    parser.add_argument("--teacache", action="store_true")
-    parser.add_argument(
-        "--teacache_thresh",
-        type=float,
-        default=0.2,
-        help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
-    parser.add_argument(
-        "--use_ret_steps",
-        action="store_true",
-        help="Using Retention Steps will result in faster generation speed and better generation quality.")
     parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video
     parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing
@@ -160,11 +150,6 @@
     print(f"Rank {local_rank}: {width}x{height}")
 
     negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt
-
-    if args.teacache:
-        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=args.inference_steps,
-                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
-                                             ckpt_dir=args.model_id)
 
     prompt_input = args.prompt
     if args.prompt_enhancer and args.image is None:
         print(f"init prompt enhancer")

From 52ffadbeccd89d289036f5301f7b5c3da1099f9c Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 08:31:53 -0700
Subject: [PATCH 051/117] Reverted Teacache Update

Due to broken code https://github.com/SkyworkAI/SkyReels-V2/issues/36
---
 skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py
index b27f1d8..c6f39ec 100644
--- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py
+++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py
@@ -328,9 +328,6 @@ def __call__(
                 finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames
                 left_frame_num = latent_length - finished_frame_num
                 base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames)
-                if ar_step > 0 and self.transformer.enable_teacache:
-                    num_steps = num_inference_steps + ((base_num_frames_iter - overlap_history_frames) // causal_block_size - 1) * ar_step
-                    self.transformer.num_steps = num_steps
             else: # i == 0
                 base_num_frames_iter = base_num_frames
             latent_shape = [16, base_num_frames_iter, latent_height, latent_width]

From 34244cb06daeebf0f2118e27dc866fbbf15546df Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 08:32:21 -0700
Subject: [PATCH 052/117] Reverted Teacache Update

Due to broken code https://github.com/SkyworkAI/SkyReels-V2/issues/36
---
 skyreels_v2_infer/modules/transformer.py | 106 +----------------------
 1 file changed, 3 insertions(+), 103 deletions(-)

diff --git a/skyreels_v2_infer/modules/transformer.py b/skyreels_v2_infer/modules/transformer.py
index 3bef5ce..6f41558 100644
--- a/skyreels_v2_infer/modules/transformer.py
+++ b/skyreels_v2_infer/modules/transformer.py
@@ -1,6 +1,6 @@
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import math
-import numpy as np
+
 import torch
 import torch.amp as amp
 import torch.nn as nn
@@ -484,7 +484,6 @@ def __init__(
         self.num_frame_per_block = 1
         self.flag_causal_attention = False
         self.block_mask = None
-        self.enable_teacache = False
 
         # embeddings
         self.patch_embedding = nn.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size)
@@ -575,50 +574,6 @@ def attention_mask(b, h, q_idx, kv_idx):
 
         return block_mask
 
-    def initialize_teacache(self, enable_teacache=True, num_steps=25, teacache_thresh=0.15, use_ret_steps=False, ckpt_dir=''):
-        self.enable_teacache = enable_teacache
-        print('using teacache')
-        self.cnt = 0
-        self.num_steps = num_steps
-        self.teacache_thresh = teacache_thresh
-        self.accumulated_rel_l1_distance_even = 0
-        self.accumulated_rel_l1_distance_odd = 0
-        self.previous_e0_even = None
-        self.previous_e0_odd = None
-        self.previous_residual_even = None
-        self.previous_residual_odd = None
-        self.use_ref_steps = use_ret_steps
-        if "I2V" in ckpt_dir:
-            if use_ret_steps:
-                if '540P' in ckpt_dir:
-                    self.coefficients = [ 2.57151496e+05, -3.54229917e+04,  1.40286849e+03, -1.35890334e+01, 1.32517977e-01]
-                if '720P' in ckpt_dir:
-                    self.coefficients = [ 8.10705460e+03,  2.13393892e+03, -3.72934672e+02,  1.66203073e+01, -4.17769401e-02]
-                self.ret_steps = 5*2
-                self.cutoff_steps = num_steps*2
-            else:
-                if '540P' in ckpt_dir:
-                    self.coefficients = [-3.02331670e+02,  2.23948934e+02, -5.25463970e+01,  5.87348440e+00, -2.01973289e-01]
-                if '720P' in ckpt_dir:
-                    self.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
-                self.ret_steps = 1*2
-                self.cutoff_steps = num_steps*2 - 2
-        else:
-            if use_ret_steps:
-                if '1.3B' in ckpt_dir:
-                    self.coefficients = [-5.21862437e+04,  9.23041404e+03, -5.28275948e+02,  1.36987616e+01, -4.99875664e-02]
-                if '14B' in ckpt_dir:
-                    self.coefficients = [-3.03318725e+05,  4.90537029e+04, -2.65530556e+03,  5.87365115e+01, -3.15583525e-01]
-                self.ret_steps = 5*2
-                self.cutoff_steps = num_steps*2
-            else:
-                if '1.3B' in ckpt_dir:
-                    self.coefficients = [2.39676752e+03, -1.31110545e+03,  2.01331979e+02, -8.29855975e+00, 1.37887774e-01]
-                if '14B' in ckpt_dir:
-                    self.coefficients = [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
-                self.ret_steps = 1*2
-                self.cutoff_steps = num_steps*2 - 2
-
     def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
         r"""
         Forward pass through the diffusion model
@@ -709,68 +664,13 @@ def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
         # arguments
         kwargs = dict(e=e0, grid_sizes=grid_sizes, freqs=self.freqs, context=context, block_mask=self.block_mask)
 
-        if self.enable_teacache:
-            modulated_inp = e0 if self.use_ref_steps else e
-            # teacache
-            if self.cnt%2==0: # even -> conditon
-                self.is_even = True
-                if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
-                    should_calc_even = True
-                    self.accumulated_rel_l1_distance_even = 0
-                else:
-                    rescale_func = np.poly1d(self.coefficients)
-                    self.accumulated_rel_l1_distance_even += rescale_func(((modulated_inp-self.previous_e0_even).abs().mean() / self.previous_e0_even.abs().mean()).cpu().item())
-                    if self.accumulated_rel_l1_distance_even < self.teacache_thresh:
-                        should_calc_even = False
-                    else:
-                        should_calc_even = True
-                        self.accumulated_rel_l1_distance_even = 0
-                self.previous_e0_even = modulated_inp.clone()
-
-            else: # odd -> unconditon
-                self.is_even = False
-                if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
-                    should_calc_odd = True
-                    self.accumulated_rel_l1_distance_odd = 0
-                else:
-                    rescale_func = np.poly1d(self.coefficients)
-                    self.accumulated_rel_l1_distance_odd += rescale_func(((modulated_inp-self.previous_e0_odd).abs().mean() / self.previous_e0_odd.abs().mean()).cpu().item())
-                    if self.accumulated_rel_l1_distance_odd < self.teacache_thresh:
-                        should_calc_odd = False
-                    else:
-                        should_calc_odd = True
-                        self.accumulated_rel_l1_distance_odd = 0
-                self.previous_e0_odd = modulated_inp.clone()
-
-        if self.enable_teacache:
-            if self.is_even:
-                if not should_calc_even:
-                    x += self.previous_residual_even
-                else:
-                    ori_x = x.clone()
-                    for block in self.blocks:
-                        x = block(x, **kwargs)
-                    self.previous_residual_even = x - ori_x
-            else:
-                if not should_calc_odd:
-                    x += self.previous_residual_odd
-                else:
-                    ori_x = x.clone()
-                    for block in self.blocks:
-                        x = block(x, **kwargs)
-                    self.previous_residual_odd = x - ori_x
-
-        else:
-            for block in self.blocks:
-                x = block(x, **kwargs)
+        for block in self.blocks:
+            x = block(x, **kwargs)
 
         x = self.head(x, e)
 
         # unpatchify
         x = self.unpatchify(x, grid_sizes)
-        self.cnt += 1
-        if self.cnt >= self.num_steps:
-            self.cnt = 0
         return x.float()
 
     def unpatchify(self, x, grid_sizes):

From 0bc6fd30cea1eda791f3e70f84503b492c55fa1d Mon Sep 17 00:00:00 2001
From: pftq
Date: Wed, 23 Apr 2025 08:43:48 -0700
Subject: [PATCH 053/117] Reverted Teacache Update

Due to broken code from the update
---
 generate_video.py | 56 ++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/generate_video.py b/generate_video.py
index 6df92f9..074e040 100644
--- a/generate_video.py
+++ b/generate_video.py
@@ -97,10 +97,11 @@
     #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization
     image = None
+    if args.use_usp:
+        dist.barrier()
     if args.image:
         if local_rank == 0:
             try:
-                print("Loading image...")
                 image = load_image(args.image).convert("RGB")
 
                 # 20250422 pftq: option to preserve image aspect ratio
@@ -112,11 +113,11 @@
                 else:
                     height = int(width / img_width * img_height)
 
-                    divisibility=8
+                    divisibility=16
                     if width%divisibility!=0:
-                        width = width + (width%divisibility)
+                        width = width - (width%divisibility)
                     if height%divisibility!=0:
-                        height = height + (height%divisibility)
+                        height = height - (height%divisibility)
 
                     image = resizecrop(image, height, width)
                 else:
@@ -128,26 +129,35 @@
             raise ValueError(f"Failed to load or process image: {e}")
 
     if args.use_usp:
-        # Broadcast image to other ranks
-        image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None
-        if local_rank == 0:
-            print(f"Broadcasting image from rank {local_rank}...")
-            dist.broadcast(image_data, src=0)
-        else:
-            print(f"Receiving image from rank {local_rank}...")
-            image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda")
-            dist.broadcast(image_data, src=0)
-            image = Image.fromarray(image_data.cpu().numpy())
-
-        #20250422 pftq: Broadcast height and width to ensure consistency
-        height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
-        width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
-        dist.broadcast(height_tensor, src=0)
+        dist.barrier()
+        try:
+            #20250422 pftq: Broadcast height and width to ensure consistency
+            height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda")
+            width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda")
+            dist.broadcast(height_tensor, src=0)
+
dist.broadcast(width_tensor, src=0) + height = height_tensor.item() + width = width_tensor.item() + + # Broadcast image to other ranks + image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None + if local_rank == 0: + print(f"Broadcasting image from rank {local_rank}...") + dist.broadcast(image_data, src=0) + else: + print(f"Receiving image from rank {local_rank}...") + image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda") + dist.broadcast(image_data, src=0) + image = Image.fromarray(image_data.cpu().numpy()) + dist.barrier() + + except Exception as e: + print(f"[Rank {local_rank}] Image broadcasting error: {e}") + if args.use_usp: + dist.destroy_process_group() + raise - print(f"Rank {local_rank}: {width}x{height}") + print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt From f4c6bc312b8b9362833a8f9de79ae6c221302e0c Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 09:05:49 -0700 Subject: [PATCH 054/117] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 76e6a6a..2161abd 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ - Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. +Note: The TeaCache update breaks the code (see https://github.com/SkyworkAI/SkyReels-V2/issues/36) so that is currently excluded from this fork. + Easy install instructions for those using Runpod like me: ``` #create once on new pod From 9f53b3e6de73cd30af38d7242160e91008443c90 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 16:24:47 -0700 Subject: [PATCH 055/117] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2161abd..71c47dc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. +- Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library. - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). From 48a5a28955707be68884d03bc9ea17b3ea3128a0 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 20:13:42 -0700 Subject: [PATCH 056/117] Fixed 20-min load time on multi-gpu due to contention. --- generate_video_df.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 89e1116..fc51439 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -177,8 +177,9 @@ gc.collect() torch.cuda.empty_cache() - # 20250423 pftq: needs to be fixed, 20-min load times on multi-GPU caused by contention + # 20250423 pftq: fixed 20-min load times on multi-GPU caused by contention, reduced down to 12 min roughly the same as single GPU. 
print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) + starttime = time.time() pipe = DiffusionForcingPipeline( args.model_id, dit_path=args.model_id, @@ -187,7 +188,8 @@ use_usp=args.use_usp, offload=args.offload, ) - print("Finished initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) + totaltime = time.time()-starttime + print("Finished initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())+" ("+str(int(totaltime))+" seconds)") if args.causal_attention: pipe.transformer.set_ar_attention(args.causal_block_size) From a284b9577f0bb88623a71439a597f13cf078472f Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 20:14:10 -0700 Subject: [PATCH 057/117] Fixed 20-min load time on multi-gpu due to contention. --- .../pipelines/diffusion_forcing_pipeline.py | 74 +++++++++++++++++-- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index c6f39ec..a03f23d 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -51,14 +51,74 @@ def __init__( device (str): Device to run on, defaults to 'cuda' weight_dtype: Weight data type, defaults to torch.bfloat16 """ + + # 20250423 pftq: Fixed 20-min multi-gpu load time by loading on Rank 0 first and broadcasting + + import torch.distributed as dist # 20250423 pftq: Added for rank checking and broadcasting + self.device = device + self.offload = offload load_device = "cpu" if offload else device - self.transformer = get_transformer(dit_path, load_device, weight_dtype) + + # 20250423 pftq: Check rank and distributed mode + if use_usp: + if not dist.is_initialized(): + raise RuntimeError("Distributed environment must be initialized with dist.init_process_group before using use_usp=True") + local_rank = dist.get_rank() + else: + local_rank = 0 + + print(f"[Rank {local_rank}] Initializing pipeline components...") + + # 20250423 pftq: Load transformer only on rank 0 or single-GPU + if not use_usp or local_rank == 0: + print(f"[Rank {local_rank}] Loading transformer...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype) + if use_usp: + # Prepare state dict for broadcasting + state_dict = { + "transformer": self.transformer.state_dict(), + } + else: + # 20250423 pftq: Non-rank-0: Initialize empty models to avoid disk I/O + print(f"[Rank {local_rank}] Skipping weights for transformer...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) # 20250423 pftq: Requires skip_weights modification to modules.__init__.py + state_dict = None + + if use_usp: + # 20250423 pftq: Broadcast transformer weights from rank 0 + dist.barrier() # Ensure rank 0 loads first + print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + broadcast_list = [state_dict] + dist.broadcast_object_list(broadcast_list, src=0) + state_dict = broadcast_list[0] + # 20250423 pftq: Load broadcasted weights on all ranks + self.transformer.load_state_dict(state_dict["transformer"]) + dist.barrier() # 20250423 pftq: Synchronize ranks + + # 20250423 pftq: Stagger text encoder loading across ranks + if use_usp: + for rank in range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + dist.barrier() + else: + 
print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + + # 20250423 pftq: Load VAE on all ranks with optional staggering to reduce I/O contention vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") - self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + if use_usp: + # 20250423 pftq: Stagger VAE loading across ranks to avoid contention + for rank in range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + dist.barrier() + else: + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + self.video_processor = VideoProcessor(vae_scale_factor=16) - self.device = device - self.offload = offload if use_usp: from xfuser.core.distributed import get_sequence_parallel_world_size @@ -67,8 +127,8 @@ def __init__( for block in self.transformer.blocks: block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn) - self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer) - self.sp_size = get_sequence_parallel_world_size() + self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer) + self.sp_size = get_sequence_parallel_world_size() self.scheduler = FlowUniPCMultistepScheduler() From 99cc289822f2c0dc9b21a60f7baeb3e932501ab3 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 20:14:28 -0700 Subject: [PATCH 058/117] Fixed 20-min load time on multi-gpu due to contention. --- skyreels_v2_infer/modules/__init__.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/skyreels_v2_infer/modules/__init__.py b/skyreels_v2_infer/modules/__init__.py index 5bc6afe..863b345 100644 --- a/skyreels_v2_infer/modules/__init__.py +++ b/skyreels_v2_infer/modules/__init__.py @@ -27,18 +27,21 @@ def get_vae(model_path, device="cuda", weight_dtype=torch.float32) -> WanVAE: return vae -def get_transformer(model_path, device="cuda", weight_dtype=torch.bfloat16) -> WanModel: +def get_transformer(model_path, device="cuda", weight_dtype=torch.bfloat16, skip_weights=False) -> WanModel: + # 20250423 pftq: Added skip_weights parameter to initialize empty model config_path = os.path.join(model_path, "config.json") transformer = WanModel.from_config(config_path).to(weight_dtype).to(device) - for file in os.listdir(model_path): - if file.endswith(".safetensors"): - file_path = os.path.join(model_path, file) - state_dict = load_file(file_path) - transformer.load_state_dict(state_dict, strict=False) - del state_dict - gc.collect() - torch.cuda.empty_cache() + if not skip_weights: + # 20250423 pftq: Only load weights if skip_weights=False + for file in os.listdir(model_path): + if file.endswith(".safetensors"): + file_path = os.path.join(model_path, file) + state_dict = load_file(file_path) + transformer.load_state_dict(state_dict, strict=False) + del state_dict + gc.collect() + torch.cuda.empty_cache() transformer.requires_grad_(False) transformer.eval() From 0a5bf6bea64c22017a0de99979cf1e99b9e11143 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 20:15:10 -0700 Subject: [PATCH 059/117] Fixed 20-min load time on multi-gpu due to contention. 
--- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 71c47dc..3b6827b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. +- Fixed 20-min load time on multi-GPU due to contention (all GPUs loading models at once). - Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library. - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. From 8ede85e73996e8e619e857f6901647089fd1005e Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 21:39:25 -0700 Subject: [PATCH 060/117] Simplified image loading --- generate_video_df.py | 87 +++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 58 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index fc51439..52cf502 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -98,69 +98,40 @@ shift = args.shift #image = load_image(args.image).convert("RGB") if args.image else None - #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization + #20250422 pftq: Add error handling for image loading, aspect ratio preservation image = None - if args.use_usp: - dist.barrier() if args.image: - if local_rank == 0: - try: - image = load_image(args.image).convert("RGB") - - # 20250422 pftq: option to preserve image aspect ratio - if args.preserve_image_aspect_ratio: - img_width, img_height = image.size - if img_height > img_width: - height, width = width, height - width = int(height / img_height * img_width) - else: - height = int(width / img_width * img_height) - - divisibility=16 - if width%divisibility!=0: - width = width - (width%divisibility) - if height%divisibility!=0: - height = height - (height%divisibility) - - image = resizecrop(image, height, width) + try: + image = load_image(args.image).convert("RGB") + + # 20250422 pftq: option to preserve image aspect ratio + if args.preserve_image_aspect_ratio: + img_width, img_height = image.size + if img_height > img_width: + height, width = width, height + width = int(height / img_height * img_width) else: - image_width, image_height = image.size - if image_height > image_width: - height, width = width, height - image = resizecrop(image, height, width) - except Exception as e: - raise ValueError(f"Failed to load or process image: {e}") - - if args.use_usp: - dist.barrier() - try: - #20250422 pftq: Broadcast height and width to ensure consistency - height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda") - width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda") - dist.broadcast(height_tensor, src=0) - dist.broadcast(width_tensor, src=0) - height = height_tensor.item() - width = width_tensor.item() - - # Broadcast image to other ranks - image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None - if local_rank == 0: - print(f"Broadcasting image from rank {local_rank}...") - dist.broadcast(image_data, src=0) - else: - print(f"Receiving image from rank {local_rank}...") - image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda") - dist.broadcast(image_data, src=0) - image = Image.fromarray(image_data.cpu().numpy()) - dist.barrier() - - except Exception 
as e: - print(f"[Rank {local_rank}] Image broadcasting error: {e}") - if args.use_usp: - dist.destroy_process_group() - raise + height = int(width / img_width * img_height) + + divisibility=16 + if width%divisibility!=0: + width = width - (width%divisibility) + if height%divisibility!=0: + height = height - (height%divisibility) + + image = resizecrop(image, height, width) + else: + image_width, image_height = image.size + if image_height > image_width: + height, width = width, height + image = resizecrop(image, height, width) + except Exception as e: + raise ValueError(f"Failed to load or process image: {e}") print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) + + if args.use_usp: + dist.barrier() negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt From e6a2c9c34844b9a6a72cddea331f5df1c745a210 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 21:56:18 -0700 Subject: [PATCH 061/117] Simplified image loading --- generate_video.py | 87 ++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 58 deletions(-) diff --git a/generate_video.py b/generate_video.py index 074e040..5c4d31e 100644 --- a/generate_video.py +++ b/generate_video.py @@ -95,69 +95,40 @@ #image = load_image(args.image).convert("RGB") if args.image else None - #20250422 pftq: Add error handling for image loading, aspect ratio preservation, and multi-GPU synchronization + #20250422 pftq: Add error handling for image loading, aspect ratio preservation image = None - if args.use_usp: - dist.barrier() if args.image: - if local_rank == 0: - try: - image = load_image(args.image).convert("RGB") - - # 20250422 pftq: option to preserve image aspect ratio - if args.preserve_image_aspect_ratio: - img_width, img_height = image.size - if img_height > img_width: - height, width = width, height - width = int(height / img_height * img_width) - else: - height = int(width / img_width * img_height) - - divisibility=16 - if width%divisibility!=0: - width = width - (width%divisibility) - if height%divisibility!=0: - height = height - (height%divisibility) - - image = resizecrop(image, height, width) - else: - image_width, image_height = image.size - if image_height > image_width: - height, width = width, height - image = resizecrop(image, height, width) - except Exception as e: - raise ValueError(f"Failed to load or process image: {e}") - - if args.use_usp: - dist.barrier() - try: - #20250422 pftq: Broadcast height and width to ensure consistency - height_tensor = torch.tensor(height, dtype=torch.int64, device="cuda") - width_tensor = torch.tensor(width, dtype=torch.int64, device="cuda") - dist.broadcast(height_tensor, src=0) - dist.broadcast(width_tensor, src=0) - height = height_tensor.item() - width = width_tensor.item() - - # Broadcast image to other ranks - image_data = torch.tensor(np.array(image), dtype=torch.uint8, device="cuda") if image is not None else None - if local_rank == 0: - print(f"Broadcasting image from rank {local_rank}...") - dist.broadcast(image_data, src=0) + try: + image = load_image(args.image).convert("RGB") + + # 20250422 pftq: option to preserve image aspect ratio + if args.preserve_image_aspect_ratio: + img_width, img_height = image.size + if img_height > img_width: + height, width = width, height + width = int(height / img_height * img_width) else: - print(f"Receiving image from rank {local_rank}...") - image_data = torch.empty((height, width, 3), dtype=torch.uint8, device="cuda") - dist.broadcast(image_data, src=0) - image = 
Image.fromarray(image_data.cpu().numpy()) - dist.barrier() - - except Exception as e: - print(f"[Rank {local_rank}] Image broadcasting error: {e}") - if args.use_usp: - dist.destroy_process_group() - raise + height = int(width / img_width * img_height) + + divisibility=16 + if width%divisibility!=0: + width = width - (width%divisibility) + if height%divisibility!=0: + height = height - (height%divisibility) + + image = resizecrop(image, height, width) + else: + image_width, image_height = image.size + if image_height > image_width: + height, width = width, height + image = resizecrop(image, height, width) + except Exception as e: + raise ValueError(f"Failed to load or process image: {e}") print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) + + if args.use_usp: + dist.barrier() negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt From 61b31fd4d651d142f256ff503b7579296f2546df Mon Sep 17 00:00:00 2001 From: "fles@qq.com" Date: Thu, 24 Apr 2025 14:17:11 +0800 Subject: [PATCH 062/117] remove hard code reference video overlap_history --- generate_video_df.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index 4b3e282..e04c9ee 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -84,6 +84,9 @@ image = args.image.convert("RGB") if args.image else None video = [] + pre_video_length = 17 + if args.overlap_history is not None: + pre_video_length = args.overlap_history if args.video: args.video = load_video(args.video) arg_width = width @@ -94,7 +97,7 @@ height, width = arg_width, arg_height img = resizecrop(img, height, width) video.append(img.convert("RGB").resize((width, height))) - video = video[-17:] + video = video[-pre_video_length:] else: video = None From 96ea378825deeabceacdae5daefb63fcc25df2d6 Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 23:53:14 -0700 Subject: [PATCH 063/117] Fixed 20-min load time on multi-gpu due to contention. 
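
The image2video patch that follows also staggers the smaller components (text encoder, VAE, CLIP image encoder) so that only one rank reads from disk at a time. A generic sketch of that rank-serialized loading, assuming an already-initialized process group; `load_fn` is a placeholder for any of the fork's `get_*` loaders.

```
# Sketch: serialize a disk-heavy load across ranks so processes take turns
# reading the checkpoint instead of thrashing the same filesystem at once.
import torch.distributed as dist


def staggered_load(load_fn):
    """Run load_fn on one rank at a time, in rank order; return its result."""
    if not dist.is_initialized():
        return load_fn()  # single-process fallback
    result = None
    for turn in range(dist.get_world_size()):
        if dist.get_rank() == turn:
            result = load_fn()  # only this rank hits the disk right now
        dist.barrier()  # everyone waits until the current rank has finished
    return result


# hypothetical usage:
#   vae = staggered_load(lambda: get_vae(vae_model_path, device, weight_dtype=torch.float32))
```

Staggering trades wall-clock time for I/O pressure, which is why the series reserves it for the smaller modules and broadcasts the large transformer instead.
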
--- .../pipelines/image2video_pipeline.py | 78 +++++++++++++++++-- 1 file changed, 71 insertions(+), 7 deletions(-) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index a260cf0..f7f8dca 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -39,15 +39,78 @@ class Image2VideoPipeline: def __init__( self, model_path, dit_path, device: str = "cuda", weight_dtype=torch.bfloat16, use_usp=False, offload=False ): + # 20250423 pftq: Fixed load time by broadcasting transformer and staggering text encoder, VAE, image encoder + import torch.distributed as dist load_device = "cpu" if offload else device - self.transformer = get_transformer(dit_path, load_device, weight_dtype) - vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") - self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - self.clip = get_image_encoder(model_path, load_device, weight_dtype) - self.sp_size = 1 self.device = device self.offload = offload + + # 20250423 pftq: Check rank and distributed mode + if use_usp: + if not dist.is_initialized(): + raise RuntimeError("Distributed environment must be initialized with dist.init_process_group before using use_usp=True") + local_rank = dist.get_rank() + else: + local_rank = 0 + + print(f"[Rank {local_rank}] Initializing pipeline components...") + + # 20250423 pftq: Load transformer only on rank 0 or single-GPU + if not use_usp or local_rank == 0: + print(f"[Rank {local_rank}] Loading transformer...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=False) + transformer_state_dict = self.transformer.state_dict() if use_usp else None + else: + print(f"[Rank {local_rank}] Skipping weights for transformer...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) + transformer_state_dict = None + + # 20250423 pftq: Broadcast transformer weights from rank 0 + if use_usp: + dist.barrier() # Ensure rank 0 loads transformer + broadcast_list = [transformer_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.broadcast_object_list(broadcast_list, src=0) + transformer_state_dict = broadcast_list[0] + print(f"[Rank {local_rank}] Loading broadcasted transformer weights...") + self.transformer.load_state_dict(transformer_state_dict) + dist.barrier() # Synchronize ranks + + # 20250423 pftq: Stagger text encoder loading across ranks + if use_usp: + for rank in range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + dist.barrier() + else: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + + # 20250423 pftq: Stagger VAE loading across ranks + vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") + if use_usp: + for rank in range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + dist.barrier() + else: + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + + # 20250423 pftq: Stagger image encoder loading across ranks + if use_usp: + for rank in 
range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading image encoder...") + self.clip = get_image_encoder(model_path, load_device, weight_dtype) + dist.barrier() + else: + print(f"[Rank {local_rank}] Loading image encoder...") + self.clip = get_image_encoder(model_path, load_device, weight_dtype) + + self.sp_size = 1 self.video_processor = VideoProcessor(vae_scale_factor=16) if use_usp: from xfuser.core.distributed import get_sequence_parallel_world_size @@ -56,8 +119,9 @@ def __init__( for block in self.transformer.blocks: block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn) + # 20250423 pftq: Fixed indentation and removed duplicate forward assignment self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer) - self.sp_size = get_sequence_parallel_world_size() + self.sp_size = get_sequence_parallel_world_size() self.scheduler = FlowUniPCMultistepScheduler() self.vae_stride = (4, 8, 8) From 99e5f5b8e0e8b1fc89e9b7f8a853bef335cd512b Mon Sep 17 00:00:00 2001 From: pftq Date: Wed, 23 Apr 2025 23:53:55 -0700 Subject: [PATCH 064/117] Fixed 20-min load time on multi-gpu due to contention. --- generate_video.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/generate_video.py b/generate_video.py index 5c4d31e..58305ca 100644 --- a/generate_video.py +++ b/generate_video.py @@ -142,6 +142,9 @@ gc.collect() torch.cuda.empty_cache() + # 20250423 pftq: needs fixing, 20-min load times on multi-GPU caused by contention, DF already reduced down to 12 min roughly the same as single GPU. + print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) + starttime = time.time() if image is None: assert "T2V" in args.model_id, f"check model_id:{args.model_id}" print("init text2video pipeline") @@ -154,6 +157,8 @@ pipe = Image2VideoPipeline( model_path=args.model_id, dit_path=args.model_id, use_usp=args.use_usp, offload=args.offload ) + totaltime = time.time()-starttime + print("Finished initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())+" ("+str(int(totaltime))+" seconds)") prompt_input = args.prompt if args.prompt_enhancer and image is not None: From 098d027e5cf01fdc73b7db4b985af5198300d95c Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 00:55:45 -0700 Subject: [PATCH 065/117] Fixed 20-min load time on multi-gpu due to contention. 
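
The timing prints added just above bracket pipeline construction so regressions like the 20-minute load stay visible in the logs. The same bookkeeping can be factored into a small helper; the context-manager form below is a sketch in the same log style, not code from the fork.

```
# Sketch: reusable timer matching the "Initializing pipe at ..." prints above.
import time
from contextlib import contextmanager


@contextmanager
def timed(label):
    start = time.time()
    print(label + " started at " + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()))
    try:
        yield
    finally:
        print(label + f" finished after {int(time.time() - start)} seconds")


# hypothetical usage:
#   with timed("pipe init"):
#       pipe = Image2VideoPipeline(model_path=args.model_id, dit_path=args.model_id)
```
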
--- .../pipelines/text2video_pipeline.py | 66 +++++++++++++++++-- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/skyreels_v2_infer/pipelines/text2video_pipeline.py b/skyreels_v2_infer/pipelines/text2video_pipeline.py index 05a1dd3..c0d6d03 100644 --- a/skyreels_v2_infer/pipelines/text2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/text2video_pipeline.py @@ -18,15 +18,68 @@ class Text2VideoPipeline: def __init__( self, model_path, dit_path, device: str = "cuda", weight_dtype=torch.bfloat16, use_usp=False, offload=False ): + # 20250423 pftq: Fixed load time by broadcasting transformer and staggering text encoder, VAE + import torch.distributed as dist load_device = "cpu" if offload else device - self.transformer = get_transformer(dit_path, load_device, weight_dtype) + self.device = device + self.offload = offload + + # 20250423 pftq: Check rank and distributed mode + if use_usp: + if not dist.is_initialized(): + raise RuntimeError("Distributed environment must be initialized with dist.init_process_group before using use_usp=True") + local_rank = dist.get_rank() + else: + local_rank = 0 + + print(f"[Rank {local_rank}] Initializing pipeline components...") + + # 20250423 pftq: Load transformer only on rank 0 or single-GPU + if not use_usp or local_rank == 0: + print(f"[Rank {local_rank}] Loading transformer...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=False) + transformer_state_dict = self.transformer.state_dict() if use_usp else None + else: + print(f"[Rank {local_rank}] Skipping weights for transformer...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) + transformer_state_dict = None + + # 20250423 pftq: Broadcast transformer weights from rank 0 + if use_usp: + dist.barrier() # Ensure rank 0 loads transformer + broadcast_list = [transformer_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.broadcast_object_list(broadcast_list, src=0) + transformer_state_dict = broadcast_list[0] + print(f"[Rank {local_rank}] Loading broadcasted transformer weights...") + self.transformer.load_state_dict(transformer_state_dict) + dist.barrier() # Synchronize ranks + + # 20250423 pftq: Stagger text encoder loading across ranks + if use_usp: + for rank in range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + dist.barrier() + else: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + + # 20250423 pftq: Stagger VAE loading across ranks vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") - self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + if use_usp: + for rank in range(dist.get_world_size()): + if local_rank == rank: + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + dist.barrier() + else: + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + self.video_processor = VideoProcessor(vae_scale_factor=16) self.sp_size = 1 - self.device = device - self.offload = offload if use_usp: from xfuser.core.distributed import get_sequence_parallel_world_size from ..distributed.xdit_context_parallel import 
usp_attn_forward, usp_dit_forward @@ -34,8 +87,9 @@ def __init__( for block in self.transformer.blocks: block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn) + # 20250423 pftq: Fixed indentation and removed duplicate forward assignment self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer) - self.sp_size = get_sequence_parallel_world_size() + self.sp_size = get_sequence_parallel_world_size() self.scheduler = FlowUniPCMultistepScheduler() self.vae_stride = (4, 8, 8) From d43520e12a8b5a4333207e19d0ed4a8f1d5b6db1 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 01:39:00 -0700 Subject: [PATCH 066/117] Fixed 20-min load time on multi-gpu due to contention. --- skyreels_v2_infer/modules/t5.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skyreels_v2_infer/modules/t5.py b/skyreels_v2_infer/modules/t5.py index b882fe7..d49cd02 100644 --- a/skyreels_v2_infer/modules/t5.py +++ b/skyreels_v2_infer/modules/t5.py @@ -425,6 +425,7 @@ def __init__( tokenizer_path=None, text_len=512, shard_fn=None, + weights_only=False, # 20250423 pftq: Added for torch.load ): self.text_len = text_len self.checkpoint_path = checkpoint_path @@ -433,8 +434,12 @@ def __init__( super().__init__() # init model model = umt5_xxl(encoder_only=True, return_tokenizer=False) - logging.info(f"loading {checkpoint_path}") - model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")) + # 20250423 pftq: Load weights only if checkpoint_path is provided + if checkpoint_path: + logging.info(f"loading {checkpoint_path}") + model.load_state_dict( + torch.load(checkpoint_path, map_location="cpu", weights_only=weights_only) + ) self.model = model if shard_fn is not None: self.model = shard_fn(self.model, sync_module_states=False) From 7fc7a1fe63064501605d1ff5f76296e4c5f4e999 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 01:39:40 -0700 Subject: [PATCH 067/117] Fixed 20-min load time on multi-gpu due to contention. Reduced an additional 5-minutes by broadcasting text encoder as well. --- skyreels_v2_infer/modules/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skyreels_v2_infer/modules/__init__.py b/skyreels_v2_infer/modules/__init__.py index 863b345..6f1c9be 100644 --- a/skyreels_v2_infer/modules/__init__.py +++ b/skyreels_v2_infer/modules/__init__.py @@ -50,10 +50,15 @@ def get_transformer(model_path, device="cuda", weight_dtype=torch.bfloat16, skip return transformer -def get_text_encoder(model_path, device="cuda", weight_dtype=torch.bfloat16) -> T5EncoderModel: +def get_text_encoder(model_path, device="cuda", weight_dtype=torch.bfloat16, skip_weights=False) -> T5EncoderModel: + # 20250423 pftq: Added skip_weights and weights_only=True t5_model = os.path.join(model_path, "models_t5_umt5-xxl-enc-bf16.pth") tokenizer_path = os.path.join(model_path, "google", "umt5-xxl") - text_encoder = T5EncoderModel(checkpoint_path=t5_model, tokenizer_path=tokenizer_path).to(device).to(weight_dtype) + text_encoder = T5EncoderModel( + checkpoint_path=t5_model if not skip_weights else None, + tokenizer_path=tokenizer_path, + weights_only=True + ).to(device).to(weight_dtype) text_encoder.requires_grad_(False) text_encoder.eval() gc.collect() From 1019f5391c5f2cb56a86e0ad10e4c447975ce694 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 01:46:30 -0700 Subject: [PATCH 068/117] Fixed 20-min load time on multi-gpu due to contention. Reduced an additional 5-minutes by broadcasting text encoder as well. 
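
The two module patches above separate building the network from reading its checkpoint: ranks that will receive broadcast weights construct the architecture only (`skip_weights`), and when a checkpoint is actually read, `torch.load(..., weights_only=True)` restricts unpickling to plain tensors. A condensed sketch of that split, with a hypothetical `TinyEncoder` standing in for the UMT5 encoder.

```
# Sketch: skip_weights-style construction plus restricted checkpoint loading.
import torch
import torch.nn as nn


class TinyEncoder(nn.Module):  # hypothetical stand-in for the T5 encoder
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(32, 32)


def get_encoder(checkpoint_path=None):
    model = TinyEncoder()  # architecture only: random init, no disk I/O
    if checkpoint_path:  # passing None mirrors skip_weights=True
        # weights_only=True refuses arbitrary pickled objects in the file.
        state = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
        model.load_state_dict(state)
    return model.eval().requires_grad_(False)
```
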
--- .../pipelines/diffusion_forcing_pipeline.py | 42 +++++++++---------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index a03f23d..d47c2a9 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -73,39 +73,35 @@ def __init__( if not use_usp or local_rank == 0: print(f"[Rank {local_rank}] Loading transformer...") self.transformer = get_transformer(dit_path, load_device, weight_dtype) - if use_usp: - # Prepare state dict for broadcasting - state_dict = { - "transformer": self.transformer.state_dict(), - } + transformer_state_dict = self.transformer.state_dict() if use_usp else None else: # 20250423 pftq: Non-rank-0: Initialize empty models to avoid disk I/O print(f"[Rank {local_rank}] Skipping weights for transformer...") self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) # 20250423 pftq: Requires skip_weights modification to modules.__init__.py - state_dict = None + transformer_state_dict = None + + # 20250423 pftq: Load text encoder only on rank 0 or single-GPU + if not use_usp or local_rank == 0: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) + text_encoder_state_dict = self.text_encoder.state_dict() if use_usp else None + else: + print(f"[Rank {local_rank}] Initializing empty text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) + text_encoder_state_dict = None if use_usp: - # 20250423 pftq: Broadcast transformer weights from rank 0 + # 20250423 pftq: Broadcast transformer weights and text encoder from rank 0 dist.barrier() # Ensure rank 0 loads first - print(f"[Rank {local_rank}] Broadcasting weights for transformer...") - broadcast_list = [state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") + broadcast_list = [transformer_state_dict, text_encoder_state_dict] dist.broadcast_object_list(broadcast_list, src=0) - state_dict = broadcast_list[0] + transformer_state_dict, text_encoder_state_dict = broadcast_list # 20250423 pftq: Load broadcasted weights on all ranks - self.transformer.load_state_dict(state_dict["transformer"]) + self.transformer.load_state_dict(transformer_state_dict) + self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # 20250423 pftq: Synchronize ranks - - # 20250423 pftq: Stagger text encoder loading across ranks - if use_usp: - for rank in range(dist.get_world_size()): - if local_rank == rank: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - dist.barrier() - else: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - + # 20250423 pftq: Load VAE on all ranks with optional staggering to reduce I/O contention vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") if use_usp: From 67cb94d365ad6989de579bdd2c7d15f6f67bba43 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 02:20:18 -0700 Subject: [PATCH 069/117] Fixed 20-min load time on multi-gpu due to contention. Reduced an additional 5-minutes by broadcasting text encoder as well. 
--- .../pipelines/text2video_pipeline.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/skyreels_v2_infer/pipelines/text2video_pipeline.py b/skyreels_v2_infer/pipelines/text2video_pipeline.py index c0d6d03..3025adc 100644 --- a/skyreels_v2_infer/pipelines/text2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/text2video_pipeline.py @@ -44,28 +44,28 @@ def __init__( self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) transformer_state_dict = None - # 20250423 pftq: Broadcast transformer weights from rank 0 + # 20250423 pftq: Load text encoder only on rank 0 or single-GPU + if not use_usp or local_rank == 0: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=False) + text_encoder_state_dict = self.text_encoder.state_dict() if use_usp else None + else: + print(f"[Rank {local_rank}] Initializing empty text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) + text_encoder_state_dict = None + + # 20250423 pftq: Broadcast transformer and text encoder weights from rank 0 if use_usp: - dist.barrier() # Ensure rank 0 loads transformer - broadcast_list = [transformer_state_dict] - print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.barrier() # Ensure rank 0 loads transformer and text encoder + broadcast_list = [transformer_state_dict, text_encoder_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") dist.broadcast_object_list(broadcast_list, src=0) - transformer_state_dict = broadcast_list[0] - print(f"[Rank {local_rank}] Loading broadcasted transformer weights...") + transformer_state_dict, text_encoder_state_dict = broadcast_list + print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") self.transformer.load_state_dict(transformer_state_dict) + self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # Synchronize ranks - # 20250423 pftq: Stagger text encoder loading across ranks - if use_usp: - for rank in range(dist.get_world_size()): - if local_rank == rank: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - dist.barrier() - else: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - # 20250423 pftq: Stagger VAE loading across ranks vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") if use_usp: From fce77ff855f836883ae407c777aa2fdc7c0d9f4e Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 03:26:26 -0700 Subject: [PATCH 070/117] Fixed 20-min load time on multi-gpu due to contention. Reduced an additional 5-minutes by broadcasting text encoder as well. Fixed VRAM spike from redundant GPU 0 load. 
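
The VRAM fix named in this subject line is small but easy to miss in the diff: rank 0 already holds the weights it read from disk, so re-applying the broadcast copy there is pure overhead, and an offloaded text encoder should be sitting on the CPU before `load_state_dict` runs. A sketch of the receive path under those assumptions; the helper name is illustrative.

```
# Sketch: apply broadcast weights only where they are actually needed.
import torch
import torch.distributed as dist


def apply_broadcast_weights(module, state_dict, offload):
    if dist.get_rank() == 0:
        return module  # rank 0 loaded from disk already; skip the redundant copy
    if offload:
        module = module.to("cpu")  # keep offloaded weights off the GPU entirely
    module.load_state_dict(state_dict)
    torch.cuda.empty_cache()  # release transient CUDA allocations from the copy
    return module
```
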
--- .../pipelines/image2video_pipeline.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index f7f8dca..d58429f 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -65,28 +65,33 @@ def __init__( self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) transformer_state_dict = None - # 20250423 pftq: Broadcast transformer weights from rank 0 + # 20250423 pftq: Load text encoder only on rank 0 or single-GPU + if not use_usp or local_rank == 0: + print(f"[Rank {local_rank}] Loading text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=False) + text_encoder_state_dict = self.text_encoder.state_dict() if use_usp else None + else: + print(f"[Rank {local_rank}] Initializing empty text encoder...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) + text_encoder_state_dict = None + + # 20250423 pftq: Broadcast transformer and text encoder weights from rank 0 if use_usp: - dist.barrier() # Ensure rank 0 loads transformer - broadcast_list = [transformer_state_dict] - print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.barrier() # Ensure rank 0 loads transformer and text encoder + broadcast_list = [transformer_state_dict, text_encoder_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") dist.broadcast_object_list(broadcast_list, src=0) - transformer_state_dict = broadcast_list[0] - print(f"[Rank {local_rank}] Loading broadcasted transformer weights...") - self.transformer.load_state_dict(transformer_state_dict) + transformer_state_dict, text_encoder_state_dict = broadcast_list + # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 + if local_rank != 0: + print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") + self.transformer.load_state_dict(transformer_state_dict) + # 20250423 pftq: Ensure text encoder is on CPU before load_state_dict if offload=True + if offload: + self.text_encoder = self.text_encoder.to("cpu") + self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # Synchronize ranks - # 20250423 pftq: Stagger text encoder loading across ranks - if use_usp: - for rank in range(dist.get_world_size()): - if local_rank == rank: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - dist.barrier() - else: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - # 20250423 pftq: Stagger VAE loading across ranks vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") if use_usp: From 5e9445c623385da9e1dfd873d8fa0b894b529447 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 03:27:30 -0700 Subject: [PATCH 071/117] Fixed 20-min load time on multi-gpu due to contention. Reduced an additional 5-minutes by broadcasting text encoder as well. Fixed VRAM spike from redundant GPU 0 load. 
--- .../pipelines/diffusion_forcing_pipeline.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index d47c2a9..11f5475 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -97,9 +97,14 @@ def __init__( broadcast_list = [transformer_state_dict, text_encoder_state_dict] dist.broadcast_object_list(broadcast_list, src=0) transformer_state_dict, text_encoder_state_dict = broadcast_list - # 20250423 pftq: Load broadcasted weights on all ranks - self.transformer.load_state_dict(transformer_state_dict) - self.text_encoder.load_state_dict(text_encoder_state_dict) + # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 + if local_rank != 0: + print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") + self.transformer.load_state_dict(transformer_state_dict) + # 20250423 pftq: Ensure text encoder is on CPU before load_state_dict if offload=True + if offload: + self.text_encoder = self.text_encoder.to("cpu") + self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # 20250423 pftq: Synchronize ranks # 20250423 pftq: Load VAE on all ranks with optional staggering to reduce I/O contention From b1a9278ec5a1f46c04b3c49bc4615f1fe39e7d34 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 03:27:46 -0700 Subject: [PATCH 072/117] Fixed 20-min load time on multi-gpu due to contention. Reduced an additional 5-minutes by broadcasting text encoder as well. Fixed VRAM spike from redundant GPU 0 load. --- skyreels_v2_infer/pipelines/text2video_pipeline.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/skyreels_v2_infer/pipelines/text2video_pipeline.py b/skyreels_v2_infer/pipelines/text2video_pipeline.py index 3025adc..68dfb07 100644 --- a/skyreels_v2_infer/pipelines/text2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/text2video_pipeline.py @@ -61,9 +61,14 @@ def __init__( print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") dist.broadcast_object_list(broadcast_list, src=0) transformer_state_dict, text_encoder_state_dict = broadcast_list - print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") - self.transformer.load_state_dict(transformer_state_dict) - self.text_encoder.load_state_dict(text_encoder_state_dict) + # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 + if local_rank != 0: + print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") + self.transformer.load_state_dict(transformer_state_dict) + # 20250423 pftq: Ensure text encoder is on CPU before load_state_dict if offload=True + if offload: + self.text_encoder = self.text_encoder.to("cpu") + self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # Synchronize ranks # 20250423 pftq: Stagger VAE loading across ranks From e358d55f214cb0f6957fe8e704a40227e9c48171 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 04:30:07 -0700 Subject: [PATCH 073/117] Smoothed pipeline broadcasting to avoid VRAM spikes. 
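
"Smoothed" here means the pipelines stop packing every state dict into one `broadcast_object_list` call and instead ship one model at a time, synchronizing between rounds, so peak memory only ever holds a single pickled payload. A generic sketch of that schedule, folding in the `torch.cuda.empty_cache()` calls that the later "Empty cache" commits add; the helper and its dict layout are assumptions.

```
# Sketch: broadcast several large state dicts one at a time to cap peak memory.
import torch
import torch.distributed as dist


def broadcast_sequentially(named_state_dicts):
    """Rank 0 supplies real state dicts, other ranks supply None per name;
    the dict must be built in the same key order on every rank."""
    received = {}
    for name, maybe_sd in named_state_dicts.items():
        payload = [maybe_sd]  # exactly one object in flight per round
        dist.broadcast_object_list(payload, src=0)
        received[name] = payload[0]
        torch.cuda.empty_cache()  # drop transient buffers before the next round
        dist.barrier()  # keep all ranks in lockstep between payloads
    return received
```
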
--- .../pipelines/image2video_pipeline.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index d58429f..091afcf 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -75,20 +75,27 @@ def __init__( self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) text_encoder_state_dict = None - # 20250423 pftq: Broadcast transformer and text encoder weights from rank 0 + # 20250423 pftq: Broadcast transformer from rank 0 if use_usp: dist.barrier() # Ensure rank 0 loads transformer and text encoder - broadcast_list = [transformer_state_dict, text_encoder_state_dict] - print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") - dist.broadcast_object_list(broadcast_list, src=0) - transformer_state_dict, text_encoder_state_dict = broadcast_list + transformer_list = [transformer_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.broadcast_object_list(transformer_list, src=0) # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 if local_rank != 0: - print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") + print(f"[Rank {local_rank}] Loading broadcasted transformer...") + transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) - # 20250423 pftq: Ensure text encoder is on CPU before load_state_dict if offload=True - if offload: - self.text_encoder = self.text_encoder.to("cpu") + dist.barrier() # 20250423 pftq: Synchronize ranks + + # 20250423 pftq: Broadcast text encoder weights from rank 0 + print(f"[Rank {local_rank}] Broadcasting weights for text encoder...") + text_encoder_list = [text_encoder_state_dict] + dist.broadcast_object_list(text_encoder_list, src=0) + # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 + if local_rank != 0: + print(f"[Rank {local_rank}] Loading broadcasted text encoder...") + text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # Synchronize ranks From e5c1273ff9eb7a17b5ffb0db2f79005363898ddb Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 04:50:31 -0700 Subject: [PATCH 074/117] Smoothed pipeline broadcasting to avoid VRAM spikes. 
--- .../pipelines/diffusion_forcing_pipeline.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 11f5475..73d21cb 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -90,20 +90,27 @@ def __init__( self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) text_encoder_state_dict = None + # 20250423 pftq: Broadcast transformer weights from rank 0 if use_usp: - # 20250423 pftq: Broadcast transformer weights and text encoder from rank 0 dist.barrier() # Ensure rank 0 loads first - print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") - broadcast_list = [transformer_state_dict, text_encoder_state_dict] - dist.broadcast_object_list(broadcast_list, src=0) - transformer_state_dict, text_encoder_state_dict = broadcast_list + transformer_list = [transformer_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.broadcast_object_list(transformer_list, src=0) # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 if local_rank != 0: - print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") + print(f"[Rank {local_rank}] Loading broadcasted transformer...") + transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) - # 20250423 pftq: Ensure text encoder is on CPU before load_state_dict if offload=True - if offload: - self.text_encoder = self.text_encoder.to("cpu") + dist.barrier() # 20250423 pftq: Synchronize ranks + + # 20250423 pftq: Broadcast text encoder weights from rank 0 + print(f"[Rank {local_rank}] Broadcasting weights for text encoder...") + text_encoder_list = [text_encoder_state_dict] + dist.broadcast_object_list(text_encoder_list, src=0) + # 20250423 pftq: Load broadcasted weights on all ranks. 
Skip redundant load_state_dict on rank 0 + if local_rank != 0: + print(f"[Rank {local_rank}] Loading broadcasted text encoder...") + text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) dist.barrier() # 20250423 pftq: Synchronize ranks From f9c2fccb665d4ba03527025f16de72a5f649a586 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 06:01:13 -0700 Subject: [PATCH 075/117] Empty cache after loading to control VRAM use --- skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 73d21cb..2de76dc 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -101,6 +101,7 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted transformer...") transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) + torch.cuda.empty_cache() dist.barrier() # 20250423 pftq: Synchronize ranks # 20250423 pftq: Broadcast text encoder weights from rank 0 @@ -112,6 +113,7 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted text encoder...") text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) + torch.cuda.empty_cache() dist.barrier() # 20250423 pftq: Synchronize ranks # 20250423 pftq: Load VAE on all ranks with optional staggering to reduce I/O contention From b089e91484b44c015c4021e93facc8fb56607704 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 06:01:51 -0700 Subject: [PATCH 076/117] Empty cache after loading to control VRAM use --- skyreels_v2_infer/pipelines/image2video_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index 091afcf..9d3d6f2 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -86,6 +86,7 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted transformer...") transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) + torch.cuda.empty_cache() dist.barrier() # 20250423 pftq: Synchronize ranks # 20250423 pftq: Broadcast text encoder weights from rank 0 @@ -97,6 +98,7 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted text encoder...") text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) + torch.cuda.empty_cache() dist.barrier() # Synchronize ranks # 20250423 pftq: Stagger VAE loading across ranks From 82eb2729962f5be023417d91e7a57278433b6416 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 06:02:42 -0700 Subject: [PATCH 077/117] Empty cache after loading to control VRAM use --- .../pipelines/text2video_pipeline.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/skyreels_v2_infer/pipelines/text2video_pipeline.py b/skyreels_v2_infer/pipelines/text2video_pipeline.py index 68dfb07..64d25e1 100644 --- a/skyreels_v2_infer/pipelines/text2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/text2video_pipeline.py @@ -54,21 +54,30 @@ def __init__( self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) text_encoder_state_dict = None - # 20250423 pftq: Broadcast transformer and 
text encoder weights from rank 0 + # 20250423 pftq: Broadcast transformer from rank 0 if use_usp: dist.barrier() # Ensure rank 0 loads transformer and text encoder - broadcast_list = [transformer_state_dict, text_encoder_state_dict] - print(f"[Rank {local_rank}] Broadcasting weights for transformer and text encoder...") - dist.broadcast_object_list(broadcast_list, src=0) - transformer_state_dict, text_encoder_state_dict = broadcast_list + transformer_list = [transformer_state_dict] + print(f"[Rank {local_rank}] Broadcasting weights for transformer...") + dist.broadcast_object_list(transformer_list, src=0) # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 if local_rank != 0: - print(f"[Rank {local_rank}] Loading broadcasted transformer and text encoder weights...") + print(f"[Rank {local_rank}] Loading broadcasted transformer...") + transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) - # 20250423 pftq: Ensure text encoder is on CPU before load_state_dict if offload=True - if offload: - self.text_encoder = self.text_encoder.to("cpu") + torch.cuda.empty_cache() + dist.barrier() # 20250423 pftq: Synchronize ranks + + # 20250423 pftq: Broadcast text encoder weights from rank 0 + print(f"[Rank {local_rank}] Broadcasting weights for text encoder...") + text_encoder_list = [text_encoder_state_dict] + dist.broadcast_object_list(text_encoder_list, src=0) + # 20250423 pftq: Load broadcasted weights on all ranks. Skip redundant load_state_dict on rank 0 + if local_rank != 0: + print(f"[Rank {local_rank}] Loading broadcasted text encoder...") + text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) + torch.cuda.empty_cache() dist.barrier() # Synchronize ranks # 20250423 pftq: Stagger VAE loading across ranks From 33dd31249272ca1ebeef6fe7c898bd7c7ee8abc0 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 19:05:07 -0700 Subject: [PATCH 078/117] Removed unnecessary multi-gpu barrier call for slight performance gain. 
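Context for the removal, since it is easy to miss why the call was safe to drop: every rank has to issue the next collective anyway (all ranks participate in a broadcast), so a `dist.barrier()` immediately before one only adds a second rendezvous. A minimal sketch with a hypothetical seed value, assuming an initialized process group:

```
import torch
import torch.distributed as dist

def sync_seed(seed, local_rank):
    # No dist.barrier() needed here: the broadcast below is itself a
    # collective that every rank must issue, so a barrier right before
    # it is redundant in this pattern.
    seed_tensor = torch.tensor(seed, dtype=torch.int64, device="cuda")
    dist.broadcast(seed_tensor, src=0)
    return int(seed_tensor.item())
```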
--- generate_video.py | 219 ++++++++++++++++------------------------------ 1 file changed, 77 insertions(+), 142 deletions(-) diff --git a/generate_video.py b/generate_video.py index 58305ca..ff9cc83 100644 --- a/generate_video.py +++ b/generate_video.py @@ -8,9 +8,6 @@ import torch from diffusers.utils import load_image -from PIL import Image #20250422 pftq: Added for image resizing and cropping -import numpy as np #20250422 pftq: Added for seed synchronization - from skyreels_v2_infer.modules import download_model from skyreels_v2_infer.pipelines import Image2VideoPipeline from skyreels_v2_infer.pipelines import PromptEnhancer @@ -44,27 +41,52 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=-1) + parser.add_argument("--seed", type=int, default=None) parser.add_argument( "--prompt", type=str, default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.", ) parser.add_argument("--prompt_enhancer", action="store_true") - - parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video - parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing - parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt - + parser.add_argument("--teacache", action="store_true") + parser.add_argument( + "--teacache_thresh", + type=float, + default=0.2, + help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup") + parser.add_argument( + "--use_ret_steps", + action="store_true", + help="Using Retention Steps will result in faster generation speed and better generation quality.") args = parser.parse_args() args.model_id = download_model(args.model_id) print("model_id:", args.model_id) - #20250422 pftq: unneeded with seed synchronization code - #assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" + assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" + if args.seed is None: + random.seed(time.time()) + args.seed = int(random.randrange(4294967294)) + if args.resolution == "540P": + height = 544 + width = 960 + elif args.resolution == "720P": + height = 720 + width = 1280 + else: + raise ValueError(f"Invalid resolution: {args.resolution}") + + + import psutil + memory_usage = psutil.virtual_memory().percent + print(f"RAM Usage: {memory_usage}%") + + image = load_image(args.image).convert("RGB") if args.image else None + negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, 
walking backwards" local_rank = 0 + memory_usage = psutil.virtual_memory().percent + print(f"RAM Usage: {memory_usage}%") if args.use_usp: assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment @@ -82,56 +104,9 @@ ring_degree=1, ulysses_degree=dist.get_world_size(), ) + memory_usage = psutil.virtual_memory().percent + print(f"RAM Usage: {memory_usage}%") - if args.resolution == "540P": - height = 544 - width = 960 - elif args.resolution == "720P": - height = 720 - width = 1280 - else: - raise ValueError(f"Invalid resolution: {args.resolution}") - - #image = load_image(args.image).convert("RGB") if args.image else None - - - #20250422 pftq: Add error handling for image loading, aspect ratio preservation - image = None - if args.image: - try: - image = load_image(args.image).convert("RGB") - - # 20250422 pftq: option to preserve image aspect ratio - if args.preserve_image_aspect_ratio: - img_width, img_height = image.size - if img_height > img_width: - height, width = width, height - width = int(height / img_height * img_width) - else: - height = int(width / img_width * img_height) - - divisibility=16 - if width%divisibility!=0: - width = width - (width%divisibility) - if height%divisibility!=0: - height = height - (height%divisibility) - - image = resizecrop(image, height, width) - else: - image_width, image_height = image.size - if image_height > image_width: - height, width = width, height - image = resizecrop(image, height, width) - except Exception as e: - raise ValueError(f"Failed to load or process image: {e}") - - print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) - - if args.use_usp: - dist.barrier() - - negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt - prompt_input = args.prompt if args.prompt_enhancer and args.image is None: print(f"init prompt enhancer") @@ -141,10 +116,8 @@ del prompt_enhancer gc.collect() torch.cuda.empty_cache() - - # 20250423 pftq: needs fixing, 20-min load times on multi-GPU caused by contention, DF already reduced down to 12 min roughly the same as single GPU. 
- print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) - starttime = time.time() + memory_usage = psutil.virtual_memory().percent + print(f"RAM Usage: {memory_usage}%") if image is None: assert "T2V" in args.model_id, f"check model_id:{args.model_id}" print("init text2video pipeline") @@ -157,86 +130,48 @@ pipe = Image2VideoPipeline( model_path=args.model_id, dit_path=args.model_id, use_usp=args.use_usp, offload=args.offload ) - totaltime = time.time()-starttime - print("Finished initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())+" ("+str(int(totaltime))+" seconds)") - + args.image = load_image(args.image) + image_width, image_height = args.image.size + if image_height > image_width: + height, width = width, height + args.image = resizecrop(args.image, height, width) + + memory_usage = psutil.virtual_memory().percent + print(f"RAM Usage: {memory_usage}%") + if args.teacache: + pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=args.inference_steps, + teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps, + ckpt_dir=args.model_id) + prompt_input = args.prompt if args.prompt_enhancer and image is not None: prompt_input = prompt_enhancer(prompt_input) print(f"enhanced prompt: {prompt_input}") - - #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues - torch.backends.cuda.preferred_linalg_library("default") # or try "magma" if available - - for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size - if local_rank == 0: - print(f"Generating video {idx+1} of {args.batch_size}") - - #20250422 pftq: Synchronize seed across all ranks - if args.use_usp: - try: - #20250422 pftq: Synchronize ranks before seed broadcasting - dist.barrier() - - #20250422 pftq: Always broadcast seed to ensure consistency - if local_rank == 0: - if args.seed == -1 or args.seed is None or idx > 0: - args.seed = int(random.randrange(4294967294)) - seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") - dist.broadcast(seed_tensor, src=0) - args.seed = seed_tensor.item() - - #20250422 pftq: Synchronize ranks after seed broadcasting - dist.barrier() - except Exception as e: - print(f"[Rank {local_rank}] Seed broadcasting error: {e}") - dist.destroy_process_group() - raise - - else: - #20250422 pftq: Single GPU seed initialization - if args.seed == -1 or idx > 0: - args.seed = int(random.randrange(4294967294)) - - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed_all(args.seed) - - kwargs = { - "prompt": prompt_input, - "negative_prompt": negative_prompt, - "num_frames": args.num_frames, - "num_inference_steps": args.inference_steps, - "guidance_scale": args.guidance_scale, - "shift": args.shift, - "generator": torch.Generator(device="cuda").manual_seed(args.seed), - "height": height, - "width": width, - } - - if image is not None: - #kwargs["image"] = load_image(args.image).convert("RGB") - # 20250422 pftq: redundant reloading of the image - kwargs["image"] = image - - save_dir = os.path.join("result", args.outdir) - os.makedirs(save_dir, exist_ok=True) - - with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): - print(f"infer kwargs:{kwargs}") - video_frames = pipe(**kwargs)[0] - - if local_rank == 0: - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" - - # 20250422 pftq: more useful filename 
- gpucount = "" - if args.use_usp and dist.get_world_size(): - gpucount = "_"+str(dist.get_world_size())+"xGPU" - video_out_file = f"{current_time}_skyreels2_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" - - output_path = os.path.join(save_dir, video_out_file) - imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) + kwargs = { + "prompt": prompt_input, + "negative_prompt": negative_prompt, + "num_frames": args.num_frames, + "num_inference_steps": args.inference_steps, + "guidance_scale": args.guidance_scale, + "shift": args.shift, + "generator": torch.Generator(device="cuda").manual_seed(args.seed), + "height": height, + "width": width, + } + + if image is not None: + kwargs["image"] = args.image.convert("RGB") + + save_dir = os.path.join("result", args.outdir) + os.makedirs(save_dir, exist_ok=True) + + with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): + print(f"infer kwargs:{kwargs}") + video_frames = pipe(**kwargs)[0] + + if local_rank == 0: + current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" + output_path = os.path.join(save_dir, video_out_file) + imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) From f3c317db33ac4029d9a004fffdf164940bbc2120 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 19:05:22 -0700 Subject: [PATCH 079/117] Removed unnecessary multi-gpu barrier call for slight performance gain. --- generate_video_df.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 52cf502..0e8e5ce 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -129,9 +129,6 @@ raise ValueError(f"Failed to load or process image: {e}") print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) - - if args.use_usp: - dist.barrier() negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt From e4aab7743f6fe05d3b2ce2bf8e09fc27a31c103a Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 21:57:43 -0700 Subject: [PATCH 080/117] Correct version: Removed unnecessary multi-gpu barrier call for slight performance gain. 
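Among the changes this version restores is the aspect-ratio-preserving resize, whose dimension math is worth spelling out once: the free side is rescaled to the source ratio and then rounded down to a multiple of 16 so the latent grid stays valid. A standalone sketch of just that computation (the repo's `resizecrop` helper then does the actual crop):

```
def fit_dimensions(img_width, img_height, width, height, divisibility=16):
    # Portrait sources swap the target box, then the free side is scaled
    # to preserve the source aspect ratio.
    if img_height > img_width:
        height, width = width, height
        width = int(height / img_height * img_width)
    else:
        height = int(width / img_width * img_height)
    # Round each side down to the model's stride so the latents tile evenly.
    width -= width % divisibility
    height -= height % divisibility
    return width, height

# e.g. a 1080x1920 portrait source against a 960x544 box yields width 528, height 960
```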
--- generate_video.py | 216 +++++++++++++++++++++++++++++----------------- 1 file changed, 139 insertions(+), 77 deletions(-) diff --git a/generate_video.py b/generate_video.py index ff9cc83..7b8e5fd 100644 --- a/generate_video.py +++ b/generate_video.py @@ -8,6 +8,9 @@ import torch from diffusers.utils import load_image +from PIL import Image #20250422 pftq: Added for image resizing and cropping +import numpy as np #20250422 pftq: Added for seed synchronization + from skyreels_v2_infer.modules import download_model from skyreels_v2_infer.pipelines import Image2VideoPipeline from skyreels_v2_infer.pipelines import PromptEnhancer @@ -41,52 +44,27 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--seed", type=int, default=-1) parser.add_argument( "--prompt", type=str, default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.", ) parser.add_argument("--prompt_enhancer", action="store_true") - parser.add_argument("--teacache", action="store_true") - parser.add_argument( - "--teacache_thresh", - type=float, - default=0.2, - help="Higher speedup will cause to worse quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup") - parser.add_argument( - "--use_ret_steps", - action="store_true", - help="Using Retention Steps will result in faster generation speed and better generation quality.") + + parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video + parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing + parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt + args = parser.parse_args() args.model_id = download_model(args.model_id) print("model_id:", args.model_id) - assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" - if args.seed is None: - random.seed(time.time()) - args.seed = int(random.randrange(4294967294)) - - if args.resolution == "540P": - height = 544 - width = 960 - elif args.resolution == "720P": - height = 720 - width = 1280 - else: - raise ValueError(f"Invalid resolution: {args.resolution}") - + #20250422 pftq: unneeded with seed synchronization code + #assert (args.use_usp and args.seed is not None) or (not args.use_usp), "usp mode need seed" - import psutil - memory_usage = psutil.virtual_memory().percent - print(f"RAM Usage: {memory_usage}%") - - image = load_image(args.image).convert("RGB") if args.image else None - negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, 
walking backwards" local_rank = 0 - memory_usage = psutil.virtual_memory().percent - print(f"RAM Usage: {memory_usage}%") if args.use_usp: assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment @@ -104,9 +82,53 @@ ring_degree=1, ulysses_degree=dist.get_world_size(), ) - memory_usage = psutil.virtual_memory().percent - print(f"RAM Usage: {memory_usage}%") + if args.resolution == "540P": + height = 544 + width = 960 + elif args.resolution == "720P": + height = 720 + width = 1280 + else: + raise ValueError(f"Invalid resolution: {args.resolution}") + + #image = load_image(args.image).convert("RGB") if args.image else None + + + #20250422 pftq: Add error handling for image loading, aspect ratio preservation + image = None + if args.image: + try: + image = load_image(args.image).convert("RGB") + + # 20250422 pftq: option to preserve image aspect ratio + if args.preserve_image_aspect_ratio: + img_width, img_height = image.size + if img_height > img_width: + height, width = width, height + width = int(height / img_height * img_width) + else: + height = int(width / img_width * img_height) + + divisibility=16 + if width%divisibility!=0: + width = width - (width%divisibility) + if height%divisibility!=0: + height = height - (height%divisibility) + + image = resizecrop(image, height, width) + else: + image_width, image_height = image.size + if image_height > image_width: + height, width = width, height + image = resizecrop(image, height, width) + except Exception as e: + raise ValueError(f"Failed to load or process image: {e}") + + print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) + + negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt + prompt_input = args.prompt if args.prompt_enhancer and args.image is None: print(f"init prompt enhancer") @@ -116,8 +138,10 @@ del prompt_enhancer gc.collect() torch.cuda.empty_cache() - memory_usage = psutil.virtual_memory().percent - print(f"RAM Usage: {memory_usage}%") + + # 20250423 pftq: needs fixing, 20-min load times on multi-GPU caused by contention, DF already reduced down to 12 min roughly the same as single GPU. 
+ print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) + starttime = time.time() if image is None: assert "T2V" in args.model_id, f"check model_id:{args.model_id}" print("init text2video pipeline") @@ -130,48 +154,86 @@ pipe = Image2VideoPipeline( model_path=args.model_id, dit_path=args.model_id, use_usp=args.use_usp, offload=args.offload ) - args.image = load_image(args.image) - image_width, image_height = args.image.size - if image_height > image_width: - height, width = width, height - args.image = resizecrop(args.image, height, width) - - memory_usage = psutil.virtual_memory().percent - print(f"RAM Usage: {memory_usage}%") - if args.teacache: - pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=args.inference_steps, - teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps, - ckpt_dir=args.model_id) - + totaltime = time.time()-starttime + print("Finished initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())+" ("+str(int(totaltime))+" seconds)") + prompt_input = args.prompt if args.prompt_enhancer and image is not None: prompt_input = prompt_enhancer(prompt_input) print(f"enhanced prompt: {prompt_input}") - kwargs = { - "prompt": prompt_input, - "negative_prompt": negative_prompt, - "num_frames": args.num_frames, - "num_inference_steps": args.inference_steps, - "guidance_scale": args.guidance_scale, - "shift": args.shift, - "generator": torch.Generator(device="cuda").manual_seed(args.seed), - "height": height, - "width": width, - } - - if image is not None: - kwargs["image"] = args.image.convert("RGB") - - save_dir = os.path.join("result", args.outdir) - os.makedirs(save_dir, exist_ok=True) - - with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): - print(f"infer kwargs:{kwargs}") - video_frames = pipe(**kwargs)[0] - - if local_rank == 0: - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" - output_path = os.path.join(save_dir, video_out_file) - imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) + + #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues + torch.backends.cuda.preferred_linalg_library("default") # or try "magma" if available + + for idx in range(args.batch_size): # 20250422 pftq: implemented --batch_size + if local_rank == 0: + print(f"Generating video {idx+1} of {args.batch_size}") + + #20250422 pftq: Synchronize seed across all ranks + if args.use_usp: + try: + #20250422 pftq: Synchronize ranks before seed broadcasting + dist.barrier() + + #20250422 pftq: Always broadcast seed to ensure consistency + if local_rank == 0: + if args.seed == -1 or args.seed is None or idx > 0: + args.seed = int(random.randrange(4294967294)) + seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") + dist.broadcast(seed_tensor, src=0) + args.seed = seed_tensor.item() + + #20250422 pftq: Synchronize ranks after seed broadcasting + dist.barrier() + except Exception as e: + print(f"[Rank {local_rank}] Seed broadcasting error: {e}") + dist.destroy_process_group() + raise + + else: + #20250422 pftq: Single GPU seed initialization + if args.seed == -1 or idx > 0: + args.seed = int(random.randrange(4294967294)) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + kwargs = { + "prompt": prompt_input, + 
"negative_prompt": negative_prompt, + "num_frames": args.num_frames, + "num_inference_steps": args.inference_steps, + "guidance_scale": args.guidance_scale, + "shift": args.shift, + "generator": torch.Generator(device="cuda").manual_seed(args.seed), + "height": height, + "width": width, + } + + if image is not None: + #kwargs["image"] = load_image(args.image).convert("RGB") + # 20250422 pftq: redundant reloading of the image + kwargs["image"] = image + + save_dir = os.path.join("result", args.outdir) + os.makedirs(save_dir, exist_ok=True) + + with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): + print(f"infer kwargs:{kwargs}") + video_frames = pipe(**kwargs)[0] + + if local_rank == 0: + current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" + + # 20250422 pftq: more useful filename + gpucount = "" + if args.use_usp and dist.get_world_size(): + gpucount = "_"+str(dist.get_world_size())+"xGPU" + video_out_file = f"{current_time}_skyreels2_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" + + output_path = os.path.join(save_dir, video_out_file) + imageio.mimwrite(output_path, video_frames, fps=args.fps, quality=8, output_params=["-loglevel", "error"]) From e3d121a6b84af5150c3585d28c7eed215f9031ed Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 24 Apr 2025 23:09:13 -0700 Subject: [PATCH 081/117] Optimized multi-gpu loading further and use CPU RAM for broadcasting (tested faster and more stable than VRAM). --- .../pipelines/image2video_pipeline.py | 71 ++++++++++++------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index 9d3d6f2..3b4f840 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -55,28 +55,27 @@ def __init__( print(f"[Rank {local_rank}] Initializing pipeline components...") - # 20250423 pftq: Load transformer only on rank 0 or single-GPU - if not use_usp or local_rank == 0: - print(f"[Rank {local_rank}] Loading transformer...") + vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") + # 20250423 pftq: Load normally on single gpu + if not use_usp: + print(f"[Rank {local_rank}] Loading transformer to {load_device}...") self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=False) - transformer_state_dict = self.transformer.state_dict() if use_usp else None - else: - print(f"[Rank {local_rank}] Skipping weights for transformer...") - self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) - transformer_state_dict = None - - # 20250423 pftq: Load text encoder only on rank 0 or single-GPU - if not use_usp or local_rank == 0: - print(f"[Rank {local_rank}] Loading text encoder...") + print(f"[Rank {local_rank}] Loading text encoder to {load_device}...") self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=False) - text_encoder_state_dict = self.text_encoder.state_dict() if use_usp else None - else: - print(f"[Rank {local_rank}] Initializing empty text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) - text_encoder_state_dict = None + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = 
get_vae(vae_model_path, device, weight_dtype=torch.float32) # 20250423 pftq: Broadcast transformer from rank 0 if use_usp: + broadcast_device = "cpu" # tested to be more stable to start with cpu broadcast even if you have an H100 + if local_rank == 0: + print(f"[Rank {local_rank}] Loading transformer to {broadcast_device}...") + self.transformer = get_transformer(dit_path, broadcast_device, weight_dtype, skip_weights=False) + transformer_state_dict = self.transformer.state_dict() + else: + print(f"[Rank {local_rank}] Skipping transformer load...") + self.transformer = get_transformer(dit_path, broadcast_device, weight_dtype, skip_weights=True) + transformer_state_dict = None dist.barrier() # Ensure rank 0 loads transformer and text encoder transformer_list = [transformer_state_dict] print(f"[Rank {local_rank}] Broadcasting weights for transformer...") @@ -86,10 +85,26 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted transformer...") transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) + dist.barrier() + if offload: + print(f"[Rank {local_rank}] Moving transformer to cpu...") + self.transformer.cpu() + else: + print(f"[Rank {local_rank}] Moving transformer to {device}...") + self.transformer.to(device) + dist.barrier() torch.cuda.empty_cache() - dist.barrier() # 20250423 pftq: Synchronize ranks - + # 20250423 pftq: Broadcast text encoder weights from rank 0 + if local_rank == 0: + print(f"[Rank {local_rank}] Loading text encoder to {broadcast_device}...") + self.text_encoder = get_text_encoder(model_path, broadcast_device, weight_dtype, skip_weights=False) + text_encoder_state_dict = self.text_encoder.state_dict() + else: + print(f"[Rank {local_rank}] Skipping text encoder load...") + self.text_encoder = get_text_encoder(model_path, broadcast_device, weight_dtype, skip_weights=True) + text_encoder_state_dict = None + dist.barrier() # Ensure rank 0 loads transformer and text encoder print(f"[Rank {local_rank}] Broadcasting weights for text encoder...") text_encoder_list = [text_encoder_state_dict] dist.broadcast_object_list(text_encoder_list, src=0) @@ -98,20 +113,22 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted text encoder...") text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) + dist.barrier() + if offload: + print(f"[Rank {local_rank}] Moving text encoder to cpu...") + self.text_encoder.cpu() + else: + print(f"[Rank {local_rank}] Moving text encoder to {device}...") + self.text_encoder.to(device) + dist.barrier() torch.cuda.empty_cache() - dist.barrier() # Synchronize ranks - # 20250423 pftq: Stagger VAE loading across ranks - vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") - if use_usp: + # 20250423 pftq: Stagger VAE loading across ranks for rank in range(dist.get_world_size()): if local_rank == rank: print(f"[Rank {local_rank}] Loading VAE...") self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - dist.barrier() - else: - print(f"[Rank {local_rank}] Loading VAE...") - self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + dist.barrier() # 20250423 pftq: Stagger image encoder loading across ranks if use_usp: From 96aac55ae271d59ff294fd7c8345bb75c8473cef Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:19:52 -0700 Subject: [PATCH 082/117] Optimized multi-gpu loading further and use CPU RAM for broadcasting (tested faster and more stable than VRAM). 
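The same staging applies here as in the image2video patch above; condensed, the flow per model is: build on CPU (weights only on rank 0), broadcast the state dict through CPU RAM, then move to the compute device or park on CPU when offloading. A sketch under those assumptions, with `build_model` as a stand-in for the repo's `get_transformer`/`get_text_encoder` loaders:

```
import torch
import torch.distributed as dist

def load_and_broadcast(build_model, device, local_rank, offload):
    # Stage everything in CPU RAM first; broadcasting CUDA-resident state
    # dicts tested less stable and spikes VRAM.
    model = build_model("cpu", skip_weights=(local_rank != 0))
    payload = [model.state_dict() if local_rank == 0 else None]
    dist.broadcast_object_list(payload, src=0)
    if local_rank != 0:
        model.load_state_dict(payload[0])
    dist.barrier()
    # Only now pick the final placement.
    model = model.cpu() if offload else model.to(device)
    dist.barrier()
    torch.cuda.empty_cache()  # drop any transient CUDA allocations
    return model
```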
--- .../pipelines/diffusion_forcing_pipeline.py | 82 +++++++++++-------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 2de76dc..7a96ef5 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -69,30 +69,28 @@ def __init__( print(f"[Rank {local_rank}] Initializing pipeline components...") - # 20250423 pftq: Load transformer only on rank 0 or single-GPU - if not use_usp or local_rank == 0: - print(f"[Rank {local_rank}] Loading transformer...") - self.transformer = get_transformer(dit_path, load_device, weight_dtype) - transformer_state_dict = self.transformer.state_dict() if use_usp else None - else: - # 20250423 pftq: Non-rank-0: Initialize empty models to avoid disk I/O - print(f"[Rank {local_rank}] Skipping weights for transformer...") - self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) # 20250423 pftq: Requires skip_weights modification to modules.__init__.py - transformer_state_dict = None - - # 20250423 pftq: Load text encoder only on rank 0 or single-GPU - if not use_usp or local_rank == 0: - print(f"[Rank {local_rank}] Loading text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype) - text_encoder_state_dict = self.text_encoder.state_dict() if use_usp else None - else: - print(f"[Rank {local_rank}] Initializing empty text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) - text_encoder_state_dict = None + vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") + # 20250423 pftq: Load normally on single gpu + if not use_usp: + print(f"[Rank {local_rank}] Loading transformer to {load_device}...") + self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=False) + print(f"[Rank {local_rank}] Loading text encoder to {load_device}...") + self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=False) + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - # 20250423 pftq: Broadcast transformer weights from rank 0 + # 20250423 pftq: Broadcast transformer from rank 0 if use_usp: - dist.barrier() # Ensure rank 0 loads first + broadcast_device = "cpu" # tested to be more stable to start with cpu broadcast even if you have an H100 + if local_rank == 0: + print(f"[Rank {local_rank}] Loading transformer to {broadcast_device}...") + self.transformer = get_transformer(dit_path, broadcast_device, weight_dtype, skip_weights=False) + transformer_state_dict = self.transformer.state_dict() + else: + print(f"[Rank {local_rank}] Skipping transformer load...") + self.transformer = get_transformer(dit_path, broadcast_device, weight_dtype, skip_weights=True) + transformer_state_dict = None + dist.barrier() # Ensure rank 0 loads transformer and text encoder transformer_list = [transformer_state_dict] print(f"[Rank {local_rank}] Broadcasting weights for transformer...") dist.broadcast_object_list(transformer_list, src=0) @@ -101,10 +99,26 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted transformer...") transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) + dist.barrier() + if offload: + print(f"[Rank {local_rank}] Moving transformer to cpu...") + 
self.transformer.cpu() + else: + print(f"[Rank {local_rank}] Moving transformer to {device}...") + self.transformer.to(device) + dist.barrier() torch.cuda.empty_cache() - dist.barrier() # 20250423 pftq: Synchronize ranks - + # 20250423 pftq: Broadcast text encoder weights from rank 0 + if local_rank == 0: + print(f"[Rank {local_rank}] Loading text encoder to {broadcast_device}...") + self.text_encoder = get_text_encoder(model_path, broadcast_device, weight_dtype, skip_weights=False) + text_encoder_state_dict = self.text_encoder.state_dict() + else: + print(f"[Rank {local_rank}] Skipping text encoder load...") + self.text_encoder = get_text_encoder(model_path, broadcast_device, weight_dtype, skip_weights=True) + text_encoder_state_dict = None + dist.barrier() # Ensure rank 0 loads transformer and text encoder print(f"[Rank {local_rank}] Broadcasting weights for text encoder...") text_encoder_list = [text_encoder_state_dict] dist.broadcast_object_list(text_encoder_list, src=0) @@ -113,20 +127,22 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted text encoder...") text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) + dist.barrier() + if offload: + print(f"[Rank {local_rank}] Moving text encoder to cpu...") + self.text_encoder.cpu() + else: + print(f"[Rank {local_rank}] Moving text encoder to {device}...") + self.text_encoder.to(device) + dist.barrier() torch.cuda.empty_cache() - dist.barrier() # 20250423 pftq: Synchronize ranks - - # 20250423 pftq: Load VAE on all ranks with optional staggering to reduce I/O contention - vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") - if use_usp: - # 20250423 pftq: Stagger VAE loading across ranks to avoid contention + + # 20250423 pftq: Stagger VAE loading across ranks for rank in range(dist.get_world_size()): if local_rank == rank: print(f"[Rank {local_rank}] Loading VAE...") self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - dist.barrier() - else: - self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + dist.barrier() self.video_processor = VideoProcessor(vae_scale_factor=16) From 804b869bbb68fd46f91901ac2fb9ef9a603ab2d6 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:37:52 -0700 Subject: [PATCH 083/117] Optimized multi-gpu loading further and use CPU RAM for broadcasting (tested faster and more stable than VRAM). 
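One more piece shared by all three pipelines in this series is the staggered VAE load, isolated here as a sketch: ranks take turns reading the checkpoint, so at most one process hits the disk at a time, trading a little wall-clock for much less I/O contention. Assumes an initialized process group; `load_fn` is any checkpoint-loading callable (e.g. the repo's `get_vae`):

```
import torch.distributed as dist

def staggered_load(load_fn, local_rank):
    result = None
    for turn in range(dist.get_world_size()):
        if local_rank == turn:
            result = load_fn()  # only one rank reads from disk this turn
        dist.barrier()  # everyone waits before the next rank starts
    return result

# vae = staggered_load(lambda: get_vae(vae_model_path, device,
#                                      weight_dtype=torch.float32), local_rank)
```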
--- .../pipelines/text2video_pipeline.py | 71 ++++++++++++------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/skyreels_v2_infer/pipelines/text2video_pipeline.py b/skyreels_v2_infer/pipelines/text2video_pipeline.py index 64d25e1..dbe3a80 100644 --- a/skyreels_v2_infer/pipelines/text2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/text2video_pipeline.py @@ -34,28 +34,27 @@ def __init__( print(f"[Rank {local_rank}] Initializing pipeline components...") - # 20250423 pftq: Load transformer only on rank 0 or single-GPU - if not use_usp or local_rank == 0: - print(f"[Rank {local_rank}] Loading transformer...") + vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") + # 20250423 pftq: Load normally on single gpu + if not use_usp: + print(f"[Rank {local_rank}] Loading transformer to {load_device}...") self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=False) - transformer_state_dict = self.transformer.state_dict() if use_usp else None - else: - print(f"[Rank {local_rank}] Skipping weights for transformer...") - self.transformer = get_transformer(dit_path, load_device, weight_dtype, skip_weights=True) - transformer_state_dict = None - - # 20250423 pftq: Load text encoder only on rank 0 or single-GPU - if not use_usp or local_rank == 0: - print(f"[Rank {local_rank}] Loading text encoder...") + print(f"[Rank {local_rank}] Loading text encoder to {load_device}...") self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=False) - text_encoder_state_dict = self.text_encoder.state_dict() if use_usp else None - else: - print(f"[Rank {local_rank}] Initializing empty text encoder...") - self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype, skip_weights=True) - text_encoder_state_dict = None + print(f"[Rank {local_rank}] Loading VAE...") + self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) # 20250423 pftq: Broadcast transformer from rank 0 if use_usp: + broadcast_device = "cpu" # tested to be more stable to start with cpu broadcast even if you have an H100 + if local_rank == 0: + print(f"[Rank {local_rank}] Loading transformer to {broadcast_device}...") + self.transformer = get_transformer(dit_path, broadcast_device, weight_dtype, skip_weights=False) + transformer_state_dict = self.transformer.state_dict() + else: + print(f"[Rank {local_rank}] Skipping transformer load...") + self.transformer = get_transformer(dit_path, broadcast_device, weight_dtype, skip_weights=True) + transformer_state_dict = None dist.barrier() # Ensure rank 0 loads transformer and text encoder transformer_list = [transformer_state_dict] print(f"[Rank {local_rank}] Broadcasting weights for transformer...") @@ -65,10 +64,26 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted transformer...") transformer_state_dict = transformer_list[0] self.transformer.load_state_dict(transformer_state_dict) + dist.barrier() + if offload: + print(f"[Rank {local_rank}] Moving transformer to cpu...") + self.transformer.cpu() + else: + print(f"[Rank {local_rank}] Moving transformer to {device}...") + self.transformer.to(device) + dist.barrier() torch.cuda.empty_cache() - dist.barrier() # 20250423 pftq: Synchronize ranks - + # 20250423 pftq: Broadcast text encoder weights from rank 0 + if local_rank == 0: + print(f"[Rank {local_rank}] Loading text encoder to {broadcast_device}...") + self.text_encoder = get_text_encoder(model_path, broadcast_device, weight_dtype, skip_weights=False) + text_encoder_state_dict = 
self.text_encoder.state_dict() + else: + print(f"[Rank {local_rank}] Skipping text encoder load...") + self.text_encoder = get_text_encoder(model_path, broadcast_device, weight_dtype, skip_weights=True) + text_encoder_state_dict = None + dist.barrier() # Ensure rank 0 loads transformer and text encoder print(f"[Rank {local_rank}] Broadcasting weights for text encoder...") text_encoder_list = [text_encoder_state_dict] dist.broadcast_object_list(text_encoder_list, src=0) @@ -77,20 +92,22 @@ def __init__( print(f"[Rank {local_rank}] Loading broadcasted text encoder...") text_encoder_state_dict = text_encoder_list[0] self.text_encoder.load_state_dict(text_encoder_state_dict) + dist.barrier() + if offload: + print(f"[Rank {local_rank}] Moving text encoder to cpu...") + self.text_encoder.cpu() + else: + print(f"[Rank {local_rank}] Moving text encoder to {device}...") + self.text_encoder.to(device) + dist.barrier() torch.cuda.empty_cache() - dist.barrier() # Synchronize ranks - # 20250423 pftq: Stagger VAE loading across ranks - vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth") - if use_usp: + # 20250423 pftq: Stagger VAE loading across ranks for rank in range(dist.get_world_size()): if local_rank == rank: print(f"[Rank {local_rank}] Loading VAE...") self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) - dist.barrier() - else: - print(f"[Rank {local_rank}] Loading VAE...") - self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32) + dist.barrier() self.video_processor = VideoProcessor(vae_scale_factor=16) self.sp_size = 1 From d9dc69deda994aa96e24768d5bca7245321e069a Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:39:16 -0700 Subject: [PATCH 084/117] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b6827b..780a027 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## Changes from pftq: - Added seed synchronization code to allow random seed with multi-GPU. -- Fixed 20-min load time on multi-GPU due to contention (all GPUs loading models at once). +- Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs). - Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library. - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. From 9a9f0ef0ec407171fa3e2cc13f85f05cd45d2fe4 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:50:49 -0700 Subject: [PATCH 085/117] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 780a027..4b0acbe 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ## Changes from pftq: -- Added seed synchronization code to allow random seed with multi-GPU. -- Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs). -- Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library. +- Added seed synchronization code to allow random seed with multi-GPU (https://github.com/SkyworkAI/SkyReels-V2/issues/24). 
+- Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs) (https://github.com/SkyworkAI/SkyReels-V2/issues/28). +- Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library (https://github.com/SkyworkAI/SkyReels-V2/issues/37). - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). From 06ec81833216ef61211f898235b5d575ab720412 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:55:15 -0700 Subject: [PATCH 086/117] Reorder code to resolve merge conflict with main branch. --- generate_video.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/generate_video.py b/generate_video.py index 7b8e5fd..5240167 100644 --- a/generate_video.py +++ b/generate_video.py @@ -45,6 +45,12 @@ parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) parser.add_argument("--seed", type=int, default=-1) + + parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video + parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing + parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt + + parser.add_argument( "--prompt", type=str, @@ -52,10 +58,6 @@ ) parser.add_argument("--prompt_enhancer", action="store_true") - parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video - parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing - parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt - args = parser.parse_args() args.model_id = download_model(args.model_id) From 8697eb75cda105b04fc312d23aac2ae67279c08f Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:56:58 -0700 Subject: [PATCH 087/117] Update generate_video.py --- generate_video.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/generate_video.py b/generate_video.py index 5240167..7d071b0 100644 --- a/generate_video.py +++ b/generate_video.py @@ -44,20 +44,18 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") 
parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=-1) + parser.add_argument("--seed", type=int, default=None) parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt - parser.add_argument( "--prompt", type=str, default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.", ) parser.add_argument("--prompt_enhancer", action="store_true") - args = parser.parse_args() args.model_id = download_model(args.model_id) From ba0106e448542a3277ef2af7650367cd722e9251 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:57:56 -0700 Subject: [PATCH 088/117] Reorder code to resolve merge conflict with main branch. --- generate_video_df.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 0e8e5ce..bc8b022 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -34,18 +34,18 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=-1) + parser.add_argument("--seed", type=int, default=None) + + parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video + parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing + parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt + parser.add_argument( "--prompt", type=str, default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.", ) parser.add_argument("--prompt_enhancer", action="store_true") - - parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video - parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing - parser.add_argument("--negative_prompt", type=str, default="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, 
incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards") # 20250422 pftq: expose negative prompt - args = parser.parse_args() args.model_id = download_model(args.model_id) From a658cd8f89a977a986b3f3bcaa15711f8dd90ff7 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 00:59:31 -0700 Subject: [PATCH 089/117] Reorder code to resolve merge conflict with main branch. --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4b0acbe..23cefe8 100644 --- a/README.md +++ b/README.md @@ -91,11 +91,11 @@ Change "DF" to "I2V' or "T2V" accordingly if you don't want to use the infinite
length generation.
--- -Welcome to the SkyReels V2 repository! Here, you'll find the model weights and inference code for our infinite-length film generative models +Welcome to the **SkyReels V2** repository! Here, you'll find the model weights and inference code for our infinite-length film generative models. To the best of our knowledge, it represents the first open-source video generative model employing **AutoRegressive Diffusion-Forcing architecture** that achieves the **SOTA performance** among publicly available models. ## 🔥🔥🔥 News!! - +* Apr 24, 2025: 🔥 We release the 720P models, [SkyReels-V2-DF-14B-720P](https://huggingface.co/Skywork/SkyReels-V2-DF-14B-720P) and [SkyReels-V2-I2V-14B-720P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-720P). The former facilitates infinite-length autoregressive video generation, and the latter focuses on Image2Video synthesis. * Apr 21, 2025: 👋 We release the inference code and model weights of [SkyReels-V2](https://huggingface.co/collections/Skywork/skyreels-v2-6801b1b93df627d441d0d0d9) Series Models and the video captioning model [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1) . * Apr 3, 2025: 🔥 We also release [SkyReels-A2](https://github.com/SkyworkAI/SkyReels-A2). This is an open-sourced controllable video generation framework capable of assembling arbitrary visual elements. * Feb 18, 2025: 🔥 we released [SkyReels-A1](https://github.com/SkyworkAI/SkyReels-A1). This is an open-sourced and effective framework for portrait image animation. @@ -178,7 +178,7 @@ You can download our models from Hugging Face: 14B-720P 720 * 1280 * 121f - Coming Soon + 🤗 Huggingface 🤖 ModelScope Text-to-Video @@ -230,7 +230,7 @@ You can download our models from Hugging Face: 14B-720P 720 * 1280 * 121f - Coming Soon + 🤗 Huggingface 🤖 ModelScope Camera Director From fe8c7a44c68fe1f9e8f1977bd40cf805f2be27a2 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 02:14:04 -0700 Subject: [PATCH 090/117] Update generate_video_df.py --- generate_video_df.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index bc8b022..85e521c 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -34,7 +34,7 @@ parser.add_argument("--use_usp", action="store_true") parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--seed", type=int, default=-1) parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing @@ -179,7 +179,7 @@ #20250422 pftq: Always broadcast seed to ensure consistency if local_rank == 0: - if args.seed == -1 or args.seed is None or idx > 0: + if args.seed == -1 or idx > 0: args.seed = int(random.randrange(4294967294)) seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") dist.broadcast(seed_tensor, src=0) From d4e78158b667ae5e08f421a0c8d7593b8a76b110 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 02:14:20 -0700 Subject: [PATCH 091/117] Update generate_video.py --- generate_video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_video.py b/generate_video.py index 7d071b0..2ddd4ae 100644 --- a/generate_video.py +++ b/generate_video.py @@ -44,7 +44,7 @@ parser.add_argument("--use_usp", action="store_true") 
parser.add_argument("--offload", action="store_true") parser.add_argument("--fps", type=int, default=24) - parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--seed", type=int, default=-1) parser.add_argument("--batch_size", type=int, default=1) # 20250422 pftq: Batch functionality to avoid reloading the model each video parser.add_argument("--preserve_image_aspect_ratio", action="store_true") # 20250422 pftq: Avoid resizing @@ -178,7 +178,7 @@ #20250422 pftq: Always broadcast seed to ensure consistency if local_rank == 0: - if args.seed == -1 or args.seed is None or idx > 0: + if args.seed == -1 or idx > 0: args.seed = int(random.randrange(4294967294)) seed_tensor = torch.tensor(args.seed, dtype=torch.int64, device="cuda") dist.broadcast(seed_tensor, src=0) From 8dda10336eec114b44d9efe233acd882dd2c5910 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 02:21:16 -0700 Subject: [PATCH 092/117] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 23cefe8..1522f2c 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ - Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. -Note: The TeaCache update breaks the code (see https://github.com/SkyworkAI/SkyReels-V2/issues/36) so that is currently excluded from this fork. - Easy install instructions for those using Runpod like me: ``` #create once on new pod From e1f58289b3988181debf223b30e21d1113466bb5 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 03:09:29 -0700 Subject: [PATCH 093/117] Fixed OOM on lower VRAM cards from model loaded twice. --- skyreels_v2_infer/pipelines/image2video_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index 3b4f840..3bef8cd 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -228,7 +228,7 @@ def __call__( "y": y, } - self.transformer.to(self.device) + #self.transformer.to(self.device) # 20250425 pftq: loaded twice, causes OOM on 48GB VRAM cards for _, t in enumerate(tqdm(timesteps)): latent_model_input = torch.stack([latent]).to(self.device) timestep = torch.stack([t]).to(self.device) From 6a1643615564a32024330561c927116474e98afa Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 03:10:44 -0700 Subject: [PATCH 094/117] Fixed OOM on lower VRAM cards from model loaded twice. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1522f2c..fb51c2a 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ - Added seed synchronization code to allow random seed with multi-GPU (https://github.com/SkyworkAI/SkyReels-V2/issues/24). - Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs) (https://github.com/SkyworkAI/SkyReels-V2/issues/28). - Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library (https://github.com/SkyworkAI/SkyReels-V2/issues/37). +- Fixed Out-of-Memory on I2V mode for <=50GB VRAM GPUs due to transformer model being loaded twice. - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. 
- Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). From c745c0a00b821bcb45f1880020c044bfae150bee Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 03:18:59 -0700 Subject: [PATCH 095/117] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb51c2a..67ca3d5 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ - Added seed synchronization code to allow random seed with multi-GPU (https://github.com/SkyworkAI/SkyReels-V2/issues/24). - Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs) (https://github.com/SkyworkAI/SkyReels-V2/issues/28). - Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library (https://github.com/SkyworkAI/SkyReels-V2/issues/37). -- Fixed Out-of-Memory on I2V mode for <=50GB VRAM GPUs due to transformer model being loaded twice. +- Removed duplicate model loading line on I2V pipe - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. - Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). From f20aeda1b131cea0f8a913b796a2ee1569d8f9d3 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 03:21:10 -0700 Subject: [PATCH 096/117] Removed duplicate transformer loading line --- skyreels_v2_infer/pipelines/image2video_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyreels_v2_infer/pipelines/image2video_pipeline.py b/skyreels_v2_infer/pipelines/image2video_pipeline.py index 3bef8cd..7bf05d6 100644 --- a/skyreels_v2_infer/pipelines/image2video_pipeline.py +++ b/skyreels_v2_infer/pipelines/image2video_pipeline.py @@ -228,7 +228,7 @@ def __call__( "y": y, } - #self.transformer.to(self.device) # 20250425 pftq: loaded twice, causes OOM on 48GB VRAM cards + #self.transformer.to(self.device) # 20250425 pftq: loaded twice for _, t in enumerate(tqdm(timesteps)): latent_model_input = torch.stack([latent]).to(self.device) timestep = torch.stack([t]).to(self.device) From ec64dd58e4d227cc4c0406a010f3bd840f600e07 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 03:35:26 -0700 Subject: [PATCH 097/117] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 67ca3d5..1522f2c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ - Added seed synchronization code to allow random seed with multi-GPU (https://github.com/SkyworkAI/SkyReels-V2/issues/24). - Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs) (https://github.com/SkyworkAI/SkyReels-V2/issues/28). - Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library (https://github.com/SkyworkAI/SkyReels-V2/issues/37). -- Removed duplicate model loading line on I2V pipe - Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. 
- Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. - Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). From 16403e79ba0f4f8e1a8c656d2cb00ca1587296a9 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 04:34:09 -0700 Subject: [PATCH 098/117] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1522f2c..f5b390e 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ source /workspace/venv/bin/activate cd /workspace/SkyReels-V2 ``` +Example prompts below. If you run into memory/vram issues, you can reduce the base_num_frames while still having the same higher number on num_frames. The point of the DF model is that now the whole video doesn't have to fit in VRAM and can be done in chunks. + Example prompt (multi-GPU): ``` model_id=Skywork/SkyReels-V2-DF-14B-540P From 55d4d629d195f3b655375db42672b64f930ede8b Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 06:25:52 -0700 Subject: [PATCH 099/117] Fix merge conflict --- generate_video_df.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index 5fa335f..501bd65 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -106,7 +106,8 @@ guidance_scale = args.guidance_scale shift = args.shift - #image = load_image(args.image).convert("RGB") if args.image else None + image = load_image(args.image).convert("RGB") if args.image else None + negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" #20250422 pftq: Add error handling for image loading, aspect ratio preservation image = None From 0124fc51cb759dcdd42081b39fc8db5cc8ad6331 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 06:30:33 -0700 Subject: [PATCH 100/117] Fix merge conflict --- generate_video_df.py | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 501bd65..bd50b88 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -64,25 +64,6 @@ #20250422 pftq: unneeded with seed synchronization code #assert (args.use_usp and args.seed != -1) or (not args.use_usp), "usp mode requires a valid seed" - local_rank = 0 - if args.use_usp: - assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." 
- from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment - import torch.distributed as dist - - dist.init_process_group("nccl") - local_rank = dist.get_rank() - torch.cuda.set_device(dist.get_rank()) - device = "cuda" - - init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) - - initialize_model_parallel( - sequence_parallel_degree=dist.get_world_size(), - ring_degree=1, - ulysses_degree=dist.get_world_size(), - ) - if args.resolution == "540P": height = 544 width = 960 @@ -109,6 +90,27 @@ image = load_image(args.image).convert("RGB") if args.image else None negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + save_dir = os.path.join("result", args.outdir) + os.makedirs(save_dir, exist_ok=True) + local_rank = 0 + if args.use_usp: + assert not args.prompt_enhancer, "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." + from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment + import torch.distributed as dist + + dist.init_process_group("nccl") + local_rank = dist.get_rank() + torch.cuda.set_device(dist.get_rank()) + device = "cuda" + + init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) + + initialize_model_parallel( + sequence_parallel_degree=dist.get_world_size(), + ring_degree=1, + ulysses_degree=dist.get_world_size(), + ) + #20250422 pftq: Add error handling for image loading, aspect ratio preservation image = None if args.image: @@ -143,9 +145,6 @@ negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt - save_dir = os.path.join("result", args.outdir) - os.makedirs(save_dir, exist_ok=True) - prompt_input = args.prompt if args.prompt_enhancer and args.image is None: print(f"init prompt enhancer") From 9ce9a1c819b550a867a5fe221e46bd496784def3 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 06:33:35 -0700 Subject: [PATCH 101/117] Fix merge conflict --- generate_video_df.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index bd50b88..03bf790 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -87,8 +87,8 @@ guidance_scale = args.guidance_scale shift = args.shift - image = load_image(args.image).convert("RGB") if args.image else None - negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + #image = load_image(args.image).convert("RGB") if args.image else None # 20250422 pftq: redefined further below to fix resizing/crop and allow maintaining aspect ratio + #negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" # 20250422 pftq: redefined further below to allow input from commandline save_dir = os.path.join("result", args.outdir) os.makedirs(save_dir, exist_ok=True) From 8151808af65025dc7eed24ee55fa512c88d82f2b Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 06:34:25 -0700 Subject: [PATCH 102/117] Fix merge conflict --- generate_video_df.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 
03bf790..fa947cb 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -87,8 +87,8 @@ guidance_scale = args.guidance_scale shift = args.shift - #image = load_image(args.image).convert("RGB") if args.image else None # 20250422 pftq: redefined further below to fix resizing/crop and allow maintaining aspect ratio - #negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" # 20250422 pftq: redefined further below to allow input from commandline + image = load_image(args.image).convert("RGB") if args.image else None + negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" save_dir = os.path.join("result", args.outdir) os.makedirs(save_dir, exist_ok=True) From e886e8e496eadf51de669223e4408f0cf6c58aa9 Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 06:34:56 -0700 Subject: [PATCH 103/117] Fix merge conflict --- generate_video_df.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_video_df.py b/generate_video_df.py index fa947cb..bd50b88 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -88,7 +88,7 @@ guidance_scale = args.guidance_scale shift = args.shift image = load_image(args.image).convert("RGB") if args.image else None - negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" save_dir = os.path.join("result", args.outdir) os.makedirs(save_dir, exist_ok=True) From 5b6c1243184e426f9e400bad4849c1054ef4d68c Mon Sep 17 00:00:00 2001 From: pftq Date: Fri, 25 Apr 2025 06:50:21 -0700 Subject: [PATCH 104/117] Reapplied image aspect ratio code and negative prompt parameter after resolving merge conflicts. --- generate_video_df.py | 57 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 4a740c9..29beb86 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -11,7 +11,6 @@ from skyreels_v2_infer import DiffusionForcingPipeline from skyreels_v2_infer.modules import download_model -from skyreels_v2_infer.pipelines import resizecrop from skyreels_v2_infer.pipelines import PromptEnhancer from skyreels_v2_infer.pipelines import resizecrop @@ -90,6 +89,7 @@ guidance_scale = args.guidance_scale shift = args.shift + """ if args.image: args.image = load_image(args.image) image_width, image_height = args.image.size @@ -97,31 +97,8 @@ height, width = width, height args.image = resizecrop(args.image, height, width) image = args.image.convert("RGB") if args.image else None - negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" - - save_dir = os.path.join("result", args.outdir) - os.makedirs(save_dir, exist_ok=True) - local_rank = 0 - if args.use_usp: - assert ( - not args.prompt_enhancer - ), "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." 
- from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment - import torch.distributed as dist - - dist.init_process_group("nccl") - local_rank = dist.get_rank() - torch.cuda.set_device(dist.get_rank()) - device = "cuda" - - init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) - - initialize_model_parallel( - sequence_parallel_degree=dist.get_world_size(), - ring_degree=1, - ulysses_degree=dist.get_world_size(), - ) - + """ + #20250422 pftq: Add error handling for image loading, aspect ratio preservation image = None if args.image: @@ -151,10 +128,32 @@ image = resizecrop(image, height, width) except Exception as e: raise ValueError(f"Failed to load or process image: {e}") - - print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) + #negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt + + save_dir = os.path.join("result", args.outdir) + os.makedirs(save_dir, exist_ok=True) + local_rank = 0 + if args.use_usp: + assert ( + not args.prompt_enhancer + ), "`--prompt_enhancer` is not allowed if using `--use_usp`. We recommend running the skyreels_v2_infer/pipelines/prompt_enhancer.py script first to generate enhanced prompt before enabling the `--use_usp` parameter." + from xfuser.core.distributed import initialize_model_parallel, init_distributed_environment + import torch.distributed as dist + + dist.init_process_group("nccl") + local_rank = dist.get_rank() + torch.cuda.set_device(dist.get_rank()) + device = "cuda" + + init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) + + initialize_model_parallel( + sequence_parallel_degree=dist.get_world_size(), + ring_degree=1, + ulysses_degree=dist.get_world_size(), + ) prompt_input = args.prompt if args.prompt_enhancer and args.image is None: @@ -166,6 +165,8 @@ gc.collect() torch.cuda.empty_cache() + print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) + # 20250423 pftq: fixed 20-min load times on multi-GPU caused by contention, reduced down to 12 min roughly the same as single GPU. print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) starttime = time.time() From 8b88304e5354bebe167f7b0a709aa31880c2a252 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:18:04 -0700 Subject: [PATCH 105/117] Integrated prompt travel and video input from chaojie and fixed merged conflicts. 
--- generate_video_df.py | 136 ++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 73 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 5f9ac45..85e5219 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -8,7 +8,7 @@ import imageio from PIL import Image #20250422 pftq: Added for image resizing and cropping import numpy as np #20250422 pftq: Added for seed synchronization -from diffusers.utils import load_image, load_video +from diffusers.utils import load_video # 20250425 chaojie prompt travel & video input from skyreels_v2_infer import DiffusionForcingPipeline from skyreels_v2_infer.modules import download_model @@ -23,7 +23,7 @@ parser.add_argument("--resolution", type=str, choices=["540P", "720P"]) parser.add_argument("--num_frames", type=int, default=97) parser.add_argument("--image", type=str, default=None) - parser.add_argument("--video", type=str, default=None) + parser.add_argument("--video", type=str, default=None) # 20250425 chaojie prompt travel & video input parser.add_argument("--ar_step", type=int, default=0) parser.add_argument("--causal_attention", action="store_true") parser.add_argument("--causal_block_size", type=int, default=1) @@ -44,7 +44,7 @@ parser.add_argument( "--prompt", - nargs="+", + nargs="+", # 20250425 chaojie prompt travel & video input type=str, default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.", ) @@ -101,32 +101,48 @@ args.image = resizecrop(args.image, height, width) image = args.image.convert("RGB") if args.image else None """ - - #20250422 pftq: Add error handling for image loading, aspect ratio preservation - image = None - if args.image: - try: - image = load_image(args.image).convert("RGB") + # 20250425 chaojie prompt travel & video input video = [] - pre_video_length = 17 - if args.overlap_history is not None: - pre_video_length = args.overlap_history if args.video: + pre_video_length = 17 + if args.overlap_history is not None: + pre_video_length = args.overlap_history args.video = load_video(args.video) arg_width = width arg_height = height for img in args.video: - image_width, image_height = img.size - if image_height > image_width: - height, width = arg_width, arg_height - img = resizecrop(img, height, width) + # 20250422 pftq: option to preserve image aspect ratio + if args.preserve_image_aspect_ratio: + img_width, img_height = img.size + if img_height > img_width: + height, width = width, height + width = int(height / img_height * img_width) + else: + height = int(width / img_width * img_height) + + divisibility=16 + if width%divisibility!=0: + width = width - (width%divisibility) + if height%divisibility!=0: + height = height - (height%divisibility) + + img = resizecrop(img, height, width) + else: + image_width, image_height = img.size + if image_height > image_width: + height, width = width, height + img = resizecrop(img, height, width) video.append(img.convert("RGB").resize((width, height))) video = video[-pre_video_length:] else: video = None - - negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + + #20250422 pftq: Add error handling for image loading, aspect ratio preservation + image = None + if args.image and not args.video: + try: + image = 
load_image(args.image).convert("RGB") # 20250422 pftq: option to preserve image aspect ratio if args.preserve_image_aspect_ratio: @@ -154,7 +170,7 @@ #negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" negative_prompt = args.negative_prompt # 20250422 pftq: allow editable negative prompt - + save_dir = os.path.join("result", args.outdir) os.makedirs(save_dir, exist_ok=True) local_rank = 0 @@ -188,7 +204,7 @@ gc.collect() torch.cuda.empty_cache() - print(f"Rank {local_rank}: {width}x{height} | Image: "+str(image!=None)) + print(f"Rank {local_rank}: {width}x{height} | Image Input: "+str(image!=None) + " | Video Input: "+str(video!=None)) # 20250423 pftq: fixed 20-min load times on multi-GPU caused by contention, reduced down to 12 min roughly the same as single GPU. print("Initializing pipe at "+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) @@ -206,23 +222,16 @@ if args.causal_attention: pipe.transformer.set_ar_attention(args.causal_block_size) - + if args.teacache: if args.ar_step > 0: - num_steps = ( - args.inference_steps - + (((args.base_num_frames - 1) // 4 + 1) // args.causal_block_size - 1) * args.ar_step - ) - print("num_steps:", num_steps) + num_steps = args.inference_steps + (((args.base_num_frames - 1)//4 + 1) // args.causal_block_size - 1) * args.ar_step + print('num_steps:', num_steps) else: num_steps = args.inference_steps - pipe.transformer.initialize_teacache( - enable_teacache=True, - num_steps=num_steps, - teacache_thresh=args.teacache_thresh, - use_ret_steps=args.use_ret_steps, - ckpt_dir=args.model_id, - ) + pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=num_steps, + teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps, + ckpt_dir=args.model_id) #20250422 pftq: Set preferred linear algebra backend to avoid cuSOLVER issues torch.backends.cuda.preferred_linalg_library("default") # or try "magma" if available @@ -264,6 +273,21 @@ np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) + + + current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" + + # 20250422 pftq: more useful filename + gpucount = "" + if args.use_usp and dist.get_world_size(): + gpucount = "_"+str(dist.get_world_size())+"xGPU" + prompt_summary = "" + if type(args.prompt) is list: + prompt_summary = args.prompt[0][:100].replace('/','') + else: + prompt_summary = args.prompt[:100].replace('/','') + video_out_file = f"{current_time}_skyreels2df_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{prompt_summary}_{idx}.mp4" with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): video_frames = pipe( @@ -283,48 +307,14 @@ ar_step=args.ar_step, causal_block_size=args.causal_block_size, fps=fps, + + # 20250425 chaojie prompt travel & video input + video=video, + local_rank=local_rank, + save_dir=save_dir, + video_out_file=video_out_file, )[0] if local_rank == 0: - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - #video_out_file = f"{args.prompt[:100].replace('/','')}_{args.seed}_{current_time}.mp4" - - # 20250422 pftq: more useful filename - gpucount = "" - if args.use_usp and dist.get_world_size(): - gpucount = "_"+str(dist.get_world_size())+"xGPU" - video_out_file = 
f"{current_time}_skyreels2df_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{args.prompt[:100].replace('/','')}_{idx}.mp4" - output_path = os.path.join(save_dir, video_out_file) imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) - output_path = "" - current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - video_out_file = f"{args.prompt[0][:100].replace('/','')}_{args.seed}_{current_time}.mp4" - - with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad(): - video_frames = pipe( - prompt=prompt_input, - negative_prompt=negative_prompt, - image=image, - video=video, - height=height, - width=width, - num_frames=num_frames, - num_inference_steps=args.inference_steps, - shift=shift, - guidance_scale=guidance_scale, - generator=torch.Generator(device="cuda").manual_seed(args.seed), - overlap_history=args.overlap_history, - addnoise_condition=args.addnoise_condition, - base_num_frames=args.base_num_frames, - ar_step=args.ar_step, - causal_block_size=args.causal_block_size, - fps=fps, - local_rank=local_rank, - save_dir=save_dir, - video_out_file=video_out_file, - )[0] - - if local_rank == 0: - output_path = os.path.join(save_dir, video_out_file) - imageio.mimwrite(output_path, video_frames, fps=fps, quality=8, output_params=["-loglevel", "error"]) From e13fc6af0f0315bd8cf9d18e5429c7922d08370a Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:18:58 -0700 Subject: [PATCH 106/117] Integrated prompt travel, video input, and partial video outputs from chaojie and fixed merged conflicts. --- .../pipelines/diffusion_forcing_pipeline.py | 56 +++++++++++++------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 66c8b48..31b5878 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -5,7 +5,7 @@ from typing import Tuple from typing import Union -import imageio +import imageio # 20250425 chaojie prompt travel & video input import numpy as np import torch from diffusers.image_processor import PipelineImageInput @@ -182,6 +182,7 @@ def encode_image( predix_video_latent_length = prefix_video[0].shape[1] return prefix_video, predix_video_latent_length + # 20250425 chaojie prompt travel & video input def encode_video( self, video: List[PipelineImageInput], height: int, width: int, num_frames: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -201,7 +202,7 @@ def encode_video( prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len] predix_video_latent_length = prefix_video[0].shape[1] return prefix_video, predix_video_latent_length - + def prepare_latents( self, shape: Tuple[int], @@ -256,7 +257,7 @@ def generate_timestep_matrix( update_mask.append( (new_row != pre_row) & (new_row != num_iterations) - ) # False: no need to update, True: need to update + ) # False: no need to update, True: need to update step_index.append(new_row) step_matrix.append(step_template[new_row]) pre_row = new_row @@ -290,10 +291,11 @@ def generate_timestep_matrix( @torch.no_grad() def __call__( self, - prompt, + #prompt: Union[str, List[str]], + prompt, # 20250425 chaojie prompt travel & video input negative_prompt: Union[str, List[str]] = "", image: PipelineImageInput = None, - video: List[PipelineImageInput] = None, + video: 
List[PipelineImageInput] = None, # 20250425 chaojie prompt travel & video input height: int = 480, width: int = 832, num_frames: int = 97, @@ -307,6 +309,8 @@ def __call__( ar_step: int = 5, causal_block_size: int = None, fps: int = 24, + + # 20250425 chaojie prompt travel & video input local_rank: int = 0, save_dir: str = "", video_out_file: str = "", @@ -322,10 +326,13 @@ def __call__( predix_video_latent_length = 0 if image: prefix_video, predix_video_latent_length = self.encode_image(image, height, width, num_frames) + # 20250425 chaojie prompt travel & video input elif video: prefix_video, predix_video_latent_length = self.encode_video(video, height, width, num_frames) self.text_encoder.to(self.device) + #prompt_embeds = self.text_encoder.encode(prompt).to(self.transformer.dtype) + # 20250425 chaojie prompt travel & video input prompt_embeds_list = [] if type(prompt) is list: for prompt_iter in prompt: @@ -436,11 +443,18 @@ def __call__( n_iter = 1 + (latent_length - base_num_frames - 1) // (base_num_frames - overlap_history_frames) + 1 print(f"n_iter:{n_iter}") output_video = None + #for i in range(n_iter): + #if output_video is not None: # i !=0 + # 20250425 chaojie prompt travel & video input for i_n_iter in range(n_iter): if type(prompt) is list: if len(prompt) > i_n_iter: prompt_embeds = prompt_embeds_list[i_n_iter] + if local_rank == 0: + partnum = i_n_iter + 1 + print(f"Generating part {partnum} of {n_iter}: "+prompt[i_n_iter]) # 20250425 pftq if output_video is not None: # i_n_iter !=0 + prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device) prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)] if prefix_video[0].shape[1] % causal_block_size != 0: @@ -448,13 +462,15 @@ def __call__( print("the length of prefix video is truncated for the casual block size alignment.") prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len] predix_video_latent_length = prefix_video[0].shape[1] - finished_frame_num = i_n_iter * (base_num_frames - overlap_history_frames) + overlap_history_frames + #finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames + finished_frame_num = i_n_iter * (base_num_frames - overlap_history_frames) + overlap_history_frames # 20250425 chaojie prompt travel & video input left_frame_num = latent_length - finished_frame_num base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames) if ar_step > 0 and self.transformer.enable_teacache: num_steps = num_inference_steps + ((base_num_frames_iter - overlap_history_frames) // causal_block_size - 1) * ar_step self.transformer.num_steps = num_steps - else: # i_n_iter == 0 + #else: # i == 0 + else: # i_n_iter == 0 # 20250425 chaojie prompt travel & video input base_num_frames_iter = base_num_frames latent_shape = [16, base_num_frames_iter, latent_height, latent_width] latents = self.prepare_latents( @@ -536,24 +552,28 @@ def __call__( self.transformer.cpu() torch.cuda.empty_cache() x0 = latents[0].unsqueeze(0) - mid_output_video = self.vae.decode(x0) - videos = [mid_output_video[0]] + videos = [self.vae.decode(x0)[0]] + if output_video is None: + output_video = videos[0].clamp(-1, 1).cpu() # c, f, h, w + else: + output_video = torch.cat( + [output_video, videos[0][:, overlap_history:].clamp(-1, 1).cpu()], 1 + ) # c, f, h, w + + # 20250425 chaojie prompt travel & video input if local_rank == 0: - mid_output_video = (mid_output_video / 2 + 0.5).clamp(0, 1) + videonum = i_n_iter + 1 + print(f"Saving 
partial video {videonum} of {n_iter}...") # 20250425 pftq + mid_output_video = output_video + mid_output_video = [(mid_output_video / 2 + 0.5).clamp(0, 1)] mid_output_video = [video for video in mid_output_video] mid_output_video = [video.permute(1, 2, 3, 0) * 255 for video in mid_output_video] mid_output_video = [video.cpu().numpy().astype(np.uint8) for video in mid_output_video] - mid_video_out_file = f"mid_{i_n_iter}_{video_out_file}" + mid_video_out_file = video_out_file.replace(".mp4", f"_partial{i_n_iter}.mp4") mid_output_path = os.path.join(save_dir, mid_video_out_file) imageio.mimwrite(mid_output_path, mid_output_video[0], fps=fps, quality=8, output_params=["-loglevel", "error"]) - - if output_video is None: - output_video = videos[0].clamp(-1, 1).cpu() # c, f, h, w - else: - output_video = torch.cat( - [output_video, videos[0][:, overlap_history:].clamp(-1, 1).cpu()], 1 - ) # c, f, h, w + output_video = [(output_video / 2 + 0.5).clamp(0, 1)] output_video = [video for video in output_video] output_video = [video.permute(1, 2, 3, 0) * 255 for video in output_video] From 84db4d335e1cb3832ea314055ce679c292aa41e9 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:23:34 -0700 Subject: [PATCH 107/117] Integrated prompt travel, video input, and partial video outputs from chaojie and fixed merged conflicts. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f5b390e..6640c5c 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ - Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. +## Additional changes merged from chaojie's fork from https://github.com/SkyworkAI/SkyReels-V2/pull/12 (special thanks for permission to do so): +- Prompt travel, allow multiple text strings in the --prompt parameter to guide the video differently each chunk of base_num_frames. +- Video input via --video parameter, allow continuing/extending from a video. +- Partially complete videos will be output as each chunk of base_num_frames completes. In combination with the --video paramater, this lets you effectively resume from a previous render as well as abort mid-render if the videos take a turn you don't like. Extremely useful for saving time and "watching" as the renders complete rather than committing the full time. + Easy install instructions for those using Runpod like me: ``` #create once on new pod From 1011b49fae3d215077e4e96fe2d8cec6e3331e8a Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:28:06 -0700 Subject: [PATCH 108/117] Integrated prompt travel, video input, and partial video outputs from chaojie and fixed merged conflicts. --- README.md | 64 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 6640c5c..d9762c3 100644 --- a/README.md +++ b/README.md @@ -13,32 +13,10 @@ - Video input via --video parameter, allow continuing/extending from a video. - Partially complete videos will be output as each chunk of base_num_frames completes. In combination with the --video paramater, this lets you effectively resume from a previous render as well as abort mid-render if the videos take a turn you don't like. Extremely useful for saving time and "watching" as the renders complete rather than committing the full time. 
-Easy install instructions for those using Runpod like me: -``` -#create once on new pod -export HF_HOME=/workspace/ -export TZ=America/Los_Angeles -python -m venv venv -git clone https://github.com/pftq/SkyReels-V2_Improvements -mv SkyReels-V2_Improvements SkyReels-V2 -cd /workspace/SkyReels-V2 -source /workspace/venv/bin/activate -pip install torch==2.5.1 -pip install --upgrade wheel setuptools -pip install packaging -pip install -r requirements.txt --no-build-isolation -deactivate - -#always run at the start to use persisting drive -export HF_HOME=/workspace/ -export TZ=America/Los_Angeles -source /workspace/venv/bin/activate -cd /workspace/SkyReels-V2 -``` - Example prompts below. If you run into memory/vram issues, you can reduce the base_num_frames while still having the same higher number on num_frames. The point of the DF model is that now the whole video doesn't have to fit in VRAM and can be done in chunks. -Example prompt (multi-GPU): +Multi-GPU with video input and prompt travel, batch of 10, preserving aspect ratio. +Change --video "video.mp4" to --image "image.jpg" if you want to load a starting image instead. ``` model_id=Skywork/SkyReels-V2-DF-14B-540P gpu_count=2 @@ -53,15 +31,18 @@ torchrun --nproc_per_node=${gpu_count} generate_video_df.py \ --guidance_scale 6 \ --batch_size 10 \ --preserve_image_aspect_ratio \ - --image "image.jpg" \ - --prompt "" \ + --video "video.mp4" \ + --prompt "The first thing he does" \ + "The second thing he does." \ + "The third thing he does." \ --negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ --addnoise_condition 20 \ --use_usp \ --offload ``` -Single GPU: +Single GPU with video input and prompt travel, batch of 10, preserving aspect ratio. +Change --video "video.mp4" to --image "image.jpg" if you want to load a starting image instead. ``` model_id=Skywork/SkyReels-V2-DF-14B-540P python3 generate_video_df.py \ @@ -75,14 +56,37 @@ python3 generate_video_df.py \ --guidance_scale 6 \ --batch_size 10 \ --preserve_image_aspect_ratio \ - --image "image.jpg" \ - --prompt "" \ + --video "video.mp4" \ + --prompt "The first thing he does" \ + "The second thing he does." \ + "The third thing he does." \ --negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ --addnoise_condition 20 \ --offload ``` -Change "DF" to "I2V' or "T2V" accordingly if you don't want to use the infinite length version of the model. 
+Easy install instructions for those using Runpod like me: +``` +#create once on new pod +export HF_HOME=/workspace/ +export TZ=America/Los_Angeles +python -m venv venv +git clone https://github.com/pftq/SkyReels-V2_Improvements +mv SkyReels-V2_Improvements SkyReels-V2 +cd /workspace/SkyReels-V2 +source /workspace/venv/bin/activate +pip install torch==2.5.1 +pip install --upgrade wheel setuptools +pip install packaging +pip install -r requirements.txt --no-build-isolation +deactivate + +#always run at the start to use persisting drive +export HF_HOME=/workspace/ +export TZ=America/Los_Angeles +source /workspace/venv/bin/activate +cd /workspace/SkyReels-V2 +```

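The multi-prompt examples above rely on the chunk scheduling that patches 105/106 wire into the diffusion-forcing pipeline: num_frames is split into overlapping windows of base_num_frames, the i-th prompt steers the i-th window, and the last prompt is held once the list runs out (the bounds guard patch 115 adds). A standalone sketch of that arithmetic for the long-video path, assuming the pipeline's 4-frames-per-latent stride; chunk_prompt_schedule is an illustrative name, not a function in the repo:

```
def chunk_prompt_schedule(num_frames, base_num_frames, overlap_history, prompts):
    # Frame counts -> latent counts, assuming the VAE's temporal stride of 4.
    latent_length = (num_frames - 1) // 4 + 1
    base = (base_num_frames - 1) // 4 + 1
    overlap = (overlap_history - 1) // 4 + 1
    # Same n_iter formula as diffusion_forcing_pipeline.py uses in latent space.
    n_iter = 1 + (latent_length - base - 1) // (base - overlap) + 1
    schedule = []
    active = prompts[0]
    for i in range(n_iter):
        if i < len(prompts):  # same bounds guard that patch 115 adds
            active = prompts[i]
        schedule.append((i, active))
    return schedule

# README example: 289 frames in 97-frame chunks overlapping by 17 frames
# yields 4 chunks, with the third prompt held for the final chunk.
for i, p in chunk_prompt_schedule(289, 97, 17, ["first", "second", "third"]):
    print(f"chunk {i}: {p}")
```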
From 852c1045a3f5abd70f769b195dedd7c606f21504 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:29:19 -0700 Subject: [PATCH 109/117] Integrated prompt travel, video input, and partial video outputs from chaojie and fixed merged conflicts. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d9762c3..a3c5fda 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ - Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. -## Additional changes merged from chaojie's fork from https://github.com/SkyworkAI/SkyReels-V2/pull/12 (special thanks for permission to do so): +## Additional changes from chaojie's fork (https://github.com/SkyworkAI/SkyReels-V2/pull/12): - Prompt travel, allow multiple text strings in the --prompt parameter to guide the video differently each chunk of base_num_frames. - Video input via --video parameter, allow continuing/extending from a video. - Partially complete videos will be output as each chunk of base_num_frames completes. In combination with the --video paramater, this lets you effectively resume from a previous render as well as abort mid-render if the videos take a turn you don't like. Extremely useful for saving time and "watching" as the renders complete rather than committing the full time. From c8276f025e993f813672e94183b9b7b84f68af39 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:30:07 -0700 Subject: [PATCH 110/117] Integrated prompt travel, video input, and partial video outputs from chaojie and fixed merged conflicts. From d965fb471306b4b239d7a101e4ab11cbc92dc05d Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:40:56 -0700 Subject: [PATCH 111/117] Integrated prompt travel, video input, and partial video outputs from chaojie and fixed merged conflicts. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a3c5fda..72b5be9 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ - Friendlier filenames with date, seed, cfg, steps, and other details in front. ## Additional changes from chaojie's fork (https://github.com/SkyworkAI/SkyReels-V2/pull/12): -- Prompt travel, allow multiple text strings in the --prompt parameter to guide the video differently each chunk of base_num_frames. +- Prompt travel / multiple prompts, allow multiple text strings in the --prompt parameter to guide the video differently each chunk of base_num_frames. - Video input via --video parameter, allow continuing/extending from a video. - Partially complete videos will be output as each chunk of base_num_frames completes. In combination with the --video paramater, this lets you effectively resume from a previous render as well as abort mid-render if the videos take a turn you don't like. Extremely useful for saving time and "watching" as the renders complete rather than committing the full time. 
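The partial-output bullet above maps to the stitching step in patch 106: each decoded chunk shares overlap_history frames with the tail of the running video, so only the non-overlapping remainder is appended before the intermediate MP4 is written. A minimal sketch of just that concatenation on dummy tensors in the pipeline's (c, f, h, w) layout; append_chunk is a hypothetical helper:

```
import torch

def append_chunk(output_video, chunk, overlap_history):
    # First chunk is kept whole; later chunks drop their overlapping head,
    # mirroring torch.cat([output_video, videos[0][:, overlap_history:]], 1).
    if output_video is None:
        return chunk
    return torch.cat([output_video, chunk[:, overlap_history:]], dim=1)

video = None
for _ in range(3):
    chunk = torch.randn(3, 25, 8, 8)  # dummy chunk in (c, f, h, w) layout
    video = append_chunk(video, chunk, overlap_history=5)
print(video.shape)  # torch.Size([3, 65, 8, 8]): 25 + 20 + 20 frames
```

Because the running tensor is re-decoded into an MP4 after every chunk, aborting mid-render still leaves a playable video, and feeding it back in via --video resumes from the last saved frames.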
From 8a35c19109edacbc81e461b20d50b5dcf5dfc94d Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:45:38 -0700 Subject: [PATCH 112/117] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 72b5be9..22286f8 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ python3 generate_video_df.py \ --offload ``` -Easy install instructions for those using Runpod like me: +Easy install instructions for those like me using Runpod for H100 and multi-gpu: ``` #create once on new pod export HF_HOME=/workspace/ From fac73231a564a27fbd37a96682fcafedc615b6a8 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 14:54:03 -0700 Subject: [PATCH 113/117] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 22286f8..d483a7f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ torchrun --nproc_per_node=${gpu_count} generate_video_df.py \ --resolution 540P \ --ar_step 0 \ --base_num_frames 97 \ - --num_frames 257 \ + --num_frames 289 \ --overlap_history 17 \ --inference_steps 50 \ --guidance_scale 6 \ @@ -37,6 +37,8 @@ torchrun --nproc_per_node=${gpu_count} generate_video_df.py \ "The third thing he does." \ --negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ --addnoise_condition 20 \ + --use_ret_steps \ + --teacache_thresh 0.0 \ --use_usp \ --offload ``` @@ -50,7 +52,7 @@ python3 generate_video_df.py \ --resolution 540P \ --ar_step 0 \ --base_num_frames 97 \ - --num_frames 257 \ + --num_frames 289 \ --overlap_history 17 \ --inference_steps 50 \ --guidance_scale 6 \ @@ -62,6 +64,8 @@ python3 generate_video_df.py \ "The third thing he does." \ --negative_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ --addnoise_condition 20 \ + --use_ret_steps \ + --teacache_thresh 0.0 \ --offload ``` From 524437efb82dc2b6b1bdf12d1db380572f43b736 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 28 Apr 2025 15:14:34 -0700 Subject: [PATCH 114/117] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d483a7f..7083be3 100644 --- a/README.md +++ b/README.md @@ -83,8 +83,8 @@ pip install torch==2.5.1 pip install --upgrade wheel setuptools pip install packaging pip install -r requirements.txt --no-build-isolation -deactivate - +``` +``` #always run at the start to use persisting drive export HF_HOME=/workspace/ export TZ=America/Los_Angeles From c3feb43eaa67306614bd66553c5c7fc07f64022b Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 1 May 2025 07:07:39 -0700 Subject: [PATCH 115/117] Fixed error on number of prompts less than video chunks. 
--- skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py index 31b5878..6da8834 100644 --- a/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py +++ b/skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py @@ -257,7 +257,7 @@ def generate_timestep_matrix( update_mask.append( (new_row != pre_row) & (new_row != num_iterations) - ) # False: no need to update, True: need to update + ) # False: no need to update, True: need to update step_index.append(new_row) step_matrix.append(step_template[new_row]) pre_row = new_row @@ -340,6 +340,7 @@ def __call__( else: prompt_embeds_list.append(self.text_encoder.encode(prompt).to(self.transformer.dtype)) prompt_embeds = prompt_embeds_list[0] + prompt_readable = "" if self.do_classifier_free_guidance: negative_prompt_embeds = self.text_encoder.encode(negative_prompt).to(self.transformer.dtype) @@ -452,7 +453,9 @@ def __call__( prompt_embeds = prompt_embeds_list[i_n_iter] if local_rank == 0: partnum = i_n_iter + 1 - print(f"Generating part {partnum} of {n_iter}: "+prompt[i_n_iter]) # 20250425 pftq + if len(prompt) > i_n_iter: + prompt_readable = prompt[i_n_iter] + print(f"Generating part {partnum} of {n_iter}: "+prompt_readable) # 20250425 pftq if output_video is not None: # i_n_iter !=0 prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device) From 8f995a15077f4255e00e5caeb560a4e734be3a63 Mon Sep 17 00:00:00 2001 From: pftq Date: Thu, 1 May 2025 07:15:16 -0700 Subject: [PATCH 116/117] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7083be3..befa633 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ ## Changes from pftq: -- Added seed synchronization code to allow random seed with multi-GPU (https://github.com/SkyworkAI/SkyReels-V2/issues/24). -- Reduced 20-min+ load time on multi-GPU to ~8min by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs) (https://github.com/SkyworkAI/SkyReels-V2/issues/28). -- Fixed CuSolver error that occasionally comes up in multi-GPU by presetting linear algebra library (https://github.com/SkyworkAI/SkyReels-V2/issues/37). -- Added batch_size parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. -- Added preserve_image_aspect_ratio parameter to allow preserving original image aspect ratio. +- **Added seed synchronization code to allow random seed with multi-GPU** (https://github.com/SkyworkAI/SkyReels-V2/issues/24). +- **Reduced 20-min+ load time on multi-GPU to ~8min** by fixing contention (all GPUs loading models at once). Indirectly also solved CPU RAM spike during multi-GPU (>200GB on 4 GPUs) (https://github.com/SkyworkAI/SkyReels-V2/issues/28). +- **Fixed CuSolver error** that occasionally comes up in multi-GPU by presetting linear algebra library (https://github.com/SkyworkAI/SkyReels-V2/issues/37). +- **Added batch_size** parameter to allow multiple videos to generate without reloading the model, which takes about 20 min on multi-gpu so this saves a lot of time. +- **Added preserve_image_aspect_ratio** parameter to allow preserving original image aspect ratio. 
- Fixed DF script not resize-cropping the image (I2V script does it but DF is missing the code). - Exposed negative_prompt to allow that to be changed/overwritten. - Friendlier filenames with date, seed, cfg, steps, and other details in front. ## Additional changes from chaojie's fork (https://github.com/SkyworkAI/SkyReels-V2/pull/12): -- Prompt travel / multiple prompts, allow multiple text strings in the --prompt parameter to guide the video differently each chunk of base_num_frames. -- Video input via --video parameter, allow continuing/extending from a video. -- Partially complete videos will be output as each chunk of base_num_frames completes. In combination with the --video paramater, this lets you effectively resume from a previous render as well as abort mid-render if the videos take a turn you don't like. Extremely useful for saving time and "watching" as the renders complete rather than committing the full time. +- **Multiple prompts**, allow multiple text strings in the --prompt parameter to guide the video differently each chunk of base_num_frames. +- **Video input** via --video parameter, allow continuing/extending from a video. +- **Partially complete videos saved** as each chunk of base_num_frames completes. In combination with the --video parameter, this lets you effectively resume from a previous render as well as abort mid-render if the videos take a turn you don't like. Extremely useful for saving time and "watching" as the renders complete rather than committing the full time. Example prompts below. If you run into memory/vram issues, you can reduce the base_num_frames while still having the same higher number on num_frames. The point of the DF model is that now the whole video doesn't have to fit in VRAM and can be done in chunks. From 96e6f94c9517e4ddb32b737928c27d7a278112f3 Mon Sep 17 00:00:00 2001 From: pftq Date: Mon, 5 May 2025 02:53:11 -0700 Subject: [PATCH 117/117] Shortened filename with less prompt text. --- generate_video_df.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_video_df.py b/generate_video_df.py index 85e5219..0ef7ba8 100644 --- a/generate_video_df.py +++ b/generate_video_df.py @@ -284,9 +284,9 @@ gpucount = "_"+str(dist.get_world_size())+"xGPU" prompt_summary = "" if type(args.prompt) is list: - prompt_summary = args.prompt[0][:100].replace('/','') + prompt_summary = args.prompt[0][:10].replace('/','') else: - prompt_summary = args.prompt[:100].replace('/','') + prompt_summary = args.prompt[:10].replace('/','') video_out_file = f"{current_time}_skyreels2df_{args.resolution}-{args.num_frames}f_cfg{args.guidance_scale}_steps{args.inference_steps}_seed{args.seed}{gpucount}_{prompt_summary}_{idx}.mp4" with torch.cuda.amp.autocast(dtype=pipe.transformer.dtype), torch.no_grad():
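The seed embedded in the filenames above comes from the rank-0 broadcast that patches 090/091 adjust: one rank draws the seed and every rank receives the same value, keeping multi-GPU sampling deterministic across processes. A condensed sketch assuming an already-initialized NCCL process group; synchronized_seed is an illustrative name, not a function in the repo:

```
import random
import torch
import torch.distributed as dist

def synchronized_seed(seed=-1, force_new=False):
    # Rank 0 decides the seed (random when seed is -1, or when forcing a new
    # one per batch item), then broadcasts it so all ranks sample identically.
    if dist.get_rank() == 0 and (seed == -1 or force_new):
        seed = int(random.randrange(4294967294))
    seed_tensor = torch.tensor(max(seed, 0), dtype=torch.int64, device="cuda")
    dist.broadcast(seed_tensor, src=0)  # non-zero ranks receive rank 0's value
    return int(seed_tensor.item())
```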