
Commit 1028f53

support teacache (#35)

linchunze and steven-kl authored
Co-authored-by: Steven <[email protected]>
1 parent 8ddc4df commit 1028f53

File tree

5 files changed: +154 -7 lines


README.md

Lines changed: 13 additions & 4 deletions
@@ -191,13 +191,16 @@ python3 generate_video_df.py \
 --overlap_history 17 \
 --prompt "A graceful white swan with a curved neck and delicate feathers swimming in a serene lake at dawn, its reflection perfectly mirrored in the still water as mist rises from the surface, with the swan occasionally dipping its head into the water to feed." \
 --addnoise_condition 20 \
---offload
+--offload \
+--teacache \
+--use_ret_steps \
+--teacache_thresh 0.3
 ```
 
 asynchronous generation for 30s video
 ```shell
 model_id=Skywork/SkyReels-V2-DF-14B-540P
-# synchronous inference
+# asynchronous inference
 python3 generate_video_df.py \
 --model_id ${model_id} \
 --resolution 540P \
@@ -232,7 +235,10 @@ python3 generate_video.py \
 --shift 8.0 \
 --fps 24 \
 --prompt "A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface." \
---offload
+--offload \
+--teacache \
+--use_ret_steps \
+--teacache_thresh 0.3
 ```
 > **Note**:
 > - When using an **image-to-video (I2V)** model, you must provide an input image using the `--image ${image_path}` parameter. `--guidance_scale 5.0` and `--shift 3.0` are recommended for I2V models.
@@ -269,7 +275,10 @@ Below are the key parameters you can customize for video generation:
 | --offload | True | Offloads model components to CPU to reduce VRAM usage (recommended) |
 | --use_usp | True | Enables multi-GPU acceleration with xDiT USP |
 | --outdir | ./video_out | Directory where generated videos will be saved |
-| --prompt_enhancer | True | expand the prompt into a more detailed description |
+| --prompt_enhancer | True | Expands the prompt into a more detailed description |
+| --teacache | False | Enables TeaCache for faster inference |
+| --teacache_thresh | 0.2 | Higher values give more speedup at the cost of quality |
+| --use_ret_steps | False | Uses retention steps for TeaCache |
 
 **Diffusion Forcing Additional Parameters**
 | Parameter | Recommended Value | Description |

generate_video.py

Lines changed: 15 additions & 0 deletions
@@ -48,6 +48,16 @@
     default="A serene lake surrounded by towering mountains, with a few swans gracefully gliding across the water and sunlight dancing on the surface.",
 )
 parser.add_argument("--prompt_enhancer", action="store_true")
+parser.add_argument("--teacache", action="store_true")
+parser.add_argument(
+    "--teacache_thresh",
+    type=float,
+    default=0.2,
+    help="Higher values give more speedup at the cost of quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
+parser.add_argument(
+    "--use_ret_steps",
+    action="store_true",
+    help="Using retention steps results in faster generation and better quality.")
 args = parser.parse_args()
 
 args.model_id = download_model(args.model_id)
@@ -116,6 +126,11 @@
         height, width = width, height
         args.image = resizecrop(args.image, height, width)
 
+    if args.teacache:
+        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=args.inference_steps,
+                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
+                                             ckpt_dir=args.model_id)
+
     prompt_input = args.prompt
     if args.prompt_enhancer and image is not None:
         prompt_input = prompt_enhancer(prompt_input)
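
One detail worth noting in the hunk above: `initialize_teacache` receives `num_steps=args.inference_steps`, yet the transformer (see transformer.py below) doubles that value for its retention and cutoff counters. That is consistent with classifier-free guidance running the model twice per scheduler step, once conditioned and once unconditioned. A minimal, hypothetical illustration of that call pattern (the stub and variable names here are invented for this sketch, not repo code):

```python
import numpy as np

def transformer_forward(latents, t, context):
    # Hypothetical stub standing in for the DiT forward pass.
    return 0.9 * latents + 0.01 * t * context

latents = np.zeros((2, 4))
prompt_ctx, null_ctx = np.ones((2, 4)), np.zeros((2, 4))
guidance_scale = 6.0

for t in np.linspace(1.0, 0.0, num=30):                  # 30 scheduler steps ...
    cond = transformer_forward(latents, t, prompt_ctx)   # ... TeaCache cnt even
    uncond = transformer_forward(latents, t, null_ctx)   # ... TeaCache cnt odd
    noise_pred = uncond + guidance_scale * (cond - uncond)
    # a real scheduler would now update `latents` from `noise_pred`
```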

generate_video_df.py

Lines changed: 20 additions & 0 deletions
@@ -39,6 +39,16 @@
     default="A woman in a leather jacket and sunglasses riding a vintage motorcycle through a desert highway at sunset, her hair blowing wildly in the wind as the motorcycle kicks up dust, with the golden sun casting long shadows across the barren landscape.",
 )
 parser.add_argument("--prompt_enhancer", action="store_true")
+parser.add_argument("--teacache", action="store_true")
+parser.add_argument(
+    "--teacache_thresh",
+    type=float,
+    default=0.2,
+    help="Higher values give more speedup at the cost of quality -- 0.1 for 2.0x speedup -- 0.2 for 3.0x speedup")
+parser.add_argument(
+    "--use_ret_steps",
+    action="store_true",
+    help="Using retention steps results in faster generation and better quality.")
 args = parser.parse_args()
 
 args.model_id = download_model(args.model_id)
@@ -117,6 +127,16 @@
 
     if args.causal_attention:
         pipe.transformer.set_ar_attention(args.causal_block_size)
+
+    if args.teacache:
+        if args.ar_step > 0:
+            num_steps = args.inference_steps + (((args.base_num_frames - 1) // 4 + 1) // args.causal_block_size - 1) * args.ar_step
+            print('num_steps:', num_steps)
+        else:
+            num_steps = args.inference_steps
+        pipe.transformer.initialize_teacache(enable_teacache=True, num_steps=num_steps,
+                                             teacache_thresh=args.teacache_thresh, use_ret_steps=args.use_ret_steps,
+                                             ckpt_dir=args.model_id)
 
     print(f"prompt:{prompt_input}")
     print(f"guidance_scale:{guidance_scale}")

skyreels_v2_infer/modules/transformer.py

Lines changed: 103 additions & 3 deletions
@@ -1,6 +1,6 @@
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import math
-
+import numpy as np
 import torch
 import torch.amp as amp
 import torch.nn as nn
@@ -484,6 +484,7 @@ def __init__(
         self.num_frame_per_block = 1
         self.flag_causal_attention = False
         self.block_mask = None
+        self.enable_teacache = False
 
         # embeddings
         self.patch_embedding = nn.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size)
@@ -574,6 +575,50 @@ def attention_mask(b, h, q_idx, kv_idx):
 
         return block_mask
 
+    def initialize_teacache(self, enable_teacache=True, num_steps=25, teacache_thresh=0.15, use_ret_steps=False, ckpt_dir=''):
+        self.enable_teacache = enable_teacache
+        print('using teacache')
+        self.cnt = 0
+        self.num_steps = num_steps
+        self.teacache_thresh = teacache_thresh
+        self.accumulated_rel_l1_distance_even = 0
+        self.accumulated_rel_l1_distance_odd = 0
+        self.previous_e0_even = None
+        self.previous_e0_odd = None
+        self.previous_residual_even = None
+        self.previous_residual_odd = None
+        self.use_ret_steps = use_ret_steps
+        # Rescaling-polynomial coefficients are fitted per model variant and resolution.
+        if "I2V" in ckpt_dir:
+            if use_ret_steps:
+                if '540P' in ckpt_dir:
+                    self.coefficients = [2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01]
+                if '720P' in ckpt_dir:
+                    self.coefficients = [8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02]
+                self.ret_steps = 5 * 2
+                self.cutoff_steps = num_steps * 2
+            else:
+                if '540P' in ckpt_dir:
+                    self.coefficients = [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
+                if '720P' in ckpt_dir:
+                    self.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
+                self.ret_steps = 1 * 2
+                self.cutoff_steps = num_steps * 2 - 2
+        else:
+            if use_ret_steps:
+                if '1.3B' in ckpt_dir:
+                    self.coefficients = [-5.21862437e+04, 9.23041404e+03, -5.28275948e+02, 1.36987616e+01, -4.99875664e-02]
+                if '14B' in ckpt_dir:
+                    self.coefficients = [-3.03318725e+05, 4.90537029e+04, -2.65530556e+03, 5.87365115e+01, -3.15583525e-01]
+                self.ret_steps = 5 * 2
+                self.cutoff_steps = num_steps * 2
+            else:
+                if '1.3B' in ckpt_dir:
+                    self.coefficients = [2.39676752e+03, -1.31110545e+03, 2.01331979e+02, -8.29855975e+00, 1.37887774e-01]
+                if '14B' in ckpt_dir:
+                    self.coefficients = [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
+                self.ret_steps = 1 * 2
+                self.cutoff_steps = num_steps * 2 - 2
+
     def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
         r"""
         Forward pass through the diffusion model
@@ -664,13 +709,68 @@ def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
 
         # arguments
         kwargs = dict(e=e0, grid_sizes=grid_sizes, freqs=self.freqs, context=context, block_mask=self.block_mask)
-        for block in self.blocks:
-            x = block(x, **kwargs)
+        if self.enable_teacache:
+            modulated_inp = e0 if self.use_ret_steps else e
+            # TeaCache: decide whether this pass can reuse the cached residual.
+            if self.cnt % 2 == 0:  # even -> conditional pass
+                self.is_even = True
+                if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
+                    should_calc_even = True
+                    self.accumulated_rel_l1_distance_even = 0
+                else:
+                    rescale_func = np.poly1d(self.coefficients)
+                    self.accumulated_rel_l1_distance_even += rescale_func(((modulated_inp - self.previous_e0_even).abs().mean() / self.previous_e0_even.abs().mean()).cpu().item())
+                    if self.accumulated_rel_l1_distance_even < self.teacache_thresh:
+                        should_calc_even = False
+                    else:
+                        should_calc_even = True
+                        self.accumulated_rel_l1_distance_even = 0
+                self.previous_e0_even = modulated_inp.clone()
+
+            else:  # odd -> unconditional pass
+                self.is_even = False
+                if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
+                    should_calc_odd = True
+                    self.accumulated_rel_l1_distance_odd = 0
+                else:
+                    rescale_func = np.poly1d(self.coefficients)
+                    self.accumulated_rel_l1_distance_odd += rescale_func(((modulated_inp - self.previous_e0_odd).abs().mean() / self.previous_e0_odd.abs().mean()).cpu().item())
+                    if self.accumulated_rel_l1_distance_odd < self.teacache_thresh:
+                        should_calc_odd = False
+                    else:
+                        should_calc_odd = True
+                        self.accumulated_rel_l1_distance_odd = 0
+                self.previous_e0_odd = modulated_inp.clone()
+
+        if self.enable_teacache:
+            if self.is_even:
+                if not should_calc_even:
+                    x += self.previous_residual_even
+                else:
+                    ori_x = x.clone()
+                    for block in self.blocks:
+                        x = block(x, **kwargs)
+                    self.previous_residual_even = x - ori_x
+            else:
+                if not should_calc_odd:
+                    x += self.previous_residual_odd
+                else:
+                    ori_x = x.clone()
+                    for block in self.blocks:
+                        x = block(x, **kwargs)
+                    self.previous_residual_odd = x - ori_x
+        else:
+            for block in self.blocks:
+                x = block(x, **kwargs)
 
         x = self.head(x, e)
 
         # unpatchify
         x = self.unpatchify(x, grid_sizes)
+        self.cnt += 1
+        if self.cnt >= self.num_steps:
+            self.cnt = 0
         return x.float()
 
     def unpatchify(self, x, grid_sizes):
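
Stripped of model details, the logic added above is: measure how much the (modulated) timestep embedding moved since the last pass of the same parity, rescale that relative L1 distance with the fitted polynomial, accumulate it, and skip the transformer blocks (reapplying the cached residual) while the accumulation stays under `teacache_thresh`. A self-contained sketch of one such stream, with toy coefficients and a stub in place of the transformer blocks (all names and values here are illustrative, not the fitted values above):

```python
import numpy as np
import torch

# Toy rescaling polynomial -- the real coefficients are fitted per model/resolution.
rescale_func = np.poly1d([1.0, 0.0])  # identity: rescaled distance == raw distance
teacache_thresh = 0.2

previous_inp = None       # modulated timestep embedding from the previous pass
previous_residual = None  # cached (blocks(x) - x) from the last full computation
accumulated = 0.0

def run_blocks(x):
    # Stub standing in for `for block in self.blocks: x = block(x, **kwargs)`.
    return x + torch.tanh(x)

def teacache_pass(x, modulated_inp):
    global previous_inp, previous_residual, accumulated
    should_calc = True
    if previous_inp is not None and previous_residual is not None:
        rel_l1 = ((modulated_inp - previous_inp).abs().mean()
                  / previous_inp.abs().mean()).item()
        accumulated += rescale_func(rel_l1)
        if accumulated < teacache_thresh:
            should_calc = False  # embedding barely moved: reuse cached residual
    previous_inp = modulated_inp.clone()
    if should_calc:
        accumulated = 0.0
        out = run_blocks(x)
        previous_residual = out - x  # cache residual for cheap later passes
        return out
    return x + previous_residual     # skip all blocks this pass

x = torch.randn(2, 8)
for _ in range(6):
    x = teacache_pass(x, 1.0 + 0.01 * torch.randn(2, 8))
```

The committed implementation keeps two such streams (even/conditional and odd/unconditional) because the embeddings of the two classifier-free-guidance passes drift at different rates.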

skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py

Lines changed: 3 additions & 0 deletions
@@ -328,6 +328,9 @@ def __call__(
                 finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames
                 left_frame_num = latent_length - finished_frame_num
                 base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames)
+                if ar_step > 0 and self.transformer.enable_teacache:
+                    num_steps = num_inference_steps + ((base_num_frames_iter - overlap_history_frames) // causal_block_size - 1) * ar_step
+                    self.transformer.num_steps = num_steps
             else:  # i == 0
                 base_num_frames_iter = base_num_frames
             latent_shape = [16, base_num_frames_iter, latent_height, latent_width]
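
Because the final sliding window of a long video can be shorter than `base_num_frames`, the hunk above recomputes the TeaCache step count for each window. A small sketch of that bookkeeping with assumed window sizes (illustrative values, not derived from this diff):

```python
# Per-window TeaCache step count in the long-video loop (illustrative values).
num_inference_steps = 30
overlap_history_frames = 5   # latent frames reused between windows
causal_block_size = 5
ar_step = 5

for base_num_frames_iter in [25, 25, 15]:  # the last window is shorter
    num_steps = num_inference_steps + (
        (base_num_frames_iter - overlap_history_frames) // causal_block_size - 1
    ) * ar_step
    print(base_num_frames_iter, "->", num_steps)  # 25 -> 45, 25 -> 45, 15 -> 35
```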
