Skip to content

Commit 702ca3c

Browse files
authored
Merge pull request #19 from StromNoNo/dev_acceleration
Dev acceleration
2 parents 76a3e71 + d8ff469 commit 702ca3c

File tree

7 files changed

+175
-39
lines changed

7 files changed

+175
-39
lines changed

README.md

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,41 @@ conda activate worldplay
119119
pip install -r requirements.txt
120120
```
121121

122-
### 2. Install Flash Attention (Optional but Recommended)
123-
Install Flash Attention for faster inference and reduced GPU memory consumption:
124-
```bash
125-
pip install flash-attn --no-build-isolation
126-
```
127-
Detailed instructions: [Flash Attention](https://github.com/Dao-AILab/flash-attention)
128-
129-
### 3. Download All Required Models
122+
### 2. Install Attention Libraries (Optional but Recommended)
123+
* Flash Attention:
124+
Install Flash Attention for faster inference and reduced GPU memory consumption:
125+
```bash
126+
pip install flash-attn --no-build-isolation
127+
```
128+
Detailed instructions: [Flash Attention](https://github.com/Dao-AILab/flash-attention)
129+
130+
131+
* SageAttention:
132+
To enable SageAttention for faster inference, install it with the following commands:
133+
```bash
134+
git clone https://github.com/cooper1637/SageAttention.git
135+
cd SageAttention
136+
export EXT_PARALLEL=4 NVCC_APPEND_FLAGS="--threads 8" MAX_JOBS=32 # Optional
137+
python3 setup.py install
138+
```
139+
140+
### 3. Install AngelSlim and DeepGEMM
141+
* AngelSlim:
142+
Install AngelSlim to quantize the transformer.
143+
```bash
144+
pip install angelslim==0.2.2
145+
```
146+
147+
* DeepGEMM:
148+
To enable FP8 GEMM for the transformer, install DeepGEMM with the following commands:
149+
```bash
150+
git clone --recursive git@github.com:deepseek-ai/DeepGEMM.git
151+
cd DeepGEMM
152+
./develop.sh
153+
./install.sh
154+
```
155+
156+
### 4. Download All Required Models
130157

131158
We provide a download script that automatically downloads all required models:
132159

@@ -315,7 +342,6 @@ https://github.com/user-attachments/assets/531bf0ad-1fca-4d76-bb65-84701368926d
315342
https://github.com/user-attachments/assets/f165f409-5a74-4e19-a32c-fc98d92259e1
316343

317344
## 📝 TODO
318-
- [ ] Acceleration & Quantization
319345
- [ ] Open-source training code
320346

321347
## 📚 Citation

generate.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ def pose_to_input(pose_data, latent_num, tps=False):
264264
rotate_one_label = one_hot_to_one_dimension(rotate_one_hot)
265265
action_one_label = trans_one_label * 9 + rotate_one_label
266266

267-
return torch.tensor(w2c_list), torch.tensor(intrinsic_list), action_one_label
267+
return torch.as_tensor(w2c_list), torch.as_tensor(intrinsic_list), action_one_label
268268

269269
def save_video(video, path):
270270
if video.ndim == 5:
@@ -833,6 +833,38 @@ def main():
833833
'Use --with-ui or --with-ui true/1 to enable, --with-ui false/0 to disable'
834834
)
835835

836+
parser.add_argument(
837+
'--use_sageattn', type=str_to_bool, nargs='?', const=True, default=False,
838+
help='Enable sageattn (default: false). '
839+
'Use --use_sageattn or --use_sageattn true/1 to enable, '
840+
'--use_sageattn false/0 to disable'
841+
)
842+
parser.add_argument(
843+
'--sage_blocks_range', type=str, default="0-53",
844+
help='Sageattn blocks range (e.g., 0-5 or 0,1,2,3,4,5)'
845+
)
846+
parser.add_argument(
847+
'--use_vae_parallel', type=str_to_bool, nargs='?', const=True, default=False,
848+
help='Enable vae parallel (default: false). '
849+
'Use --use_vae_parallel or --use_vae_parallel true/1 to enable, '
850+
'--use_vae_parallel false/0 to disable'
851+
)
852+
# fp8 gemm related
853+
parser.add_argument(
854+
'--use_fp8_gemm', type=str_to_bool, nargs='?', const=True, default=False,
855+
help='Enable fp8 gemm for transformer (default: false). '
856+
'Use --use_fp8_gemm or --use_fp8_gemm true/1 to enable, '
857+
'--use_fp8_gemm false/0 to disable'
858+
)
859+
parser.add_argument(
860+
'--quant_type', type=str, default="fp8-per-block",
861+
help='Quantization type for fp8 gemm (e.g., fp8-per-tensor-weight-only, fp8-per-tensor, fp8-per-block)'
862+
)
863+
parser.add_argument(
864+
'--include_patterns', type=str, default="double_blocks",
865+
help='Include patterns for fp8 gemm (default: double_blocks)'
866+
)
867+
836868
args = parser.parse_args()
837869

838870
assert args.image_path is not None

hyvideo/commons/infer_state.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,22 @@
1515
# See the License for the specific language governing permissions and limitations under the License.
1616

1717
from typing import Optional
18-
from dataclasses import dataclass
18+
from dataclasses import dataclass, field
1919

2020
@dataclass
2121
class InferState:
2222
enable_sageattn: bool = False # whether to use SageAttention
2323
sage_blocks_range: Optional[range] = None # block range to use SageAttention
2424
enable_torch_compile: bool = False # whether to use torch compile
2525

26+
# fp8 gemm related
27+
use_fp8_gemm: bool = False # whether to use fp8 gemm
28+
quant_type: str = "fp8-per-block" # fp8 quantization type
29+
include_patterns: list = field(default_factory=lambda: ["double_blocks"]) # include patterns for fp8 gemm
30+
31+
# vae related
32+
use_vae_parallel: bool = False # whether to use vae parallel
33+
2634
__infer_state = None
2735

2836
def parse_range(value):
@@ -34,13 +42,28 @@ def parse_range(value):
3442

3543
def initialize_infer_state(args):
3644
global __infer_state
37-
sage_blocks_range = None
45+
sage_blocks_range = parse_range(args.sage_blocks_range)
3846
# Map CLI argument use_sageattn to internal enable_sageattn field
39-
use_sageattn = False
47+
use_sageattn = getattr(args, 'use_sageattn', False)
48+
49+
# Parse include_patterns from args
50+
include_patterns = getattr(args, 'include_patterns', "double_blocks")
51+
if isinstance(include_patterns, str):
52+
# Split by comma and strip whitespace
53+
include_patterns = [p.strip() for p in include_patterns.split(',') if p.strip()]
54+
4055
__infer_state = InferState(
4156
enable_sageattn = use_sageattn,
4257
sage_blocks_range = sage_blocks_range,
4358
enable_torch_compile = args.enable_torch_compile,
59+
60+
# fp8 gemm related
61+
use_fp8_gemm = args.use_fp8_gemm,
62+
quant_type = args.quant_type,
63+
include_patterns = include_patterns,
64+
65+
# vae related
66+
use_vae_parallel = args.use_vae_parallel,
4467
)
4568
return __infer_state
4669

hyvideo/models/autoencoders/hunyuanvideo_15_vae_w_cache.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,28 @@ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
383383
# compute the shortcut part
384384
shortcut = rearrange(x, "b (r2 r3 c) f h w -> b c f (h r2) (w r3)", r2=2, r3=2)
385385
shortcut = shortcut.repeat_interleave(repeats=self.repeats // 2, dim=1)
386+
elif feat_cache is None and x.shape[2] > 1:
387+
# Multi-frame input without cache: first frame only spatial upsample, rest frames do spatio-temporal upsample
388+
# Separate first frame and remaining frames
389+
h_first = h[:, :, :1, :, :] # first frame
390+
h_rest = h[:, :, 1:, :, :] # remaining frames
391+
x_first = x[:, :, :1, :, :]
392+
x_rest = x[:, :, 1:, :, :]
393+
394+
# First frame: only spatial upsample
395+
h_first = rearrange(h_first, "b (r2 r3 c) f h w -> b c f (h r2) (w r3)", r2=2, r3=2)
396+
h_first = h_first[:, : h_first.shape[1] // 2]
397+
shortcut_first = rearrange(x_first, "b (r2 r3 c) f h w -> b c f (h r2) (w r3)", r2=2, r3=2)
398+
shortcut_first = shortcut_first.repeat_interleave(repeats=self.repeats // 2, dim=1)
399+
out_first = h_first + shortcut_first
400+
401+
# Remaining frames: spatio-temporal upsample
402+
h_rest = rearrange(h_rest, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
403+
shortcut_rest = rearrange(x_rest, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
404+
shortcut_rest = shortcut_rest.repeat_interleave(repeats=self.repeats, dim=1)
405+
out_rest = h_rest + shortcut_rest
406+
407+
return torch.cat([out_first, out_rest], dim=2)
386408
else:
387409
h = rearrange(h, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
388410
# compute the shortcut part
@@ -870,8 +892,9 @@ def tile_parallel_spatial_tiled_decode(self, z: torch.Tensor):
870892
decoded_metas.append(torch.tensor([ri, rj, pad_w, pad_h], device=z.device, dtype=torch.int64))
871893

872894
while len(decoded_tiles) < tiles_per_rank:
895+
T_out = decoded_tiles[0].shape[2] if len(decoded_tiles) > 0 else (T-1)*self.ffactor_temporal+1
873896
zero_tile = torch.zeros(
874-
[1, 3, (T - 1) * self.ffactor_temporal + 1, self.tile_sample_min_size, self.tile_sample_min_size],
897+
[1, 3, T_out, self.tile_sample_min_size, self.tile_sample_min_size],
875898
device=dec.device,
876899
dtype=dec.dtype
877900
)
@@ -891,6 +914,7 @@ def tile_parallel_spatial_tiled_decode(self, z: torch.Tensor):
891914

892915
dist.all_gather(tiles_gather_list, decoded_tiles, group=get_parallel_state().sp_group)
893916
dist.all_gather(metas_gather_list, decoded_metas, group=get_parallel_state().sp_group)
917+
dist.barrier()
894918

895919
if rank != 0:
896920
return torch.empty(0, device=z.device)

hyvideo/models/transformers/modules/attention.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,20 @@ def shrink_head(encoder_state, dim):
156156
t_kv['k_txt'] = encoder_key
157157
t_kv['v_txt'] = encoder_value
158158

159-
encoder_hidden_states = F.scaled_dot_product_attention(
160-
encoder_query,
161-
encoder_key,
162-
encoder_value,
163-
dropout_p=0.0,
164-
is_causal=False
165-
)
159+
infer_state = get_infer_state()
160+
enable_sageattn = (infer_state.enable_sageattn and
161+
block_idx in infer_state.sage_blocks_range)
162+
if enable_sageattn:
163+
from sageattention import sageattn
164+
encoder_hidden_states = sageattn(encoder_query, encoder_key, encoder_value, tensor_layout="HND", is_causal=False)
165+
else:
166+
encoder_hidden_states = F.scaled_dot_product_attention(
167+
encoder_query,
168+
encoder_key,
169+
encoder_value,
170+
dropout_p=0.0,
171+
is_causal=False
172+
)
166173

167174
# transpose back
168175
encoder_hidden_states = encoder_hidden_states.transpose(1, 2) # [B, S, H, D]
@@ -227,7 +234,14 @@ def sequence_parallel_attention_vision(q, k, v,
227234
key = torch.cat([encoder_key, key], dim=2)
228235
value = torch.cat([encoder_value, value], dim=2)
229236

230-
hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
237+
infer_state = get_infer_state()
238+
enable_sageattn = (infer_state.enable_sageattn and
239+
block_idx in infer_state.sage_blocks_range)
240+
if enable_sageattn:
241+
from sageattention import sageattn
242+
hidden_states = sageattn(query, key, value, tensor_layout="HND", is_causal=False)
243+
else:
244+
hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
231245

232246
# transpose back
233247
hidden_states = hidden_states.transpose(1, 2) # [B, S, H, D]

hyvideo/pipelines/worldplay_video_pipeline.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
)
5353
from hyvideo.commons.parallel_states import get_parallel_state
5454

55+
from hyvideo.commons.infer_state import get_infer_state
56+
5557
from hyvideo.models.autoencoders import hunyuanvideo_15_vae_w_cache
5658
from hyvideo.models.text_encoders import PROMPT_TEMPLATE, TextEncoder
5759
from hyvideo.models.text_encoders.byT5 import load_glyph_byT5_v2
@@ -1635,6 +1637,10 @@ def __call__(
16351637
else:
16361638
latents = latents / self.vae.config.scaling_factor
16371639

1640+
if get_infer_state() and get_infer_state().use_vae_parallel:
1641+
self.vae.enable_spatial_tiling()
1642+
self.vae.enable_tile_parallelism()
1643+
16381644

16391645
if return_pre_sr_video or not enable_sr:
16401646
with (torch.autocast(device_type="cuda", dtype=self.vae_dtype, enabled=self.vae_autocast_enabled),
@@ -1767,6 +1773,14 @@ def create_pipeline(cls, pretrained_model_name_or_path, transformer_version, cre
17671773

17681774
transformer = transformer.to(transformer_dtype).to(transformer_init_device)
17691775

1776+
infer_state = get_infer_state()
1777+
if infer_state.use_fp8_gemm:
1778+
from angelslim.compressor.diffusion import DynamicDiTQuantizer
1779+
quant_type = infer_state.quant_type
1780+
include_patterns = infer_state.include_patterns
1781+
quantizer = DynamicDiTQuantizer(quant_type=quant_type, include_patterns=include_patterns)
1782+
quantizer.convert_linear(transformer)
1783+
17701784
vae = hunyuanvideo_15_vae_w_cache.AutoencoderKLConv3D.from_pretrained(
17711785
os.path.join(cached_folder, "vae"),
17721786
torch_dtype=vae_inference_config['dtype']

run.sh

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,22 @@ ENABLE_SR=false # Enable super resolution. When the NUM_FRAMES == 125, you can s
4646

4747
# inference with autoregressive model
4848
# torchrun --nproc_per_node=$N_INFERENCE_GPU generate.py \
49-
# --prompt "$PROMPT" \
50-
# --image_path $IMAGE_PATH \
51-
# --resolution $RESOLUTION \
52-
# --aspect_ratio $ASPECT_RATIO \
53-
# --video_length $NUM_FRAMES \
54-
# --seed $SEED \
55-
# --rewrite $REWRITE \
56-
# --sr $ENABLE_SR --save_pre_sr_video \
57-
# --pose "$POSE" \
58-
# --output_path $OUTPUT_PATH \
59-
# --model_path $MODEL_PATH \
60-
# --action_ckpt $AR_ACTION_MODEL_PATH \
61-
# --few_step false \
62-
# --width $WIDTH \
63-
# --height $HEIGHT \
64-
# --model_type 'ar'
49+
# --prompt "$PROMPT" \
50+
# --image_path $IMAGE_PATH \
51+
# --resolution $RESOLUTION \
52+
# --aspect_ratio $ASPECT_RATIO \
53+
# --video_length $NUM_FRAMES \
54+
# --seed $SEED \
55+
# --rewrite $REWRITE \
56+
# --sr $ENABLE_SR --save_pre_sr_video \
57+
# --pose "$POSE" \
58+
# --output_path $OUTPUT_PATH \
59+
# --model_path $MODEL_PATH \
60+
# --action_ckpt $AR_ACTION_MODEL_PATH \
61+
# --few_step false \
62+
# --width $WIDTH \
63+
# --height $HEIGHT \
64+
# --model_type 'ar'
6565

6666
# inference with autoregressive distilled model
6767
torchrun --nproc_per_node=$N_INFERENCE_GPU generate.py \
@@ -79,4 +79,7 @@ torchrun --nproc_per_node=$N_INFERENCE_GPU generate.py \
7979
--action_ckpt $AR_DISTILL_ACTION_MODEL_PATH \
8080
--few_step true \
8181
--num_inference_steps 4 \
82-
--model_type 'ar'
82+
--model_type 'ar' \
83+
--use_vae_parallel false \
84+
--use_sageattn false \
85+
--use_fp8_gemm false \

0 commit comments

Comments
 (0)