
Commit 2ed4d01

Update hunyuan video t2v preprocess

1 parent 3b39366 · commit 2ed4d01

7 files changed (+69, -16 lines)
Lines changed: 4 additions & 2 deletions
@@ -2,14 +2,15 @@

 GPU_NUM=1 # 2,4,8
 MODEL_PATH="hunyuanvideo-community/HunyuanVideo"
-DATASET_PATH="/FastVideo/data/mini_i2v_dataset/crush-smol_raw"
-OUTPUT_DIR="/FastVideo/data/mini_i2v_dataset/crush-smol_processed_t2v_hunyuan/"
+DATASET_PATH="data/crush-smol"
+OUTPUT_DIR="data/crush-smol_processed_t2v_hunyuan/"

 torchrun --nproc_per_node=$GPU_NUM \
     -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
     --model_path $MODEL_PATH \
     --mode preprocess \
     --workload_type t2v \
+    --preprocess.dataset_type merged \
     --preprocess.dataset_path $DATASET_PATH \
     --preprocess.dataset_output_dir $OUTPUT_DIR \
     --preprocess.preprocess_video_batch_size 2 \
@@ -21,3 +22,4 @@ torchrun --nproc_per_node=$GPU_NUM \
     --preprocess.samples_per_file 8 \
     --preprocess.flush_frequency 8 \
     --preprocess.video_length_tolerance_range 5
+
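Since DATASET_PATH and OUTPUT_DIR are now relative, the script presumably expects to be launched from the FastVideo repository root. A minimal pre-flight sketch under that assumption; only the two folder names come from the diff above, the check itself is illustrative:

# Pre-flight sketch, assuming the FastVideo repo root as the working directory.
# Only the folder names come from the diff above; the check itself is illustrative.
from pathlib import Path

dataset_path = Path("data/crush-smol")
output_dir = Path("data/crush-smol_processed_t2v_hunyuan")

assert dataset_path.is_dir(), f"raw dataset not found at {dataset_path.resolve()}"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"preprocessing {dataset_path} -> {output_dir}")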

fastvideo/configs/models/encoders/clip.py

Lines changed: 8 additions & 1 deletion
@@ -74,7 +74,14 @@ class CLIPVisionArchConfig(ImageEncoderArchConfig):
 class CLIPTextConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(
         default_factory=CLIPTextArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 77,
+            "return_tensors": "pt"
+        }
+    )
     num_hidden_layers_override: int | None = None
     require_post_norm: bool | None = None
     prefix: str = "clip"
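These tokenizer_kwargs are standard Hugging Face tokenizer arguments; applied to a CLIP tokenizer they pad or truncate every prompt to exactly 77 tokens and return PyTorch tensors. A small sketch of the effect (the checkpoint name is an assumption, not taken from this commit):

# Illustration of what these tokenizer_kwargs do with a CLIP tokenizer;
# the checkpoint name is only an example.
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
kwargs = {"padding": "max_length", "truncation": True,
          "max_length": 77, "return_tensors": "pt"}

batch = tokenizer("a toy car being crushed by a hydraulic press", **kwargs)
print(batch["input_ids"].shape)       # torch.Size([1, 77])
print(batch["attention_mask"].shape)  # torch.Size([1, 77])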

fastvideo/configs/models/encoders/llama.py

Lines changed: 7 additions & 1 deletion
@@ -60,5 +60,11 @@ class LlamaArchConfig(TextEncoderArchConfig):
 @dataclass
 class LlamaConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(default_factory=LlamaArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 256,
+            "return_tensors": "pt"
+        })
     prefix: str = "llama"
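The Llama text-encoder config mirrors the CLIP one above, with a 256-token budget. A hedged sketch; the checkpoint/subfolder and the pad-token fallback are assumptions, not taken from this commit:

# Same pattern as the CLIP example above, with the longer 256-token budget.
# Checkpoint/subfolder and pad-token handling are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo",
                                          subfolder="tokenizer")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers often lack a pad token

kwargs = {"padding": "max_length", "truncation": True,
          "max_length": 256, "return_tensors": "pt"}
batch = tokenizer("a toy car being crushed by a hydraulic press", **kwargs)
print(batch["input_ids"].shape)  # torch.Size([1, 256])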

fastvideo/layers/rotary_embedding.py

Lines changed: 2 additions & 2 deletions
@@ -138,14 +138,14 @@ def forward_native(
         cos, sin = cos_sin.chunk(2, dim=-1)

         query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
+        query = query.reshape(num_tokens, -1, self.head_size)
         query_rot = query[..., :self.rotary_dim]
         query_pass = query[..., self.rotary_dim:]
         query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

         key_shape = key.shape
-        key = key.view(num_tokens, -1, self.head_size)
+        key = key.reshape(num_tokens, -1, self.head_size)
         key_rot = key[..., :self.rotary_dim]
         key_pass = key[..., self.rotary_dim:]
         key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
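The switch from .view() to .reshape() matters because .view() never copies and therefore raises when the tensor's strides are incompatible with the requested shape (for example after a transpose), while .reshape() falls back to a copy in that case. A standalone PyTorch illustration with arbitrary shapes:

# Why reshape instead of view: merging dims of a non-contiguous tensor makes
# view() raise, while reshape() silently copies. Shapes are arbitrary.
import torch

batch, seq, num_heads, head_size = 3, 2, 4, 8
x = torch.randn(batch, seq, num_heads * head_size)
x_t = x.transpose(0, 1)                     # (2, 3, 32), non-contiguous
num_tokens = x_t.shape[0] * x_t.shape[1]

try:
    x_t.view(num_tokens, -1, head_size)     # merging the first two dims fails here
except RuntimeError as e:
    print("view failed:", e)

out = x_t.reshape(num_tokens, -1, head_size)  # copies when needed
print(out.shape)                              # torch.Size([6, 4, 8])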

fastvideo/models/vaes/hunyuanvae.py

Lines changed: 3 additions & 3 deletions
@@ -91,7 +91,7 @@ def forward(self,
         key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)

-        # Perform scaled dot-product attention
+        # Perform scaled dot-product attentionz
         hidden_states = F.scaled_dot_product_attention(query,
                                                        key,
                                                        value,
@@ -361,7 +361,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                               hidden_states.device,
                                               batch_size=batch_size)
             hidden_states = attn(hidden_states,
-                                 attention_mask=attention_mask)
+                                 attention_mask=attention_mask.unsqueeze(1))
             hidden_states = hidden_states.unflatten(
                 1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)

@@ -385,7 +385,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                               hidden_states.device,
                                               batch_size=batch_size)
             hidden_states = attn(hidden_states,
-                                 attention_mask=attention_mask)
+                                 attention_mask=attention_mask.unsqueeze(1))
             hidden_states = hidden_states.unflatten(
                 1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
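The added unsqueeze(1) gives the mask a broadcastable head dimension: F.scaled_dot_product_attention broadcasts attn_mask against (batch, heads, query_len, key_len), so a mask built as (batch, query_len, key_len) needs the extra dim to line up. A standalone sketch with illustrative shapes:

# Why the mask needs a head dimension for scaled_dot_product_attention.
# Shapes are illustrative.
import torch
import torch.nn.functional as F

batch, heads, seq, dim = 2, 4, 16, 8
q = torch.randn(batch, heads, seq, dim)
k = torch.randn(batch, heads, seq, dim)
v = torch.randn(batch, heads, seq, dim)

mask = torch.ones(batch, seq, seq, dtype=torch.bool)  # per-batch mask, no head dim
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask.unsqueeze(1))
print(out.shape)  # torch.Size([2, 4, 16, 8])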

fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py

Lines changed: 40 additions & 5 deletions
@@ -9,7 +9,13 @@

 class PreprocessPipelineI2V(ComposedPipelineBase):
     _required_config_modules = [
-        "image_encoder", "image_processor", "text_encoder", "tokenizer", "vae"
+        "image_encoder",
+        "image_processor",
+        "text_encoder",
+        "tokenizer",
+        "text_encoder_2",
+        "tokenizer_2",
+        "vae"
     ]

     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
@@ -51,8 +57,13 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):


 class PreprocessPipelineT2V(ComposedPipelineBase):
-    _required_config_modules = ["text_encoder", "tokenizer", "vae"]
-
+    _required_config_modules = [
+        "text_encoder",
+        "tokenizer",
+        "text_encoder_2",
+        "tokenizer_2",
+        "vae"
+    ]
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
         assert fastvideo_args.preprocess_config is not None
         self.add_stage(stage_name="text_transform_stage",
@@ -61,10 +72,34 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
                            preprocess_config.training_cfg_rate,
                            seed=fastvideo_args.preprocess_config.seed,
                        ))
+        # llama_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 256,
+        #     "return_tensors": "pt"
+        # }
+        # clip_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 77,
+        #     "return_tensors": "pt"
+        # }
+        # if len(fastvideo_args.pipeline_config.text_encoder_configs) >= 2:
+        #     fastvideo_args.pipeline_config.text_encoder_configs[0].tokenizer_kwargs = llama_tokenizer_kwargs
+        #     fastvideo_args.pipeline_config.text_encoder_configs[1].tokenizer_kwargs = clip_tokenizer_kwargs
+        text_encoders = [
+            self.get_module("text_encoder"),
+            self.get_module("text_encoder_2")
+        ]
+        tokenizers = [
+            self.get_module("tokenizer"),
+            self.get_module("tokenizer_2")
+        ]
+
         self.add_stage(stage_name="prompt_encoding_stage",
                        stage=TextEncodingStage(
-                           text_encoders=[self.get_module("text_encoder")],
-                           tokenizers=[self.get_module("tokenizer")],
+                           text_encoders=text_encoders,
+                           tokenizers=tokenizers,
                        ))
         self.add_stage(
             stage_name="video_transform_stage",
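With text_encoder_2 and tokenizer_2 registered, the prompt-encoding stage now receives both of HunyuanVideo's text encoders (Llama and CLIP). A rough illustration of the two-encoder pattern; this is not FastVideo's actual TextEncodingStage, and the function name and signature are illustrative:

# Rough illustration of encoding one prompt with two encoder/tokenizer pairs.
# This is NOT FastVideo's TextEncodingStage; names and signature are illustrative.
import torch

def encode_prompt(prompt, text_encoders, tokenizers, per_encoder_kwargs):
    outputs = []
    for encoder, tokenizer, kwargs in zip(text_encoders, tokenizers, per_encoder_kwargs):
        tokens = tokenizer(prompt, **kwargs)  # e.g. max_length=256 (Llama), 77 (CLIP)
        with torch.no_grad():
            out = encoder(input_ids=tokens["input_ids"],
                          attention_mask=tokens.get("attention_mask"))
        outputs.append(out.last_hidden_state)
    return outputs  # one hidden-state tensor per text encoder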

scripts/dataset_preparation/prepare_json_file.py

Lines changed: 5 additions & 2 deletions
@@ -23,8 +23,10 @@ def get_video_info(video_path):
     fps = info.get("video_fps", 0)
     duration = num_frames / fps if fps > 0 else 0

-    # Extract name
-    _, _, videos_dir, video_name = str(video_path).split("/")
+    from pathlib import Path
+    video_path = Path(video_path)
+    videos_dir = video_path.parent.name
+    video_name = video_path.name

     return {
         "path": str(video_name),
@@ -100,6 +102,7 @@ def prepare_dataset_json(folder_path,

     # Save to JSON file
     output_file = folder_path / output_name
+    print(folder_path,output_file,output_name)
     with open(output_file, 'w') as f:
         json.dump(dataset_info, f, indent=2)
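The pathlib rewrite removes the old assumption that a video path has exactly four slash-separated components; parent.name and name extract the same fields at any depth. A quick standalone check (the example paths are illustrative):

# The old split("/") unpacking required exactly four path components;
# pathlib extracts the same fields at any depth. Example paths are illustrative.
from pathlib import Path

for p in ["data/crush-smol/videos/clip_0001.mp4",
          "/abs/prefix/data/crush-smol/videos/clip_0001.mp4"]:
    video_path = Path(p)
    print(video_path.parent.name, video_path.name)  # -> videos clip_0001.mp4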
