
Commit f877e53

Update hunyuan video t2v preprocess
1 parent 3b39366 commit f877e53

11 files changed (+61, -76 lines)


HunyuanVideo

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 4 additions & 2 deletions
```diff
@@ -2,14 +2,15 @@
 
 GPU_NUM=1 # 2,4,8
 MODEL_PATH="hunyuanvideo-community/HunyuanVideo"
-DATASET_PATH="/FastVideo/data/mini_i2v_dataset/crush-smol_raw"
-OUTPUT_DIR="/FastVideo/data/mini_i2v_dataset/crush-smol_processed_t2v_hunyuan/"
+DATASET_PATH="data/crush-smol"
+OUTPUT_DIR="data/crush-smol_processed_t2v_hunyuan/"
 
 torchrun --nproc_per_node=$GPU_NUM \
     -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
     --model_path $MODEL_PATH \
     --mode preprocess \
     --workload_type t2v \
+    --preprocess.dataset_type merged \
     --preprocess.dataset_path $DATASET_PATH \
     --preprocess.dataset_output_dir $OUTPUT_DIR \
     --preprocess.preprocess_video_batch_size 2 \
@@ -21,3 +22,4 @@ torchrun --nproc_per_node=$GPU_NUM \
     --preprocess.samples_per_file 8 \
     --preprocess.flush_frequency 8 \
     --preprocess.video_length_tolerance_range 5
+
```
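Note the two changes: the dataset now lives under a local `data/crush-smol` folder, and the new `--preprocess.dataset_type merged` flag selects the merged-dataset loader. A hypothetical miniature of the on-disk layout that loader expects (the `videos2caption.json` name and the `cap`/`path`/`fps`/`num_frames` fields come from `fastvideo/workflow/preprocess/components.py` below; the example caption and file name are invented):

```python
import json
import os

# Sketch only: build a minimal "merged" dataset index next to the clips.
os.makedirs("data/crush-smol", exist_ok=True)
index = [
    {"cap": "a toy car crushed by a hydraulic press",  # becomes "caption"
     "path": "0001.mp4",                               # becomes "name"
     "fps": 24, "num_frames": 120},                    # required by the validator
]
with open("data/crush-smol/videos2caption.json", "w") as f:
    json.dump(index, f, indent=2)
```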

fastvideo/configs/configs.py

Lines changed: 0 additions & 23 deletions
```diff
@@ -9,29 +9,6 @@
 logger = init_logger(__name__)
 
 
-class DatasetType(str, Enum):
-    """
-    Enumeration for different dataset types.
-    """
-    HF = "hf"
-    MERGED = "merged"
-
-    @classmethod
-    def from_string(cls, value: str) -> "DatasetType":
-        """Convert string to DatasetType enum."""
-        try:
-            return cls(value.lower())
-        except ValueError:
-            raise ValueError(
-                f"Invalid dataset type: {value}. Must be one of: {', '.join([m.value for m in cls])}"
-            ) from None
-
-    @classmethod
-    def choices(cls) -> list[str]:
-        """Get all available choices as strings for argparse."""
-        return [dataset_type.value for dataset_type in cls]
-
-
 class DatasetType(str, Enum):
     """
     Enumeration for different dataset types.
```
fastvideo/configs/models/encoders/clip.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -74,7 +74,13 @@ class CLIPVisionArchConfig(ImageEncoderArchConfig):
 class CLIPTextConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(
         default_factory=CLIPTextArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 77,
+            "return_tensors": "pt"
+        })
     num_hidden_layers_override: int | None = None
     require_post_norm: bool | None = None
     prefix: str = "clip"
```
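The 77-token `max_length` matches CLIP's fixed text context window; the Llama config below pins 256 tokens instead. The diff does not show where FastVideo consumes `tokenizer_kwargs`, but the usual Hugging Face pattern is to splat them into the tokenizer call, roughly:

```python
from transformers import CLIPTokenizer

# Sketch under the assumption that tokenizer_kwargs are passed through
# unchanged to a Hugging Face tokenizer; the model id is illustrative.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
kwargs = {"padding": "max_length", "truncation": True,
          "max_length": 77, "return_tensors": "pt"}
batch = tokenizer("a toy car crushed by a hydraulic press", **kwargs)
print(batch["input_ids"].shape)  # torch.Size([1, 77])
```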

fastvideo/configs/models/encoders/llama.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -60,5 +60,11 @@ class LlamaArchConfig(TextEncoderArchConfig):
 @dataclass
 class LlamaConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(default_factory=LlamaArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 256,
+            "return_tensors": "pt"
+        })
     prefix: str = "llama"
```

fastvideo/layers/rotary_embedding.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -138,14 +138,14 @@ def forward_native(
         cos, sin = cos_sin.chunk(2, dim=-1)
 
         query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
+        query = query.reshape(num_tokens, -1, self.head_size)
         query_rot = query[..., :self.rotary_dim]
         query_pass = query[..., self.rotary_dim:]
         query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         key_shape = key.shape
-        key = key.view(num_tokens, -1, self.head_size)
+        key = key.reshape(num_tokens, -1, self.head_size)
         key_rot = key[..., :self.rotary_dim]
         key_pass = key[..., self.rotary_dim:]
         key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
```
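The swap from `.view` to `.reshape` is the substantive fix here: `.view` never copies, so it raises a `RuntimeError` when the tensor's strides are incompatible with the requested shape (typical after a transpose), whereas `.reshape` returns a view when it can and silently copies when it must. A standalone illustration:

```python
import torch

x = torch.randn(4, 8).t()  # the transpose leaves x non-contiguous

try:
    x.view(32)             # cannot reinterpret these strides without a copy
except RuntimeError as err:
    print("view failed:", err)

flat = x.reshape(32)       # falls back to a copy when a view is impossible
print(flat.shape)          # torch.Size([32])
```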

fastvideo/models/vaes/hunyuanvae.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -91,7 +91,7 @@ def forward(self,
         key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
 
-        # Perform scaled dot-product attention
+        # Perform scaled dot-product attentionz
         hidden_states = F.scaled_dot_product_attention(query,
                                                        key,
                                                        value,
@@ -361,7 +361,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                                hidden_states.device,
                                                batch_size=batch_size)
             hidden_states = attn(hidden_states,
-                                 attention_mask=attention_mask)
+                                 attention_mask=attention_mask.unsqueeze(1))
             hidden_states = hidden_states.unflatten(
                 1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
 
@@ -385,7 +385,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                                hidden_states.device,
                                                batch_size=batch_size)
             hidden_states = attn(hidden_states,
-                                 attention_mask=attention_mask)
+                                 attention_mask=attention_mask.unsqueeze(1))
             hidden_states = hidden_states.unflatten(
                 1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
```
fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py

Lines changed: 31 additions & 4 deletions
```diff
@@ -9,7 +9,8 @@
 
 class PreprocessPipelineI2V(ComposedPipelineBase):
     _required_config_modules = [
-        "image_encoder", "image_processor", "text_encoder", "tokenizer", "vae"
+        "image_encoder", "image_processor", "text_encoder", "tokenizer",
+        "text_encoder_2", "tokenizer_2", "vae"
     ]
 
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
@@ -51,7 +52,9 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
 
 
 class PreprocessPipelineT2V(ComposedPipelineBase):
-    _required_config_modules = ["text_encoder", "tokenizer", "vae"]
+    _required_config_modules = [
+        "text_encoder", "tokenizer", "text_encoder_2", "tokenizer_2", "vae"
+    ]
 
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
         assert fastvideo_args.preprocess_config is not None
@@ -61,10 +64,34 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
                            preprocess_config.training_cfg_rate,
                            seed=fastvideo_args.preprocess_config.seed,
                        ))
+        # llama_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 256,
+        #     "return_tensors": "pt"
+        # }
+        # clip_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 77,
+        #     "return_tensors": "pt"
+        # }
+        # if len(fastvideo_args.pipeline_config.text_encoder_configs) >= 2:
+        #     fastvideo_args.pipeline_config.text_encoder_configs[0].tokenizer_kwargs = llama_tokenizer_kwargs
+        #     fastvideo_args.pipeline_config.text_encoder_configs[1].tokenizer_kwargs = clip_tokenizer_kwargs
+        text_encoders = [
+            self.get_module("text_encoder"),
+            self.get_module("text_encoder_2")
+        ]
+        tokenizers = [
+            self.get_module("tokenizer"),
+            self.get_module("tokenizer_2")
+        ]
+
         self.add_stage(stage_name="prompt_encoding_stage",
                        stage=TextEncodingStage(
-                           text_encoders=[self.get_module("text_encoder")],
-                           tokenizers=[self.get_module("tokenizer")],
+                           text_encoders=text_encoders,
+                           tokenizers=tokenizers,
                        ))
         self.add_stage(
             stage_name="video_transform_stage",
```
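Both pipelines now require HunyuanVideo's second text encoder (CLIP alongside the Llama-based encoder), and the commented-out kwargs block is dead code: the same defaults were baked into `LlamaConfig` and `CLIPTextConfig` earlier in this commit. For intuition, a hypothetical sketch of the paired-encoder pattern handed to `TextEncodingStage` (this is not the stage's actual implementation):

```python
import torch

def encode_prompt(prompt: str, text_encoders: list, tokenizers: list) -> list:
    """Illustrative only: run one prompt through each tokenizer/encoder pair."""
    embeddings = []
    for tokenizer, encoder in zip(tokenizers, text_encoders):
        tokens = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            embeddings.append(encoder(**tokens).last_hidden_state)
    return embeddings  # e.g. [llama_hidden_states, clip_hidden_states]
```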

fastvideo/workflow/preprocess/components.py

Lines changed: 2 additions & 26 deletions
```diff
@@ -14,10 +14,6 @@
 from datasets import Dataset, Video, load_dataset
 
 from fastvideo.configs.configs import DatasetType, PreprocessConfig
-<<<<<<< HEAD
-from fastvideo.distributed.parallel_state import get_world_rank, get_world_size
-=======
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
 from fastvideo.logger import init_logger
 from fastvideo.pipelines.pipeline_batch_info import PreprocessBatch
 
@@ -82,10 +78,8 @@ def __call__(self, batch: dict[str, Any]) -> bool:
 
     def _validate_data_type(self, batch: dict[str, Any]) -> bool:
         """Validate basic validity of data items"""
-        print("-------------------------------")
-        print(batch)
-        return not (batch["caption"] is None or batch["caption"] == ""
-                    or "fps" not in batch or batch["fps"] is None or batch["fps"] <= 0
+        return not (batch["caption"] is None or batch["caption"] == "" or "fps"
+                    not in batch or batch["fps"] is None or batch["fps"] <= 0
                     or batch["num_frames"] is None or batch["num_frames"] <= 0)
 
     def _validate_resolution(self, batch: dict[str, Any]) -> bool:
@@ -405,19 +399,9 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
     return written_count
 
 
-<<<<<<< HEAD
-def build_dataset(preprocess_config: PreprocessConfig, split: str,
-                  validator: Callable[[dict[str, Any]], bool]) -> Dataset:
-    if preprocess_config.dataset_type == DatasetType.HF:
-        dataset = load_dataset(preprocess_config.dataset_path, split=split)
-        dataset = dataset.filter(validator)
-        dataset = dataset.shard(num_shards=get_world_size(),
-                                index=get_world_rank())
-=======
 def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
     if preprocess_config.dataset_type == DatasetType.HF:
         dataset = load_dataset(preprocess_config.dataset_path, split=split)
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
     elif preprocess_config.dataset_type == DatasetType.MERGED:
         metadata_json_path = os.path.join(preprocess_config.dataset_path,
                                           "videos2caption.json")
@@ -431,14 +415,6 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
         dataset = dataset.rename_column("cap", "caption")
     if "path" in column_names:
         dataset = dataset.rename_column("path", "name")
-<<<<<<< HEAD
-
-    dataset = dataset.filter(validator)
-    dataset = dataset.shard(num_shards=get_world_size(),
-                            index=get_world_rank())
-
-=======
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
     # add video column
     def add_video_column(item: dict[str, Any]) -> dict[str, Any]:
         item["video"] = os.path.join(video_folder, item["name"])
```

fastvideo/workflow/preprocess/preprocess_workflow.py

Lines changed: 0 additions & 11 deletions
```diff
@@ -44,13 +44,7 @@ def register_components(self) -> None:
         self.add_component("raw_data_validator", raw_data_validator)
 
         # training dataset
-<<<<<<< HEAD
-        training_dataset = build_dataset(preprocess_config,
-                                         split="train",
-                                         validator=raw_data_validator)
-=======
         training_dataset = build_dataset(preprocess_config, split="train")
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
         # set load_from_cache_file to False to check filter stats
         training_dataset = training_dataset.filter(raw_data_validator)
         # we do not use collate_fn here because we use iterable-style Dataset
@@ -66,13 +60,8 @@ def register_components(self) -> None:
         # try to load validation dataset if it exists
         try:
             validation_dataset = build_dataset(preprocess_config,
-<<<<<<< HEAD
-                                               split="validation",
-                                               validator=raw_data_validator)
-=======
                                                split="validation")
             validation_dataset = validation_dataset.filter(raw_data_validator)
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
             validation_dataloader = DataLoader(
                 validation_dataset,
                 batch_size=preprocess_config.preprocess_video_batch_size,
```
