hunyuan video data preprocess

JiayiZhangA · JiayiZhangA · commit 93e8e2bdbb44 · 2025-08-27T19:47:50.000Z
diff --git a/HunyuanVideo b/HunyuanVideo
@@ -0,0 +1 @@
+Subproject commit 4fdf87f3d8dbdca223d7f4511e7d86d3293ffccd
diff --git a/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh b/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Preprocess the raw crush-smol dataset for HunyuanVideo t2v with torchrun.
+GPU_NUM=1 # processes per node passed to torchrun (e.g. 2,4,8)
+MODEL_PATH="hunyuanvideo-community/HunyuanVideo"
+DATASET_PATH="/FastVideo/data/mini_i2v_dataset/crush-smol_raw"
+OUTPUT_DIR="/FastVideo/data/mini_i2v_dataset/crush-smol_processed_t2v_hunyuan/"
+# Expansions are quoted so values containing spaces stay single arguments.
+torchrun --nproc_per_node="$GPU_NUM" \
+    -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
+    --model_path "$MODEL_PATH" \
+    --mode preprocess \
+    --workload_type t2v \
+    --preprocess.dataset_path "$DATASET_PATH" \
+    --preprocess.dataset_output_dir "$OUTPUT_DIR" \
+    --preprocess.preprocess_video_batch_size 2 \
+    --preprocess.dataloader_num_workers 0 \
+    --preprocess.max_height 480 \
+    --preprocess.max_width 832 \
+    --preprocess.num_frames 77 \
+    --preprocess.train_fps 16 \
+    --preprocess.samples_per_file 8 \
+    --preprocess.flush_frequency 8 \
+    --preprocess.video_length_tolerance_range 5
diff --git a/fastvideo/pipelines/preprocess/hunyuan/__init__.py b/fastvideo/pipelines/preprocess/hunyuan/__init__.py
diff --git a/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py b/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py
@@ -0,0 +1,83 @@
+from fastvideo.fastvideo_args import FastVideoArgs
+from fastvideo.pipelines.composed_pipeline_base import ComposedPipelineBase
+from fastvideo.pipelines.preprocess.preprocess_stages import (
+    TextTransformStage, VideoTransformStage)
+from fastvideo.pipelines.stages import (EncodingStage, ImageEncodingStage,
+                                        TextEncodingStage)
+from fastvideo.pipelines.stages.image_encoding import ImageVAEEncodingStage
+
+
+class PreprocessPipelineI2V(ComposedPipelineBase):
+    """Preprocessing pipeline that adds image encoding to the video stages."""
+
+    _required_config_modules = [
+        "image_encoder", "image_processor", "text_encoder", "tokenizer", "vae"
+    ]
+
+    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
+        """Register the text, video, image and VAE stages on this pipeline."""
+        # Narrow the optional config once; the transform stages read from it.
+        cfg = fastvideo_args.preprocess_config
+        assert cfg is not None
+        text_tf = TextTransformStage(
+            cfg_uncondition_drop_rate=cfg.training_cfg_rate, seed=cfg.seed)
+        self.add_stage(stage_name="text_transform_stage", stage=text_tf)
+        prompt_enc = TextEncodingStage(
+            text_encoders=[self.get_module("text_encoder")],
+            tokenizers=[self.get_module("tokenizer")],
+        )
+        self.add_stage(stage_name="prompt_encoding_stage", stage=prompt_enc)
+        video_tf = VideoTransformStage(
+            train_fps=cfg.train_fps,
+            num_frames=cfg.num_frames,
+            max_height=cfg.max_height,
+            max_width=cfg.max_width,
+            do_temporal_sample=cfg.do_temporal_sample,
+        )
+        self.add_stage(stage_name="video_transform_stage", stage=video_tf)
+        # Image encoding is optional: added only when both modules are present.
+        if (self.get_module("image_encoder") is not None
+                and self.get_module("image_processor") is not None):
+            image_enc = ImageEncodingStage(
+                image_encoder=self.get_module("image_encoder"),
+                image_processor=self.get_module("image_processor"),
+            )
+            self.add_stage(stage_name="image_encoding_stage", stage=image_enc)
+        # The same "vae" module backs both the image and video encoding stages.
+        image_vae = ImageVAEEncodingStage(vae=self.get_module("vae"))
+        self.add_stage(stage_name="image_vae_encoding_stage", stage=image_vae)
+        video_enc = EncodingStage(vae=self.get_module("vae"))
+        self.add_stage(stage_name="video_encoding_stage", stage=video_enc)
+
+
+class PreprocessPipelineT2V(ComposedPipelineBase):
+    """Preprocessing pipeline with text, video-transform and VAE stages."""
+
+    _required_config_modules = ["text_encoder", "tokenizer", "vae"]
+
+    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
+        """Register the text, video and VAE stages on this pipeline."""
+        # Narrow the optional config once; the transform stages read from it.
+        cfg = fastvideo_args.preprocess_config
+        assert cfg is not None
+        text_tf = TextTransformStage(
+            cfg_uncondition_drop_rate=cfg.training_cfg_rate, seed=cfg.seed)
+        self.add_stage(stage_name="text_transform_stage", stage=text_tf)
+        prompt_enc = TextEncodingStage(
+            text_encoders=[self.get_module("text_encoder")],
+            tokenizers=[self.get_module("tokenizer")],
+        )
+        self.add_stage(stage_name="prompt_encoding_stage", stage=prompt_enc)
+        video_tf = VideoTransformStage(
+            train_fps=cfg.train_fps,
+            num_frames=cfg.num_frames,
+            max_height=cfg.max_height,
+            max_width=cfg.max_width,
+            do_temporal_sample=cfg.do_temporal_sample,
+        )
+        self.add_stage(stage_name="video_transform_stage", stage=video_tf)
+        video_enc = EncodingStage(vae=self.get_module("vae"))
+        self.add_stage(stage_name="video_encoding_stage", stage=video_enc)
+
+
+EntryClass = [PreprocessPipelineI2V, PreprocessPipelineT2V]
diff --git a/fastvideo/workflow/preprocess/components.py b/fastvideo/workflow/preprocess/components.py
@@ -79,8 +79,10 @@ def __call__(self, batch: dict[str, Any]) -> bool:
 
     def _validate_data_type(self, batch: dict[str, Any]) -> bool:
         """Validate basic validity of data items"""
-        return not (batch["caption"] is None or batch["caption"] == ""
-                    or batch["fps"] is None or batch["fps"] <= 0
-                    or batch["num_frames"] is None or batch["num_frames"] <= 0)
+        # A missing key is treated like None, so the item is rejected.
+        caption, fps = batch.get("caption"), batch.get("fps")
+        num_frames = batch.get("num_frames")
+        return not (caption is None or caption == "" or fps is None
+                    or fps <= 0 or num_frames is None or num_frames <= 0)
 
     def _validate_resolution(self, batch: dict[str, Any]) -> bool:
diff --git a/fastvideo/workflow/preprocess/preprocess_workflow.py b/fastvideo/workflow/preprocess/preprocess_workflow.py
@@ -47,6 +47,8 @@ def register_components(self) -> None:
         training_dataset = build_dataset(preprocess_config,
                                          split="train",
                                          validator=raw_data_validator)
+        # Debug tip: pass load_from_cache_file=False to filter() to check stats
+        training_dataset = training_dataset.filter(raw_data_validator)
         # we do not use collate_fn here because we use iterable-style Dataset
         # and want to keep the original type of the dataset
         training_dataloader = DataLoader(

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Subproject commit 4fdf87f3d8dbdca223d7f4511e7d86d3293ffccd`