[Feat][Preprocess] support merged dataset (#752)

Eigensystem · web-flow · commit 3ab6470d1a71 · 2025-08-22T15:29:33.000-07:00
diff --git a/examples/training/finetune/wan_i2v_14B_480p/crush_smol/preprocess_wan_data_i2v_new.sh b/examples/training/finetune/wan_i2v_14B_480p/crush_smol/preprocess_wan_data_i2v_new.sh
@@ -2,15 +2,15 @@
 
 GPU_NUM=1 # 2,4,8
 MODEL_PATH="Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
-# MODEL_PATH="/home/eigensystem/.cache/huggingface/hub/models--Wan-AI--Wan2.1-I2V-14B-480P-Diffusers/snapshots/b184e23a8a16b20f108f727c902e769e873ffc73/"
-DATASET_PATH="data/crush-smol-test/"
+DATASET_PATH="data/crush-smol/"
 OUTPUT_DIR="data/crush-smol_processed_i2v/"
 
 torchrun --nproc_per_node=$GPU_NUM \
     -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
     --model_path $MODEL_PATH \
     --mode preprocess \
     --workload_type i2v \
+    --preprocess.dataset_type merged \
     --preprocess.dataset_path $DATASET_PATH \
     --preprocess.dataset_output_dir $OUTPUT_DIR \
     --preprocess.preprocess_video_batch_size 2 \
diff --git a/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_wan_data_t2v_new.sh b/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_wan_data_t2v_new.sh
@@ -2,14 +2,15 @@
 
 GPU_NUM=1 # 2,4,8
 MODEL_PATH="Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
-DATASET_PATH="data/crush-smol-test/"
+DATASET_PATH="data/crush-smol/"
 OUTPUT_DIR="data/crush-smol_processed_t2v/"
 
 torchrun --nproc_per_node=$GPU_NUM \
     -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
     --model_path $MODEL_PATH \
     --mode preprocess \
     --workload_type t2v \
+    --preprocess.dataset_type merged \
     --preprocess.dataset_path $DATASET_PATH \
     --preprocess.dataset_output_dir $OUTPUT_DIR \
     --preprocess.preprocess_video_batch_size 2 \
diff --git a/fastvideo/configs/configs.py b/fastvideo/configs/configs.py
@@ -1,17 +1,42 @@
 import dataclasses
+from enum import Enum
 from typing import Any, Optional
 
 from fastvideo.configs.utils import update_config_from_args
 from fastvideo.utils import FlexibleArgumentParser, StoreBoolean
 
 
+class DatasetType(str, Enum):
+    """Kinds of dataset layout that preprocessing accepts."""
+    HF = "hf"
+    MERGED = "merged"
+
+    @classmethod
+    def choices(cls) -> list[str]:
+        """Return every member's string value (used as argparse choices)."""
+        return [member.value for member in cls]
+
+    @classmethod
+    def from_string(cls, value: str) -> "DatasetType":
+        """Map ``value`` to its member, ignoring case; ValueError if unknown."""
+        normalized = value.lower()
+        try:
+            return cls(normalized)
+        except ValueError:
+            allowed = ", ".join(cls.choices())
+            raise ValueError(
+                f"Invalid dataset type: {value}. Must be one of: {allowed}"
+            ) from None
+
+
 @dataclasses.dataclass
 class PreprocessConfig:
     """Configuration for preprocessing operations."""
 
     # Model and dataset configuration
     model_path: str = ""
     dataset_path: str = ""
+    dataset_type: DatasetType = DatasetType.HF
     dataset_output_dir: str = "./output"
 
     # Dataloader configuration
@@ -54,6 +79,12 @@ def add_cli_args(parser: FlexibleArgumentParser,
             type=str,
             default=PreprocessConfig.dataset_path,
             help="Path to the dataset directory for preprocessing")
+        preprocess_args.add_argument(
+            f"--{prefix_with_dot}dataset-type",
+            type=str,
+            choices=DatasetType.choices(),
+            default=PreprocessConfig.dataset_type.value,
+            help="Type of the dataset")
         preprocess_args.add_argument(
             f"--{prefix_with_dot}dataset-output-dir",
             type=str,
@@ -136,6 +167,10 @@ def add_cli_args(parser: FlexibleArgumentParser,
     def from_kwargs(cls, kwargs: dict[str,
                                       Any]) -> Optional["PreprocessConfig"]:
         """Create PreprocessConfig from keyword arguments."""
+        if 'dataset_type' in kwargs and isinstance(kwargs['dataset_type'], str):
+            kwargs['dataset_type'] = DatasetType.from_string(
+                kwargs['dataset_type'])
+
         preprocess_config = cls()
         if not update_config_from_args(
                 preprocess_config, kwargs, prefix="preprocess", pop_args=True):
diff --git a/fastvideo/workflow/preprocess/components.py b/fastvideo/workflow/preprocess/components.py
@@ -11,7 +11,9 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
 import torch
+from datasets import Dataset, Video, load_dataset
 
+from fastvideo.configs.configs import DatasetType, PreprocessConfig
 from fastvideo.logger import init_logger
 from fastvideo.pipelines.pipeline_batch_info import PreprocessBatch
 
@@ -395,3 +397,51 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
             written_count += len(chunk_table)
 
         return written_count
+
+
+def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
+    """Load one split of the raw dataset selected by ``preprocess_config``.
+
+    ``DatasetType.HF`` passes ``dataset_path`` straight to ``load_dataset``.
+    ``DatasetType.MERGED`` reads ``<dataset_path>/videos2caption.json``,
+    renames the ``cap``/``path`` columns to ``caption``/``name`` when present,
+    and adds a ``video`` column pointing into ``<dataset_path>/videos``.
+
+    Args:
+        preprocess_config: Supplies ``dataset_type`` and ``dataset_path``.
+        split: Split name forwarded to ``load_dataset``.
+
+    Returns:
+        The loaded dataset.
+
+    Raises:
+        ValueError: If ``dataset_type`` is not a known ``DatasetType``, or a
+            merged manifest has neither a ``name`` nor a ``path`` column.
+    """
+    dataset_type = preprocess_config.dataset_type
+    if dataset_type == DatasetType.HF:
+        return load_dataset(preprocess_config.dataset_path, split=split)
+    if dataset_type != DatasetType.MERGED:
+        raise ValueError(f"Invalid dataset type: {dataset_type}")
+
+    dataset_root = preprocess_config.dataset_path
+    metadata_json_path = os.path.join(dataset_root, "videos2caption.json")
+    video_folder = os.path.join(dataset_root, "videos")
+    dataset = load_dataset("json", data_files=metadata_json_path, split=split)
+
+    # rename columns to match the schema
+    for old_name, new_name in (("cap", "caption"), ("path", "name")):
+        if old_name in dataset.column_names:
+            dataset = dataset.rename_column(old_name, new_name)
+    # fail early with a readable message instead of a KeyError inside map()
+    if "name" not in dataset.column_names:
+        raise ValueError(
+            f"{metadata_json_path} must provide a 'name' or 'path' column, "
+            f"found: {dataset.column_names}")
+
+    def add_video_column(item: dict[str, Any]) -> dict[str, Any]:
+        item["video"] = os.path.join(video_folder, item["name"])
+        return item
+
+    dataset = dataset.map(add_video_column)
+    return dataset.cast_column("video", Video())
diff --git a/fastvideo/workflow/preprocess/preprocess_workflow.py b/fastvideo/workflow/preprocess/preprocess_workflow.py
@@ -1,7 +1,6 @@
 import os
 from typing import cast
 
-from datasets import load_dataset
 from torch.utils.data import DataLoader
 
 from fastvideo.configs.configs import PreprocessConfig
@@ -11,7 +10,8 @@
 from fastvideo.logger import init_logger
 from fastvideo.pipelines.pipeline_registry import PipelineType
 from fastvideo.workflow.preprocess.components import (
-    ParquetDatasetSaver, PreprocessingDataValidator, VideoForwardBatchBuilder)
+    ParquetDatasetSaver, PreprocessingDataValidator, VideoForwardBatchBuilder,
+    build_dataset)
 from fastvideo.workflow.preprocess.record_schema import (
     basic_t2v_record_creator, i2v_record_creator)
 from fastvideo.workflow.workflow_base import WorkflowBase
@@ -43,8 +43,7 @@ def register_components(self) -> None:
         self.add_component("raw_data_validator", raw_data_validator)
 
         # training dataset
-        training_dataset = load_dataset(preprocess_config.dataset_path,
-                                        split="train")
+        training_dataset = build_dataset(preprocess_config, split="train")
         # set load_from_cache_file to False to check filter stats
         training_dataset = training_dataset.filter(raw_data_validator)
         # we do not use collate_fn here because we use iterable-style Dataset
@@ -59,8 +58,8 @@ def register_components(self) -> None:
 
         # try to load validation dataset if it exists
         try:
-            validation_dataset = load_dataset(preprocess_config.dataset_path,
-                                              split="validation")
+            validation_dataset = build_dataset(preprocess_config,
+                                               split="validation")
             validation_dataset = validation_dataset.filter(raw_data_validator)
             validation_dataloader = DataLoader(
                 validation_dataset,