From 93e8e2bdbb44470c196516cb6b6765c30471c05f Mon Sep 17 00:00:00 2001
From: ainsley <jzhang2765@wisc.edu>
Date: Fri, 22 Aug 2025 10:37:56 +0000
Subject: [PATCH 1/5] hunyuan video data preprocess

---
 HunyuanVideo                                  |  1 +
 .../crush_smol/preprocess_hunyuan_data_t2v.sh | 23 +++++
 .../pipelines/preprocess/hunyuan/__init__.py  |  0
 .../hunyuan/hunyuan_preprocess_pipelines.py   | 83 +++++++++++++++++++
 fastvideo/workflow/preprocess/components.py   |  4 +-
 .../preprocess/preprocess_workflow.py         |  2 +
 6 files changed, 112 insertions(+), 1 deletion(-)
 create mode 160000 HunyuanVideo
 create mode 100644 examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh
 create mode 100644 fastvideo/pipelines/preprocess/hunyuan/__init__.py
 create mode 100644 fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py

diff --git a/HunyuanVideo b/HunyuanVideo
new file mode 160000
index 000000000..4fdf87f3d
--- /dev/null
+++ b/HunyuanVideo
@@ -0,0 +1 @@
+Subproject commit 4fdf87f3d8dbdca223d7f4511e7d86d3293ffccd
diff --git a/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh b/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh
new file mode 100644
index 000000000..127859b68
--- /dev/null
+++ b/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+GPU_NUM=1 # 2,4,8
+MODEL_PATH="hunyuanvideo-community/HunyuanVideo"
+DATASET_PATH="/FastVideo/data/mini_i2v_dataset/crush-smol_raw"
+OUTPUT_DIR="/FastVideo/data/mini_i2v_dataset/crush-smol_processed_t2v_hunyuan/"
+
+torchrun --nproc_per_node=$GPU_NUM \
+    -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
+    --model_path $MODEL_PATH \
+    --mode preprocess \
+    --workload_type t2v \
+    --preprocess.dataset_path $DATASET_PATH \
+    --preprocess.dataset_output_dir $OUTPUT_DIR \
+    --preprocess.preprocess_video_batch_size 2 \
+    --preprocess.dataloader_num_workers 0 \
+    --preprocess.max_height 480 \
+    --preprocess.max_width 832 \
+    --preprocess.num_frames 77 \
+    --preprocess.train_fps 16 \
+    --preprocess.samples_per_file 8 \
+    --preprocess.flush_frequency 8 \
+    --preprocess.video_length_tolerance_range 5
diff --git a/fastvideo/pipelines/preprocess/hunyuan/__init__.py b/fastvideo/pipelines/preprocess/hunyuan/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py b/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py
new file mode 100644
index 000000000..6b7215d56
--- /dev/null
+++ b/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py
@@ -0,0 +1,83 @@
+from fastvideo.fastvideo_args import FastVideoArgs
+from fastvideo.pipelines.composed_pipeline_base import ComposedPipelineBase
+from fastvideo.pipelines.preprocess.preprocess_stages import (
+    TextTransformStage, VideoTransformStage)
+from fastvideo.pipelines.stages import (EncodingStage, ImageEncodingStage,
+                                        TextEncodingStage)
+from fastvideo.pipelines.stages.image_encoding import ImageVAEEncodingStage
+
+
+class PreprocessPipelineI2V(ComposedPipelineBase):
+    _required_config_modules = [
+        "image_encoder", "image_processor", "text_encoder", "tokenizer", "vae"
+    ]
+
+    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
+        assert fastvideo_args.preprocess_config is not None
+        self.add_stage(stage_name="text_transform_stage",
+                       stage=TextTransformStage(
+                           cfg_uncondition_drop_rate=fastvideo_args.
+                           preprocess_config.training_cfg_rate,
+                           seed=fastvideo_args.preprocess_config.seed,
+                       ))
+        self.add_stage(stage_name="prompt_encoding_stage",
+                       stage=TextEncodingStage(
+                           text_encoders=[self.get_module("text_encoder")],
+                           tokenizers=[self.get_module("tokenizer")],
+                       ))
+        self.add_stage(
+            stage_name="video_transform_stage",
+            stage=VideoTransformStage(
+                train_fps=fastvideo_args.preprocess_config.train_fps,
+                num_frames=fastvideo_args.preprocess_config.num_frames,
+                max_height=fastvideo_args.preprocess_config.max_height,
+                max_width=fastvideo_args.preprocess_config.max_width,
+                do_temporal_sample=fastvideo_args.preprocess_config.
+                do_temporal_sample,
+            ))
+        if (self.get_module("image_encoder") is not None
+                and self.get_module("image_processor") is not None):
+            self.add_stage(
+                stage_name="image_encoding_stage",
+                stage=ImageEncodingStage(
+                    image_encoder=self.get_module("image_encoder"),
+                    image_processor=self.get_module("image_processor"),
+                ))
+        self.add_stage(stage_name="image_vae_encoding_stage",
+                       stage=ImageVAEEncodingStage(
+                           vae=self.get_module("vae"), ))
+        self.add_stage(stage_name="video_encoding_stage",
+                       stage=EncodingStage(vae=self.get_module("vae"), ))
+
+
+class PreprocessPipelineT2V(ComposedPipelineBase):
+    _required_config_modules = ["text_encoder", "tokenizer", "vae"]
+
+    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
+        assert fastvideo_args.preprocess_config is not None
+        self.add_stage(stage_name="text_transform_stage",
+                       stage=TextTransformStage(
+                           cfg_uncondition_drop_rate=fastvideo_args.
+                           preprocess_config.training_cfg_rate,
+                           seed=fastvideo_args.preprocess_config.seed,
+                       ))
+        self.add_stage(stage_name="prompt_encoding_stage",
+                       stage=TextEncodingStage(
+                           text_encoders=[self.get_module("text_encoder")],
+                           tokenizers=[self.get_module("tokenizer")],
+                       ))
+        self.add_stage(
+            stage_name="video_transform_stage",
+            stage=VideoTransformStage(
+                train_fps=fastvideo_args.preprocess_config.train_fps,
+                num_frames=fastvideo_args.preprocess_config.num_frames,
+                max_height=fastvideo_args.preprocess_config.max_height,
+                max_width=fastvideo_args.preprocess_config.max_width,
+                do_temporal_sample=fastvideo_args.preprocess_config.
+                do_temporal_sample,
+            ))
+        self.add_stage(stage_name="video_encoding_stage",
+                       stage=EncodingStage(vae=self.get_module("vae"), ))
+
+
+EntryClass = [PreprocessPipelineI2V, PreprocessPipelineT2V]
diff --git a/fastvideo/workflow/preprocess/components.py b/fastvideo/workflow/preprocess/components.py
index aa9bdbbcd..708020293 100644
--- a/fastvideo/workflow/preprocess/components.py
+++ b/fastvideo/workflow/preprocess/components.py
@@ -79,8 +79,10 @@ def __call__(self, batch: dict[str, Any]) -> bool:
 
     def _validate_data_type(self, batch: dict[str, Any]) -> bool:
         """Validate basic validity of data items"""
+        print("-------------------------------")
+        print(batch)
         return not (batch["caption"] is None or batch["caption"] == ""
-                    or batch["fps"] is None or batch["fps"] <= 0
+                    or "fps" not in batch or batch["fps"] is None or batch["fps"] <= 0
                     or batch["num_frames"] is None or batch["num_frames"] <= 0)
 
     def _validate_resolution(self, batch: dict[str, Any]) -> bool:
diff --git a/fastvideo/workflow/preprocess/preprocess_workflow.py b/fastvideo/workflow/preprocess/preprocess_workflow.py
index 8f83f07db..45a3c52a6 100644
--- a/fastvideo/workflow/preprocess/preprocess_workflow.py
+++ b/fastvideo/workflow/preprocess/preprocess_workflow.py
@@ -47,6 +47,8 @@ def register_components(self) -> None:
         training_dataset = build_dataset(preprocess_config,
                                          split="train",
                                          validator=raw_data_validator)
+        # set load_from_cache_file to False to check filter stats
+        training_dataset = training_dataset.filter(raw_data_validator)
         # we do not use collate_fn here because we use iterable-style Dataset
         # and want to keep the original type of the dataset
         training_dataloader = DataLoader(

From 766b93159a5daeefa8f09ff6e70a9898ca82c62e Mon Sep 17 00:00:00 2001
From: ainsley <jzhang2765@wisc.edu>
Date: Fri, 22 Aug 2025 10:40:45 +0000
Subject: [PATCH 2/5] clean up

---
 HunyuanVideo | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 HunyuanVideo

diff --git a/HunyuanVideo b/HunyuanVideo
deleted file mode 160000
index 4fdf87f3d..000000000
--- a/HunyuanVideo
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4fdf87f3d8dbdca223d7f4511e7d86d3293ffccd

From d30879a075481d538f317f44ec02a3c2d5dd93ce Mon Sep 17 00:00:00 2001
From: Jinzhe Pan <48981407+Eigensystem@users.noreply.github.com>
Date: Sat, 23 Aug 2025 06:29:33 +0800
Subject: [PATCH 3/5] [Feat][Preprocess] support merged dataset (#752)

---
 fastvideo/configs/configs.py                  | 23 +++++++++++++++++++
 fastvideo/workflow/preprocess/components.py   | 12 ++++++++++
 .../preprocess/preprocess_workflow.py         |  9 ++++++++
 3 files changed, 44 insertions(+)

diff --git a/fastvideo/configs/configs.py b/fastvideo/configs/configs.py
index 07a7630b2..d2f2722ad 100644
--- a/fastvideo/configs/configs.py
+++ b/fastvideo/configs/configs.py
@@ -9,6 +9,29 @@
 logger = init_logger(__name__)
 
 
+class DatasetType(str, Enum):
+    """
+    Enumeration for different dataset types.
+    """
+    HF = "hf"
+    MERGED = "merged"
+
+    @classmethod
+    def from_string(cls, value: str) -> "DatasetType":
+        """Convert string to DatasetType enum."""
+        try:
+            return cls(value.lower())
+        except ValueError:
+            raise ValueError(
+                f"Invalid dataset type: {value}. Must be one of: {', '.join([m.value for m in cls])}"
+            ) from None
+
+    @classmethod
+    def choices(cls) -> list[str]:
+        """Get all available choices as strings for argparse."""
+        return [dataset_type.value for dataset_type in cls]
+
+
 class DatasetType(str, Enum):
     """
     Enumeration for different dataset types.
diff --git a/fastvideo/workflow/preprocess/components.py b/fastvideo/workflow/preprocess/components.py
index 708020293..c921c13de 100644
--- a/fastvideo/workflow/preprocess/components.py
+++ b/fastvideo/workflow/preprocess/components.py
@@ -14,7 +14,10 @@
 from datasets import Dataset, Video, load_dataset
 
 from fastvideo.configs.configs import DatasetType, PreprocessConfig
+<<<<<<< HEAD
 from fastvideo.distributed.parallel_state import get_world_rank, get_world_size
+=======
+>>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
 from fastvideo.logger import init_logger
 from fastvideo.pipelines.pipeline_batch_info import PreprocessBatch
 
@@ -402,6 +405,7 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
         return written_count
 
 
+<<<<<<< HEAD
 def build_dataset(preprocess_config: PreprocessConfig, split: str,
                   validator: Callable[[dict[str, Any]], bool]) -> Dataset:
     if preprocess_config.dataset_type == DatasetType.HF:
@@ -409,6 +413,11 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str,
         dataset = dataset.filter(validator)
         dataset = dataset.shard(num_shards=get_world_size(),
                                 index=get_world_rank())
+=======
+def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
+    if preprocess_config.dataset_type == DatasetType.HF:
+        dataset = load_dataset(preprocess_config.dataset_path, split=split)
+>>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
     elif preprocess_config.dataset_type == DatasetType.MERGED:
         metadata_json_path = os.path.join(preprocess_config.dataset_path,
                                           "videos2caption.json")
@@ -422,11 +431,14 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str,
             dataset = dataset.rename_column("cap", "caption")
         if "path" in column_names:
             dataset = dataset.rename_column("path", "name")
+<<<<<<< HEAD
 
         dataset = dataset.filter(validator)
         dataset = dataset.shard(num_shards=get_world_size(),
                                 index=get_world_rank())
 
+=======
+>>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
         # add video column
         def add_video_column(item: dict[str, Any]) -> dict[str, Any]:
             item["video"] = os.path.join(video_folder, item["name"])
diff --git a/fastvideo/workflow/preprocess/preprocess_workflow.py b/fastvideo/workflow/preprocess/preprocess_workflow.py
index 45a3c52a6..05625cc96 100644
--- a/fastvideo/workflow/preprocess/preprocess_workflow.py
+++ b/fastvideo/workflow/preprocess/preprocess_workflow.py
@@ -44,9 +44,13 @@ def register_components(self) -> None:
         self.add_component("raw_data_validator", raw_data_validator)
 
         # training dataset
+<<<<<<< HEAD
         training_dataset = build_dataset(preprocess_config,
                                          split="train",
                                          validator=raw_data_validator)
+=======
+        training_dataset = build_dataset(preprocess_config, split="train")
+>>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
         # set load_from_cache_file to False to check filter stats
         training_dataset = training_dataset.filter(raw_data_validator)
         # we do not use collate_fn here because we use iterable-style Dataset
@@ -62,8 +66,13 @@ def register_components(self) -> None:
         # try to load validation dataset if it exists
         try:
             validation_dataset = build_dataset(preprocess_config,
+<<<<<<< HEAD
                                                split="validation",
                                                validator=raw_data_validator)
+=======
+                                               split="validation")
+            validation_dataset = validation_dataset.filter(raw_data_validator)
+>>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
             validation_dataloader = DataLoader(
                 validation_dataset,
                 batch_size=preprocess_config.preprocess_video_batch_size,

From 3b39366b772eb0f9bcd6891d1091d1b8374d4221 Mon Sep 17 00:00:00 2001
From: ainsley <jzhang2765@wisc.edu>
Date: Fri, 22 Aug 2025 10:37:56 +0000
Subject: [PATCH 4/5] hunyuan video data preprocess

---
 HunyuanVideo | 1 +
 1 file changed, 1 insertion(+)
 create mode 160000 HunyuanVideo

diff --git a/HunyuanVideo b/HunyuanVideo
new file mode 160000
index 000000000..4fdf87f3d
--- /dev/null
+++ b/HunyuanVideo
@@ -0,0 +1 @@
+Subproject commit 4fdf87f3d8dbdca223d7f4511e7d86d3293ffccd

From 94b876719d1ba47b76cc58eefcfe3847e4416a81 Mon Sep 17 00:00:00 2001
From: ainsley <jzhang2765@wisc.edu>
Date: Mon, 25 Aug 2025 05:40:26 +0000
Subject: [PATCH 5/5] Update hunyuan video t2v preprocess

---
 HunyuanVideo                                  |  1 -
 .../preprocess_hunyuan_data_t2v.sh            |  6 ++--
 fastvideo/configs/configs.py                  | 23 ------------
 fastvideo/configs/models/encoders/clip.py     |  8 ++++-
 fastvideo/configs/models/encoders/llama.py    |  8 ++++-
 fastvideo/layers/rotary_embedding.py          |  4 +--
 fastvideo/models/vaes/hunyuanvae.py           |  4 +--
 .../hunyuan/hunyuan_preprocess_pipelines.py   | 35 ++++++++++++++++---
 fastvideo/workflow/preprocess/components.py   | 28 ++-------------
 .../preprocess/preprocess_workflow.py         | 11 ------
 .../dataset_preparation/prepare_json_file.py  |  7 ++--
 11 files changed, 60 insertions(+), 75 deletions(-)
 delete mode 160000 HunyuanVideo
 rename examples/training/finetune/{wan_t2v_1.3B/crush_smol => hunyuan_t2v}/preprocess_hunyuan_data_t2v.sh (83%)
 mode change 100644 => 100755

diff --git a/HunyuanVideo b/HunyuanVideo
deleted file mode 160000
index 4fdf87f3d..000000000
--- a/HunyuanVideo
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4fdf87f3d8dbdca223d7f4511e7d86d3293ffccd
diff --git a/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh b/examples/training/finetune/hunyuan_t2v/preprocess_hunyuan_data_t2v.sh
old mode 100644
new mode 100755
similarity index 83%
rename from examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh
rename to examples/training/finetune/hunyuan_t2v/preprocess_hunyuan_data_t2v.sh
index 127859b68..d0d9487ae
--- a/examples/training/finetune/wan_t2v_1.3B/crush_smol/preprocess_hunyuan_data_t2v.sh
+++ b/examples/training/finetune/hunyuan_t2v/preprocess_hunyuan_data_t2v.sh
@@ -2,14 +2,15 @@
 
 GPU_NUM=1 # 2,4,8
 MODEL_PATH="hunyuanvideo-community/HunyuanVideo"
-DATASET_PATH="/FastVideo/data/mini_i2v_dataset/crush-smol_raw"
-OUTPUT_DIR="/FastVideo/data/mini_i2v_dataset/crush-smol_processed_t2v_hunyuan/"
+DATASET_PATH="data/crush-smol"
+OUTPUT_DIR="data/crush-smol_processed_t2v_hunyuan/"
 
 torchrun --nproc_per_node=$GPU_NUM \
     -m fastvideo.pipelines.preprocess.v1_preprocessing_new \
     --model_path $MODEL_PATH \
     --mode preprocess \
     --workload_type t2v \
+    --preprocess.dataset_type merged \
     --preprocess.dataset_path $DATASET_PATH \
     --preprocess.dataset_output_dir $OUTPUT_DIR \
     --preprocess.preprocess_video_batch_size 2 \
@@ -21,3 +22,4 @@ torchrun --nproc_per_node=$GPU_NUM \
     --preprocess.samples_per_file 8 \
     --preprocess.flush_frequency 8 \
     --preprocess.video_length_tolerance_range 5
+    
diff --git a/fastvideo/configs/configs.py b/fastvideo/configs/configs.py
index d2f2722ad..07a7630b2 100644
--- a/fastvideo/configs/configs.py
+++ b/fastvideo/configs/configs.py
@@ -9,29 +9,6 @@
 logger = init_logger(__name__)
 
 
-class DatasetType(str, Enum):
-    """
-    Enumeration for different dataset types.
-    """
-    HF = "hf"
-    MERGED = "merged"
-
-    @classmethod
-    def from_string(cls, value: str) -> "DatasetType":
-        """Convert string to DatasetType enum."""
-        try:
-            return cls(value.lower())
-        except ValueError:
-            raise ValueError(
-                f"Invalid dataset type: {value}. Must be one of: {', '.join([m.value for m in cls])}"
-            ) from None
-
-    @classmethod
-    def choices(cls) -> list[str]:
-        """Get all available choices as strings for argparse."""
-        return [dataset_type.value for dataset_type in cls]
-
-
 class DatasetType(str, Enum):
     """
     Enumeration for different dataset types.
diff --git a/fastvideo/configs/models/encoders/clip.py b/fastvideo/configs/models/encoders/clip.py
index a7d313a86..b8b4942e1 100644
--- a/fastvideo/configs/models/encoders/clip.py
+++ b/fastvideo/configs/models/encoders/clip.py
@@ -74,7 +74,13 @@ class CLIPVisionArchConfig(ImageEncoderArchConfig):
 class CLIPTextConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(
         default_factory=CLIPTextArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 77,
+            "return_tensors": "pt"
+        })
     num_hidden_layers_override: int | None = None
     require_post_norm: bool | None = None
     prefix: str = "clip"
diff --git a/fastvideo/configs/models/encoders/llama.py b/fastvideo/configs/models/encoders/llama.py
index 53fc21e74..988a83f94 100644
--- a/fastvideo/configs/models/encoders/llama.py
+++ b/fastvideo/configs/models/encoders/llama.py
@@ -60,5 +60,11 @@ class LlamaArchConfig(TextEncoderArchConfig):
 @dataclass
 class LlamaConfig(TextEncoderConfig):
     arch_config: TextEncoderArchConfig = field(default_factory=LlamaArchConfig)
-
+    tokenizer_kwargs: dict = field(
+        default_factory=lambda: {
+            "padding": "max_length",
+            "truncation": True,
+            "max_length": 256,
+            "return_tensors": "pt"
+        })
     prefix: str = "llama"
diff --git a/fastvideo/layers/rotary_embedding.py b/fastvideo/layers/rotary_embedding.py
index 6abe90609..24c9d7f62 100644
--- a/fastvideo/layers/rotary_embedding.py
+++ b/fastvideo/layers/rotary_embedding.py
@@ -138,14 +138,14 @@ def forward_native(
         cos, sin = cos_sin.chunk(2, dim=-1)
 
         query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
+        query = query.reshape(num_tokens, -1, self.head_size)
         query_rot = query[..., :self.rotary_dim]
         query_pass = query[..., self.rotary_dim:]
         query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         key_shape = key.shape
-        key = key.view(num_tokens, -1, self.head_size)
+        key = key.reshape(num_tokens, -1, self.head_size)
         key_rot = key[..., :self.rotary_dim]
         key_pass = key[..., self.rotary_dim:]
         key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
diff --git a/fastvideo/models/vaes/hunyuanvae.py b/fastvideo/models/vaes/hunyuanvae.py
index d0f614ea3..a2eb8d48b 100644
--- a/fastvideo/models/vaes/hunyuanvae.py
+++ b/fastvideo/models/vaes/hunyuanvae.py
@@ -361,7 +361,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                         hidden_states.device,
                         batch_size=batch_size)
                     hidden_states = attn(hidden_states,
-                                         attention_mask=attention_mask)
+                                         attention_mask=attention_mask.unsqueeze(1))
                     hidden_states = hidden_states.unflatten(
                         1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
 
@@ -385,7 +385,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                         hidden_states.device,
                         batch_size=batch_size)
                     hidden_states = attn(hidden_states,
-                                         attention_mask=attention_mask)
+                                         attention_mask=attention_mask.unsqueeze(1))
                     hidden_states = hidden_states.unflatten(
                         1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
 
diff --git a/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py b/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py
index 6b7215d56..7245be8c7 100644
--- a/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py
+++ b/fastvideo/pipelines/preprocess/hunyuan/hunyuan_preprocess_pipelines.py
@@ -9,7 +9,8 @@
 
 class PreprocessPipelineI2V(ComposedPipelineBase):
     _required_config_modules = [
-        "image_encoder", "image_processor", "text_encoder", "tokenizer", "vae"
+        "image_encoder", "image_processor", "text_encoder", "tokenizer",
+        "text_encoder_2", "tokenizer_2", "vae"
     ]
 
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
@@ -51,7 +52,9 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
 
 
 class PreprocessPipelineT2V(ComposedPipelineBase):
-    _required_config_modules = ["text_encoder", "tokenizer", "vae"]
+    _required_config_modules = [
+        "text_encoder", "tokenizer", "text_encoder_2", "tokenizer_2", "vae"
+    ]
 
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
         assert fastvideo_args.preprocess_config is not None
@@ -61,10 +64,34 @@ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
                            preprocess_config.training_cfg_rate,
                            seed=fastvideo_args.preprocess_config.seed,
                        ))
+        # llama_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 256,
+        #     "return_tensors": "pt"
+        # }
+        # clip_tokenizer_kwargs = {
+        #     "padding": "max_length",
+        #     "truncation": True,
+        #     "max_length": 77,
+        #     "return_tensors": "pt"
+        # }
+        # if len(fastvideo_args.pipeline_config.text_encoder_configs) >= 2:
+        #     fastvideo_args.pipeline_config.text_encoder_configs[0].tokenizer_kwargs = llama_tokenizer_kwargs
+        #     fastvideo_args.pipeline_config.text_encoder_configs[1].tokenizer_kwargs = clip_tokenizer_kwargs
+        text_encoders = [
+            self.get_module("text_encoder"),
+            self.get_module("text_encoder_2")
+        ]
+        tokenizers = [
+            self.get_module("tokenizer"),
+            self.get_module("tokenizer_2")
+        ]
+
         self.add_stage(stage_name="prompt_encoding_stage",
                        stage=TextEncodingStage(
-                           text_encoders=[self.get_module("text_encoder")],
-                           tokenizers=[self.get_module("tokenizer")],
+                           text_encoders=text_encoders,
+                           tokenizers=tokenizers,
                        ))
         self.add_stage(
             stage_name="video_transform_stage",
diff --git a/fastvideo/workflow/preprocess/components.py b/fastvideo/workflow/preprocess/components.py
index c921c13de..dd86ba7d3 100644
--- a/fastvideo/workflow/preprocess/components.py
+++ b/fastvideo/workflow/preprocess/components.py
@@ -14,10 +14,6 @@
 from datasets import Dataset, Video, load_dataset
 
 from fastvideo.configs.configs import DatasetType, PreprocessConfig
-<<<<<<< HEAD
-from fastvideo.distributed.parallel_state import get_world_rank, get_world_size
-=======
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
 from fastvideo.logger import init_logger
 from fastvideo.pipelines.pipeline_batch_info import PreprocessBatch
 
@@ -82,10 +78,8 @@ def __call__(self, batch: dict[str, Any]) -> bool:
 
     def _validate_data_type(self, batch: dict[str, Any]) -> bool:
         """Validate basic validity of data items"""
-        print("-------------------------------")
-        print(batch)
-        return not (batch["caption"] is None or batch["caption"] == ""
-                    or "fps" not in batch or batch["fps"] is None or batch["fps"] <= 0
+        return not (batch["caption"] is None or batch["caption"] == "" or "fps"
+                    not in batch or batch["fps"] is None or batch["fps"] <= 0
                     or batch["num_frames"] is None or batch["num_frames"] <= 0)
 
     def _validate_resolution(self, batch: dict[str, Any]) -> bool:
@@ -405,19 +399,9 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
         return written_count
 
 
-<<<<<<< HEAD
-def build_dataset(preprocess_config: PreprocessConfig, split: str,
-                  validator: Callable[[dict[str, Any]], bool]) -> Dataset:
-    if preprocess_config.dataset_type == DatasetType.HF:
-        dataset = load_dataset(preprocess_config.dataset_path, split=split)
-        dataset = dataset.filter(validator)
-        dataset = dataset.shard(num_shards=get_world_size(),
-                                index=get_world_rank())
-=======
 def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
     if preprocess_config.dataset_type == DatasetType.HF:
         dataset = load_dataset(preprocess_config.dataset_path, split=split)
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
     elif preprocess_config.dataset_type == DatasetType.MERGED:
         metadata_json_path = os.path.join(preprocess_config.dataset_path,
                                           "videos2caption.json")
@@ -431,14 +415,6 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
             dataset = dataset.rename_column("cap", "caption")
         if "path" in column_names:
             dataset = dataset.rename_column("path", "name")
-<<<<<<< HEAD
-
-        dataset = dataset.filter(validator)
-        dataset = dataset.shard(num_shards=get_world_size(),
-                                index=get_world_rank())
-
-=======
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
         # add video column
         def add_video_column(item: dict[str, Any]) -> dict[str, Any]:
             item["video"] = os.path.join(video_folder, item["name"])
diff --git a/fastvideo/workflow/preprocess/preprocess_workflow.py b/fastvideo/workflow/preprocess/preprocess_workflow.py
index 05625cc96..0146098ae 100644
--- a/fastvideo/workflow/preprocess/preprocess_workflow.py
+++ b/fastvideo/workflow/preprocess/preprocess_workflow.py
@@ -44,13 +44,7 @@ def register_components(self) -> None:
         self.add_component("raw_data_validator", raw_data_validator)
 
         # training dataset
-<<<<<<< HEAD
-        training_dataset = build_dataset(preprocess_config,
-                                         split="train",
-                                         validator=raw_data_validator)
-=======
         training_dataset = build_dataset(preprocess_config, split="train")
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
         # set load_from_cache_file to False to check filter stats
         training_dataset = training_dataset.filter(raw_data_validator)
         # we do not use collate_fn here because we use iterable-style Dataset
@@ -66,13 +60,8 @@ def register_components(self) -> None:
         # try to load validation dataset if it exists
         try:
             validation_dataset = build_dataset(preprocess_config,
-<<<<<<< HEAD
-                                               split="validation",
-                                               validator=raw_data_validator)
-=======
                                                split="validation")
             validation_dataset = validation_dataset.filter(raw_data_validator)
->>>>>>> 15df36ab ([Feat][Preprocess] support merged dataset (#752))
             validation_dataloader = DataLoader(
                 validation_dataset,
                 batch_size=preprocess_config.preprocess_video_batch_size,
diff --git a/scripts/dataset_preparation/prepare_json_file.py b/scripts/dataset_preparation/prepare_json_file.py
index b263b7053..7b67d7951 100644
--- a/scripts/dataset_preparation/prepare_json_file.py
+++ b/scripts/dataset_preparation/prepare_json_file.py
@@ -23,8 +23,10 @@ def get_video_info(video_path):
     fps = info.get("video_fps", 0)
     duration = num_frames / fps if fps > 0 else 0
 
-    # Extract name
-    _, _, videos_dir, video_name = str(video_path).split("/")
+    from pathlib import Path
+    video_path = Path(video_path)
+    videos_dir = video_path.parent.name
+    video_name = video_path.name
 
     return {
         "path": str(video_name),
@@ -100,6 +102,7 @@ def prepare_dataset_json(folder_path,
 
     # Save to JSON file
     output_file = folder_path / output_name
+    print(folder_path,output_file,output_name)
     with open(output_file, 'w') as f:
         json.dump(dataset_info, f, indent=2)