Commit e55fa6e
[Preprocess] I2V dataset (#473)
1 parent 61b6dde commit e55fa6e

11 files changed: +534 −239 lines changed

fastvideo/data_preprocess/preprocess.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -11,7 +11,8 @@
 from fastvideo.v1.fastvideo_args import FastVideoArgs
 from fastvideo.v1.configs.models.vaes import WanVAEConfig
 from fastvideo import PipelineConfig
-from fastvideo.v1.pipelines.preprocess_pipeline import PreprocessPipeline
+from fastvideo.v1.pipelines.preprocess.preprocess_pipeline_i2v import PreprocessPipeline_I2V
+from fastvideo.v1.pipelines.preprocess.preprocess_pipeline_t2v import PreprocessPipeline_T2V

 logger = init_logger(__name__)

@@ -42,7 +43,7 @@ def main(args):
     )
     fastvideo_args.check_fastvideo_args()
     fastvideo_args.device = torch.device(f"cuda:{local_rank}")
-
+    PreprocessPipeline = PreprocessPipeline_I2V if args.preprocess_task == "i2v" else PreprocessPipeline_T2V
     pipeline = PreprocessPipeline(args.model_path, fastvideo_args)
     pipeline.forward(batch=None, fastvideo_args=fastvideo_args, args=args)

@@ -91,6 +92,7 @@ def main(args):
     parser.add_argument("--group_frame", action="store_true")  # TODO
     parser.add_argument("--group_resolution", action="store_true")  # TODO
     parser.add_argument("--dataset", default="t2v")
+    parser.add_argument("--preprocess_task", type=str, default="t2v")
    parser.add_argument("--train_fps", type=int, default=30)
     parser.add_argument("--use_image_num", type=int, default=0)
     parser.add_argument("--text_max_length", type=int, default=256)
```

fastvideo/v1/dataset/dataloader/schema.py

Lines changed: 39 additions & 1 deletion

```diff
@@ -9,7 +9,7 @@

 import pyarrow as pa

-pyarrow_schema = pa.schema([
+pyarrow_schema_i2v = pa.schema([
     pa.field("id", pa.string()),
     # --- Image/Video VAE latents ---
     # Tensors are stored as raw bytes with shape and dtype info for loading
@@ -30,6 +30,10 @@
     pa.field("text_attention_mask_shape", pa.list_(pa.int64())),
     # e.g., 'bool' or 'int8'
     pa.field("text_attention_mask_dtype", pa.string()),
+    # --- I2V ---
+    pa.field("clip_feature_bytes", pa.binary()),
+    pa.field("clip_feature_shape", pa.list_(pa.int64())),
+    pa.field("clip_feature_dtype", pa.string()),
     # --- Metadata ---
     pa.field("file_name", pa.string()),
     pa.field("caption", pa.string()),
@@ -42,3 +46,37 @@
     pa.field("duration_sec", pa.float64()),
     pa.field("fps", pa.float64()),
 ])
+
+pyarrow_schema_t2v = pa.schema([
+    pa.field("id", pa.string()),
+    # --- Image/Video VAE latents ---
+    # Tensors are stored as raw bytes with shape and dtype info for loading
+    pa.field("vae_latent_bytes", pa.binary()),
+    # e.g., [C, T, H, W] or [C, H, W]
+    pa.field("vae_latent_shape", pa.list_(pa.int64())),
+    # e.g., 'float32'
+    pa.field("vae_latent_dtype", pa.string()),
+    # --- Text encoder output tensor ---
+    # Tensors are stored as raw bytes with shape and dtype info for loading
+    pa.field("text_embedding_bytes", pa.binary()),
+    # e.g., [SeqLen, Dim]
+    pa.field("text_embedding_shape", pa.list_(pa.int64())),
+    # e.g., 'bfloat16' or 'float32'
+    pa.field("text_embedding_dtype", pa.string()),
+    pa.field("text_attention_mask_bytes", pa.binary()),
+    # e.g., [SeqLen]
+    pa.field("text_attention_mask_shape", pa.list_(pa.int64())),
+    # e.g., 'bool' or 'int8'
+    pa.field("text_attention_mask_dtype", pa.string()),
+    # --- Metadata ---
+    pa.field("file_name", pa.string()),
+    pa.field("caption", pa.string()),
+    pa.field("media_type", pa.string()),  # 'image' or 'video'
+    pa.field("width", pa.int64()),
+    pa.field("height", pa.int64()),
+    # --- Video-specific (can be null/default for images) ---
+    # Number of frames processed (e.g., 1 for image, N for video)
+    pa.field("num_frames", pa.int64()),
+    pa.field("duration_sec", pa.float64()),
+    pa.field("fps", pa.float64()),
+])
```
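Every tensor column in both schemas is stored as the same bytes/shape/dtype triple, so a reader can rebuild the array with no side information. A minimal round-trip sketch in numpy (the latent here is random stand-in data):

```python
import numpy as np

latent = np.random.rand(16, 4, 32, 32).astype(np.float32)  # e.g. [C, T, H, W]

# Writing: flatten to raw bytes plus shape and dtype metadata.
record = {
    "vae_latent_bytes": latent.tobytes(),
    "vae_latent_shape": list(latent.shape),
    "vae_latent_dtype": str(latent.dtype),
}

# Reading: rebuild the array from the triple.
restored = np.frombuffer(record["vae_latent_bytes"],
                         dtype=record["vae_latent_dtype"]).reshape(
                             record["vae_latent_shape"])
assert np.array_equal(latent, restored)
```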
Lines changed: 136 additions & 0 deletions (new file)

```python
import argparse
import json
import os
import time
from multiprocessing import Pool, cpu_count
from pathlib import Path

import torchvision
from tqdm import tqdm


def get_video_info(video_path):
    """Get video information using torchvision."""
    # Read video tensor (T, C, H, W)
    video_tensor, _, info = torchvision.io.read_video(str(video_path),
                                                      output_format="TCHW",
                                                      pts_unit="sec")

    num_frames = video_tensor.shape[0]
    height = video_tensor.shape[2]
    width = video_tensor.shape[3]
    fps = info.get("video_fps", 0)
    duration = num_frames / fps if fps > 0 else 0

    # Extract name (assumes a path of the form root/dataset/videos/name.mp4)
    _, _, videos_dir, video_name = str(video_path).split("/")

    return {
        "path": str(video_name),
        "resolution": {
            "width": width,
            "height": height
        },
        "size": os.path.getsize(video_path),
        "fps": fps,
        "duration": duration,
        "num_frames": num_frames
    }


def prepare_dataset_json(folder_path,
                         output_name="videos2caption.json",
                         num_workers=None) -> None:
    """Prepare dataset information from a folder containing videos and prompt.txt."""
    folder_path = Path(folder_path)

    # Read prompt file
    prompt_file = folder_path / "prompt.txt"
    if not prompt_file.exists():
        raise FileNotFoundError(f"prompt.txt not found in {folder_path}")

    with open(prompt_file) as f:
        prompts = [line.strip() for line in f.readlines() if line.strip()]

    # Read videos file
    videos_file = folder_path / "videos.txt"
    if not videos_file.exists():
        raise FileNotFoundError(f"videos.txt not found in {folder_path}")

    with open(videos_file) as f:
        video_paths = [line.strip() for line in f.readlines() if line.strip()]

    if len(prompts) != len(video_paths):
        raise ValueError(
            f"Number of prompts ({len(prompts)}) does not match number of videos ({len(video_paths)})"
        )

    # Prepare arguments for multiprocessing
    process_args = [folder_path / video_path for video_path in video_paths]

    # Determine number of workers
    if num_workers is None:
        num_workers = max(1, cpu_count() - 1)  # Leave one CPU free

    # Process videos in parallel
    start_time = time.time()
    with Pool(num_workers) as pool:
        results = list(
            tqdm(pool.imap(get_video_info, process_args),
                 total=len(process_args),
                 desc="Processing videos",
                 unit="video"))

    # Combine results with prompts
    dataset_info = []
    for result, prompt in zip(results, prompts):
        result["cap"] = [prompt]
        dataset_info.append(result)

    # Calculate total processing time
    total_time = time.time() - start_time
    total_videos = len(dataset_info)
    avg_time_per_video = total_time / total_videos if total_videos > 0 else 0

    print("\nProcessing completed:")
    print(f"Total videos processed: {total_videos}")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Average time per video: {avg_time_per_video:.2f} seconds")

    # Save to JSON file
    output_file = folder_path / output_name
    with open(output_file, 'w') as f:
        json.dump(dataset_info, f, indent=2)

    # Create merge.txt
    merge_file = folder_path / "merge.txt"
    with open(merge_file, 'w') as f:
        f.write(f"{folder_path}/videos,{output_file}\n")

    print(f"Dataset information saved to {output_file}")
    print(f"Merge file created at {merge_file}")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description='Prepare video dataset information in JSON format')
    parser.add_argument(
        '--folder',
        type=str,
        required=True,
        help='Path to the folder containing videos and prompt.txt')
    parser.add_argument(
        '--output',
        type=str,
        default='videos2caption.json',
        help='Name of the output JSON file (default: videos2caption.json)')
    parser.add_argument('--workers',
                        type=int,
                        default=32,
                        help='Number of worker processes (default: 32)')
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    prepare_dataset_json(args.folder, args.output, args.workers)
```
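A minimal usage sketch. The module name `prepare_dataset_json.py` and the `data/my_videos` root are hypothetical; note that `get_video_info` splits paths on `/` into exactly four parts, so the clips should sit two levels below the working directory (e.g. `data/my_videos/videos/clip.mp4`):

```python
# Hypothetical module name; adjust the import to wherever the script lives.
from prepare_dataset_json import prepare_dataset_json

# Expected layout under the root:
#   data/my_videos/prompt.txt    one caption per line
#   data/my_videos/videos.txt    one relative clip path per line, same order
#   data/my_videos/videos/*.mp4  the clips referenced by videos.txt
prepare_dataset_json("data/my_videos",
                     output_name="videos2caption.json",
                     num_workers=8)
# Writes data/my_videos/videos2caption.json and a merge.txt pairing the videos
# folder with that JSON for the preprocessing entry point.
```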

fastvideo/v1/dataset/t2v_datasets.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -138,6 +138,7 @@ def get_video(self, idx) -> dict:
         video_path = dataset_prog.cap_list[idx]["path"]
         assert os.path.exists(video_path), f"file {video_path} do not exist!"
         frame_indices = dataset_prog.cap_list[idx]["sample_frame_index"]
+
         torchvision_video, _, metadata = torchvision.io.read_video(
             video_path, output_format="TCHW")
         video = torchvision_video[frame_indices]
@@ -270,7 +271,8 @@ def define_frame_index(self, cap_list) -> tuple[list[dict], list[int]]:
                 cnt_resolution_mismatch += 1
                 continue

-            # import ipdb;ipdb.set_trace()
+            # if path == 'finetrainers/3dgs-dissolve/videos/1.mp4':
+            #     from IPython import embed; embed()
             i["num_frames"] = math.ceil(fps * duration)
             # max 5.0 and min 1.0 are just thresholds to filter some videos which have suitable duration.
             if i["num_frames"] / fps > self.video_length_tolerance_range * (
```

fastvideo/v1/fastvideo_args.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -92,7 +92,7 @@ class FastVideoArgs:
     lora_target_names: Optional[List[
         str]] = None  # can restrict list of layers to adapt, e.g. ["q_proj"]

-    # STA (Spatial-Temporal Attention) parameters
+    # STA parameters
     mask_strategy_file_path: Optional[str] = None
     enable_torch_compile: bool = False
```

fastvideo/v1/pipelines/preprocess/preprocess_pipeline_i2v.py

Lines changed: 92 additions & 0 deletions (new file)

```python
# SPDX-License-Identifier: Apache-2.0
"""
I2V Data Preprocessing pipeline implementation.

This module contains an implementation of the I2V Data Preprocessing pipeline
using the modular pipeline architecture.
"""
from typing import Any, Dict, List, Optional

import numpy as np
import torch
from PIL import Image

from fastvideo.v1.dataset.dataloader.schema import pyarrow_schema_i2v
from fastvideo.v1.fastvideo_args import FastVideoArgs
from fastvideo.v1.forward_context import set_forward_context
from fastvideo.v1.pipelines.preprocess_pipeline_base import (
    BasePreprocessPipeline)


class PreprocessPipeline_I2V(BasePreprocessPipeline):
    """I2V preprocessing pipeline implementation."""

    _required_config_modules = [
        "text_encoder", "tokenizer", "vae", "image_encoder", "image_processor"
    ]

    def get_schema_fields(self) -> List[str]:
        """Get the schema fields for I2V pipeline."""
        return [f.name for f in pyarrow_schema_i2v]

    def get_extra_features(self, valid_data: Dict[str, Any],
                           fastvideo_args: FastVideoArgs) -> Dict[str, Any]:
        """Get CLIP features from the first frame of each video."""
        first_frame = valid_data["pixel_values"][:, :, 0, :, :].permute(
            0, 2, 3, 1)  # (B, C, T, H, W) -> (B, H, W, C)

        processed_images = []
        for frame in first_frame:
            frame_pil = Image.fromarray(frame.cpu().numpy().astype(np.uint8))
            processed_img = self.get_module("image_processor")(
                images=frame_pil, return_tensors="pt")
            processed_images.append(processed_img)

        # Get CLIP features
        pixel_values = torch.cat(
            [img['pixel_values'] for img in processed_images],
            dim=0).to(fastvideo_args.device)
        with torch.no_grad():
            image_inputs = {'pixel_values': pixel_values}
            with set_forward_context(current_timestep=0, attn_metadata=None):
                clip_features = self.get_module("image_encoder")(
                    **image_inputs)
            clip_features = clip_features.last_hidden_state

        return {"clip_feature": clip_features}

    def create_record(
            self,
            video_name: str,
            vae_latent: np.ndarray,
            text_embedding: np.ndarray,
            text_attention_mask: np.ndarray,
            valid_data: Optional[Dict[str, Any]],
            idx: int,
            extra_features: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Create a record for the Parquet dataset with CLIP features."""
        record = super().create_record(video_name=video_name,
                                       vae_latent=vae_latent,
                                       text_embedding=text_embedding,
                                       text_attention_mask=text_attention_mask,
                                       valid_data=valid_data,
                                       idx=idx,
                                       extra_features=extra_features)

        if extra_features and "clip_feature" in extra_features:
            clip_feature = extra_features["clip_feature"]
            record.update({
                "clip_feature_bytes": clip_feature.tobytes(),
                "clip_feature_shape": list(clip_feature.shape),
                "clip_feature_dtype": str(clip_feature.dtype),
            })
        else:
            record.update({
                "clip_feature_bytes": b"",
                "clip_feature_shape": [],
                "clip_feature_dtype": "",
            })

        return record


EntryClass = PreprocessPipeline_I2V
```
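Downstream consumers can recover the stored CLIP feature from a Parquet row by reversing the bytes/shape/dtype encoding. A minimal sketch, assuming a hypothetical `output.parquet` written with `pyarrow_schema_i2v` and a numpy-compatible dtype such as float32:

```python
import numpy as np
import pyarrow.parquet as pq
import torch

table = pq.read_table("output.parquet")  # hypothetical output file
row = table.slice(0, 1).to_pylist()[0]

# Empty bytes mark records written without a CLIP feature (the fallback
# branch in create_record above).
if row["clip_feature_bytes"]:
    clip = np.frombuffer(row["clip_feature_bytes"],
                         dtype=row["clip_feature_dtype"]).reshape(
                             row["clip_feature_shape"])
    clip = torch.from_numpy(clip.copy())  # copy: frombuffer is read-only
```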
fastvideo/v1/pipelines/preprocess/preprocess_pipeline_t2v.py

Lines changed: 23 additions & 0 deletions (new file)

```python
# SPDX-License-Identifier: Apache-2.0
"""
T2V Data Preprocessing pipeline implementation.

This module contains an implementation of the T2V Data Preprocessing pipeline
using the modular pipeline architecture.
"""
from fastvideo.v1.dataset.dataloader.schema import pyarrow_schema_t2v
from fastvideo.v1.pipelines.preprocess_pipeline_base import (
    BasePreprocessPipeline)


class PreprocessPipeline_T2V(BasePreprocessPipeline):
    """T2V preprocessing pipeline implementation."""

    _required_config_modules = ["text_encoder", "tokenizer", "vae"]

    def get_schema_fields(self):
        """Get the schema fields for T2V pipeline."""
        return [f.name for f in pyarrow_schema_t2v]


EntryClass = PreprocessPipeline_T2V
```
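The two pipelines differ only in the modules they require and the schema they emit. A quick sanity check of the schema relationship, grounded in the schema diff above:

```python
from fastvideo.v1.dataset.dataloader.schema import (pyarrow_schema_i2v,
                                                    pyarrow_schema_t2v)

i2v_fields = {f.name for f in pyarrow_schema_i2v}
t2v_fields = {f.name for f in pyarrow_schema_t2v}

# The I2V schema is the T2V schema plus the three CLIP-feature columns.
assert i2v_fields - t2v_fields == {
    "clip_feature_bytes", "clip_feature_shape", "clip_feature_dtype"
}
assert t2v_fields <= i2v_fields
```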
