
Commit 7ab3253

SolitaryThinker, JerryZhou54, and BrianChen1129 authored

[Training] [1/n] Add latent datasets (#438)

Co-authored-by: Wei Zhou <[email protected]>
Co-authored-by: JerryZhou54 <[email protected]>
Co-authored-by: BrianChen1129 <[email protected]>
1 parent 6ef8fcb commit 7ab3253

File tree

7 files changed: +1093 -0 lines changed


fastvideo/v1/dataset/__init__.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
from torchvision import transforms
from torchvision.transforms import Lambda
from transformers import AutoTokenizer

from fastvideo.v1.dataset.t2v_datasets import T2V_dataset
from fastvideo.v1.dataset.transform import (CenterCropResizeVideo, Normalize255,
                                            TemporalRandomCrop)


def getdataset(args, start_idx=0) -> T2V_dataset:
    temporal_sample = TemporalRandomCrop(args.num_frames)  # 16 x
    norm_fun = Lambda(lambda x: 2.0 * x - 1.0)
    resize_topcrop = [
        CenterCropResizeVideo((args.max_height, args.max_width), top_crop=True),
    ]
    resize = [
        CenterCropResizeVideo((args.max_height, args.max_width)),
    ]
    transform = transforms.Compose([
        # Normalize255(),
        *resize,
    ])
    transform_topcrop = transforms.Compose([
        Normalize255(),
        *resize_topcrop,
        norm_fun,
    ])
    # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl", cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name,
                                              cache_dir=args.cache_dir)
    if args.dataset == "t2v":
        return T2V_dataset(args,
                           transform=transform,
                           temporal_sample=temporal_sample,
                           tokenizer=tokenizer,
                           transform_topcrop=transform_topcrop,
                           start_idx=start_idx)

    raise NotImplementedError(args.dataset)
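
For orientation, a minimal sketch (not part of the commit) of how getdataset might be driven. The argument object and its values are hypothetical; the attribute names simply match what the function reads above, and T2V_dataset will additionally expect its own arguments (data paths, etc.).

from types import SimpleNamespace

from fastvideo.v1.dataset import getdataset

# Hypothetical args; attribute names follow what getdataset reads above.
args = SimpleNamespace(
    num_frames=16,
    max_height=480,
    max_width=848,
    text_encoder_name="google/t5-v1_1-xxl",  # assumed; any HF tokenizer path works
    cache_dir=None,
    dataset="t2v",
    # ...plus whatever T2V_dataset itself requires (data paths, etc.)
)
dataset = getdataset(args, start_idx=0)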
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# schema.py
"""
Unified data schema and format for saving and loading image/video data after
preprocessing.

It uses the Apache Arrow in-memory format, which can be consumed by modern
data frameworks that handle Parquet or Lance files.
"""

import pyarrow as pa

pyarrow_schema = pa.schema([
    pa.field("id", pa.string()),
    # --- Image/Video VAE latents ---
    # Tensors are stored as raw bytes with shape and dtype info for loading
    pa.field("vae_latent_bytes", pa.binary()),
    # e.g., [C, T, H, W] or [C, H, W]
    pa.field("vae_latent_shape", pa.list_(pa.int64())),
    # e.g., 'float32'
    pa.field("vae_latent_dtype", pa.string()),
    # --- Text encoder output tensor ---
    # Tensors are stored as raw bytes with shape and dtype info for loading
    pa.field("text_embedding_bytes", pa.binary()),
    # e.g., [SeqLen, Dim]
    pa.field("text_embedding_shape", pa.list_(pa.int64())),
    # e.g., 'bfloat16' or 'float32'
    pa.field("text_embedding_dtype", pa.string()),
    pa.field("text_attention_mask_bytes", pa.binary()),
    # e.g., [SeqLen]
    pa.field("text_attention_mask_shape", pa.list_(pa.int64())),
    # e.g., 'bool' or 'int8'
    pa.field("text_attention_mask_dtype", pa.string()),
    # --- Metadata ---
    pa.field("file_name", pa.string()),
    pa.field("caption", pa.string()),
    pa.field("media_type", pa.string()),  # 'image' or 'video'
    pa.field("width", pa.int64()),
    pa.field("height", pa.int64()),
    # --- Video-specific (can be null/default for images) ---
    # Number of frames processed (e.g., 1 for an image, N for a video)
    pa.field("num_frames", pa.int64()),
    pa.field("duration_sec", pa.float64()),
    pa.field("fps", pa.float64()),
])
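
To make the bytes/shape/dtype convention concrete, a hedged round-trip sketch (not part of the commit): it packs one tensor into a record matching pyarrow_schema, writes it to Parquet, and reads it back. File names and values are illustrative; note that 'bfloat16' has no NumPy dtype, so bfloat16 embeddings would need torch-level decoding.

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch


def tensor_to_fields(t: torch.Tensor):
    # Serialize a tensor as raw bytes plus the shape/dtype needed to restore it.
    arr = t.cpu().numpy()
    return arr.tobytes(), list(arr.shape), str(arr.dtype)


latent = torch.randn(4, 8, 30, 53)  # e.g., [C, T, H, W]
lat_bytes, lat_shape, lat_dtype = tensor_to_fields(latent)

record = {
    "id": "sample-0000",
    "vae_latent_bytes": lat_bytes,
    "vae_latent_shape": lat_shape,
    "vae_latent_dtype": lat_dtype,
    # Text fields left empty in this sketch.
    "text_embedding_bytes": b"",
    "text_embedding_shape": [],
    "text_embedding_dtype": "",
    "text_attention_mask_bytes": b"",
    "text_attention_mask_shape": [],
    "text_attention_mask_dtype": "",
    "file_name": "clip0.mp4",
    "caption": "a cat",
    "media_type": "video",
    "width": 424,
    "height": 240,
    "num_frames": 29,
    "duration_sec": 1.2,
    "fps": 24.0,
}
table = pa.Table.from_pylist([record], schema=pyarrow_schema)
pq.write_table(table, "sample.parquet")

# Reading back: rebuild the tensor from bytes + shape + dtype.
row = pq.read_table("sample.parquet").to_pylist()[0]
arr = np.frombuffer(row["vae_latent_bytes"],
                    dtype=row["vae_latent_dtype"]).reshape(row["vae_latent_shape"])
restored = torch.from_numpy(arr.copy())  # copy: frombuffer returns a read-only view
assert restored.shape == latent.shape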
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
import json
import os
import random

import torch
from torch.utils.data import Dataset


class LatentDataset(Dataset):

    def __init__(
        self,
        json_path,
        num_latent_t,
        cfg_rate,
    ) -> None:
        # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
        self.json_path = json_path
        self.cfg_rate = cfg_rate
        self.dataset_dir_path = os.path.dirname(json_path)
        self.video_dir = os.path.join(self.dataset_dir_path, "video")
        self.latent_dir = os.path.join(self.dataset_dir_path, "latent")
        self.prompt_embed_dir = os.path.join(self.dataset_dir_path,
                                             "prompt_embed")
        self.prompt_attention_mask_dir = os.path.join(self.dataset_dir_path,
                                                      "prompt_attention_mask")
        with open(self.json_path) as f:
            self.data_anno = json.load(f)
        # json.load(f) already keeps the order
        # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
        self.num_latent_t = num_latent_t

        self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)

        self.uncond_prompt_mask = torch.zeros(256).bool()
        self.lengths = [
            data_item.get("length", 1) for data_item in self.data_anno
        ]

    def __getitem__(self, idx):
        latent_file = self.data_anno[idx]["latent_path"]
        prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
        prompt_attention_mask_file = self.data_anno[idx][
            "prompt_attention_mask"]
        # load
        latent = torch.load(
            os.path.join(self.latent_dir, latent_file),
            map_location="cpu",
            weights_only=True,
        )
        latent = latent.squeeze(0)[:, -self.num_latent_t:]
        if random.random() < self.cfg_rate:
            prompt_embed = self.uncond_prompt_embed
            prompt_attention_mask = self.uncond_prompt_mask
        else:
            prompt_embed = torch.load(
                os.path.join(self.prompt_embed_dir, prompt_embed_file),
                map_location="cpu",
                weights_only=True,
            )
            prompt_attention_mask = torch.load(
                os.path.join(self.prompt_attention_mask_dir,
                             prompt_attention_mask_file),
                map_location="cpu",
                weights_only=True,
            )
        return latent, prompt_embed, prompt_attention_mask

    def __len__(self):
        return len(self.data_anno)


def latent_collate_function(batch):
    # return latent, prompt, latent_attn_mask, text_attn_mask
    # latent_attn_mask: b t h w
    # text_attn_mask: b 1 l
    # check the latent/prompt sizes and apply padding & attention masks
    latents, prompt_embeds, prompt_attention_masks = zip(*batch)
    # calculate max shape
    max_t = max([latent.shape[1] for latent in latents])
    max_h = max([latent.shape[2] for latent in latents])
    max_w = max([latent.shape[3] for latent in latents])

    # padding: F.pad consumes the pad tuple from the last dim backward,
    # so for a [C, T, H, W] latent the pairs apply to W, then H, then T
    latent_list: list[torch.Tensor] = [
        torch.nn.functional.pad(
            latent,
            (
                0,
                max_w - latent.shape[3],
                0,
                max_h - latent.shape[2],
                0,
                max_t - latent.shape[1],
            ),
        ) for latent in latents
    ]
    # attn mask
    latent_attn_mask = torch.ones(len(latent_list), max_t, max_h, max_w)
    # set to 0 if padding; use the unpadded latents for the true extents
    for i, latent in enumerate(latents):
        latent_attn_mask[i, latent.shape[1]:, :, :] = 0
        latent_attn_mask[i, :, latent.shape[2]:, :] = 0
        latent_attn_mask[i, :, :, latent.shape[3]:] = 0

    prompt_embeds = torch.stack(prompt_embeds, dim=0)
    prompt_attention_masks = torch.stack(prompt_attention_masks, dim=0)
    latents = torch.stack(latent_list, dim=0)
    return latents, prompt_embeds, latent_attn_mask, prompt_attention_masks


if __name__ == "__main__":
    dataset = LatentDataset("data/Mochi-Synthetic-Data/merge.txt",
                            num_latent_t=28,
                            cfg_rate=0.0)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=2,
                                             shuffle=False,
                                             collate_fn=latent_collate_function)
    for latent, prompt_embed, latent_attn_mask, prompt_attention_mask in dataloader:
        print(
            latent.shape,
            prompt_embed.shape,
            latent_attn_mask.shape,
            prompt_attention_mask.shape,
        )
        import pdb
        pdb.set_trace()
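
A quick sanity check for the collate function (again a sketch, not in the commit): two dummy [C, T, H, W] latents with unequal temporal length come back padded to a common shape, with the mask zeroed over the padded frames.

import torch

a = torch.randn(4, 6, 8, 8)  # C=4, T=6
b = torch.randn(4, 4, 8, 8)  # shorter clip: T=4
emb = torch.zeros(256, 4096)
mask = torch.zeros(256).bool()

latents, embeds, latent_mask, text_masks = latent_collate_function(
    [(a, emb, mask), (b, emb, mask)])
print(latents.shape)             # torch.Size([2, 4, 6, 8, 8])
print(latent_mask.shape)         # torch.Size([2, 6, 8, 8])
print(latent_mask[1, 4:].sum())  # tensor(0.) -- padded frames are masked out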
