Commit feb2e26
Improve dataset preparation support + multiresolution prep (#39)

* update
* make style
* renormalize correctly
* apply suggestions from review
* apply suggestions from review
* update
1 parent a6c246c commit feb2e26

5 files changed: +419 -355 lines

README.md

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ from diffusers import export_to_video
 pipe = CogVideoXPipeline.from_pretrained(
     "THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16
 ).to("cuda")
-+ pipe.load_lora_weights("my-awesome-name/my-awesome-lora", adapter_name=["cogvideox-lora"])
++ pipe.load_lora_weights("my-awesome-name/my-awesome-lora", adapter_name="cogvideox-lora")
 + pipe.set_adapters(["cogvideox-lora"], [1.0])
 
 video = pipe("<my-awesome-prompt>").frames[0]
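This hunk fixes the README snippet: in diffusers, `load_lora_weights` takes `adapter_name` as a single string, while `set_adapters` is the call that takes lists of adapter names and weights. A minimal end-to-end sketch with the corrected call (the LoRA repo id and output path are placeholders, as in the README):

import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16
).to("cuda")

# adapter_name is a single string; set_adapters takes lists of names and weights.
pipe.load_lora_weights("my-awesome-name/my-awesome-lora", adapter_name="cogvideox-lora")
pipe.set_adapters(["cogvideox-lora"], [1.0])

video = pipe("<my-awesome-prompt>").frames[0]
export_to_video(video, "output.mp4", fps=8)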
@@ -429,7 +429,7 @@ With `train_batch_size = 4`:
 - [ ] Make scripts compatible with FSDP
 - [x] Make scripts compatible with DeepSpeed
 - [ ] vLLM-powered captioning script
-- [ ] Multi-resolution/frame support in `prepare_dataset.py`
+- [x] Multi-resolution/frame support in `prepare_dataset.py`
 - [ ] Analyzing traces for potential speedups and removing as many syncs as possible
 - [ ] Support for QLoRA (priority), and other types of high usage LoRAs methods
 - [x] Test scripts with memory-efficient optimizer from bitsandbytes

README_zh.md

Lines changed: 1 addition & 1 deletion
@@ -440,7 +440,7 @@ diffusers (this branch adds LoRA loading support for CogVideoX image-to-video
 - [ ] Make scripts compatible with FSDP
 - [x] Make scripts compatible with DeepSpeed
 - [ ] vLLM-based captioning script
-- [ ] Multi-resolution/frame support in `prepare_dataset.py`
+- [x] Multi-resolution/frame support in `prepare_dataset.py`
 - [ ] Analyze performance bottlenecks and reduce sync operations as much as possible
 - [ ] Support QLoRA (priority) and other widely used LoRA methods
 - [x] Test scripts with the memory-efficient optimizer from bitsandbytes

training/cogvideox_image_to_video_lora.py

Lines changed: 3 additions & 1 deletion
@@ -200,6 +200,7 @@ def log_validation(
 
     return videos
 
+
 class CollateFunction:
     def __init__(self, weight_dtype, load_tensors):
         self.weight_dtype = weight_dtype
@@ -223,6 +224,7 @@ def __call__(self, data):
             "prompts": prompts,
         }
 
+
 def main(args):
     if args.report_to == "wandb" and args.hub_token is not None:
         raise ValueError(
@@ -647,7 +649,7 @@ def load_model_hook(models, input_dir):
 
         # Encode videos
         if not args.load_tensors:
-            images = images.permute(0, 2, 1, 3, 4) # [B, C, F, H, W]
+            images = images.permute(0, 2, 1, 3, 4)  # [B, C, F, H, W]
             image_noise_sigma = torch.normal(
                 mean=-3.0, std=0.5, size=(images.size(0),), device=accelerator.device, dtype=weight_dtype
             )
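For context, the last hunk samples per-sample log-sigmas from N(-3.0, 0.5) for image conditioning augmentation. A sketch of the surrounding step, under the assumption (not shown in this diff) that the log-sigmas are exponentiated and broadcast over the remaining dimensions before noising the conditioning images:

import torch

def add_image_noise(images: torch.Tensor) -> torch.Tensor:
    # images: [B, C, F, H, W], as produced by the permute above.
    # Log-sigmas ~ N(mean=-3.0, std=0.5), matching the hunk; the exp and
    # broadcast below are assumptions about the unshown surrounding code.
    log_sigma = torch.normal(
        mean=-3.0, std=0.5, size=(images.size(0),), device=images.device, dtype=images.dtype
    )
    sigma = torch.exp(log_sigma)[:, None, None, None, None]
    return images + torch.randn_like(images) * sigma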

training/dataset.py

Lines changed: 38 additions & 5 deletions
@@ -78,15 +78,16 @@ def __init__(
             self.video_paths,
         ) = self._load_dataset_from_csv()
 
-        self.num_videos = len(self.video_paths)
-        if self.num_videos != len(self.prompts):
+        if len(self.video_paths) != len(self.prompts):
             raise ValueError(
                 f"Expected length of prompts and videos to be the same but found {len(self.prompts)=} and {len(self.video_paths)=}. Please ensure that the number of caption prompts and videos match in your dataset."
             )
 
         self.video_transforms = transforms.Compose(
             [
-                transforms.RandomHorizontalFlip(random_flip) if random_flip else transforms.Lambda(self.identity_transform),
+                transforms.RandomHorizontalFlip(random_flip)
+                if random_flip
+                else transforms.Lambda(self.identity_transform),
                 transforms.Lambda(self.scale_transform),
                 transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
             ]
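Taken together with `scale_transform` in the next hunk (which divides by 255), the `Normalize(mean=0.5, std=0.5)` step maps uint8-range frames from [0, 255] into [-1, 1]. A standalone sketch of the equivalent pipeline:

import torch
from torchvision import transforms

# Sketch of the equivalent normalization: [0, 255] -> [0, 1] -> [-1, 1].
video_transforms = transforms.Compose(
    [
        transforms.Lambda(lambda x: x / 255.0),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ]
)

frames = torch.randint(0, 256, (3, 480, 720)).float()  # [C, H, W], uint8-range values
out = video_transforms(frames)
assert out.min() >= -1.0 and out.max() <= 1.0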
@@ -101,7 +102,7 @@ def scale_transform(x):
         return x / 255.0
 
     def __len__(self) -> int:
-        return self.num_videos
+        return len(self.video_paths)
 
     def __getitem__(self, index: int) -> Dict[str, Any]:
         if isinstance(index, list):
@@ -358,10 +359,30 @@ def _find_nearest_resolution(self, height, width):
 
 
 class BucketSampler(Sampler):
-    def __init__(self, data_source: VideoDataset, batch_size: int = 8, shuffle: bool = True) -> None:
+    r"""
+    PyTorch Sampler that groups 3D data by height, width and frames.
+
+    Args:
+        data_source (`VideoDataset`):
+            A PyTorch dataset object that is an instance of `VideoDataset`.
+        batch_size (`int`, defaults to `8`):
+            The batch size to use for training.
+        shuffle (`bool`, defaults to `True`):
+            Whether or not to shuffle the data in each batch before dispatching to dataloader.
+        drop_last (`bool`, defaults to `False`):
+            Whether or not to drop incomplete buckets of data after completely iterating over all data
+            in the dataset. If set to True, only batches that have `batch_size` number of entries will
+            be yielded. If set to False, it is guaranteed that all data in the dataset will be processed
+            and batches that do not have `batch_size` number of entries will also be yielded.
+    """
+
+    def __init__(
+        self, data_source: VideoDataset, batch_size: int = 8, shuffle: bool = True, drop_last: bool = False
+    ) -> None:
         self.data_source = data_source
         self.batch_size = batch_size
         self.shuffle = shuffle
+        self.drop_last = drop_last
 
         self.buckets = {resolution: [] for resolution in data_source.resolutions}
 
@@ -377,3 +398,15 @@ def __iter__(self):
                 yield self.buckets[(f, h, w)]
                 del self.buckets[(f, h, w)]
                 self.buckets[(f, h, w)] = []
+
+        if self.drop_last:
+            return
+
+        for fhw, bucket in list(self.buckets.items()):
+            if len(bucket) == 0:
+                continue
+            if self.shuffle:
+                random.shuffle(bucket)
+            yield bucket
+            del self.buckets[fhw]
+            self.buckets[fhw] = []
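With `drop_last=False`, leftover partial buckets are flushed after the main pass, so no sample is silently dropped. Note that `BucketSampler` yields fully formed buckets of samples rather than indices; the `if isinstance(index, list)` pass-through in `__getitem__` above exists so a yielded bucket survives the DataLoader indexing path. A self-contained toy sketch of the pattern (names, shapes, and the `batch_size=1` wiring are illustrative, not the repo's actual training setup):

import random
from torch.utils.data import DataLoader, Dataset, Sampler

class ToyVideoDataset(Dataset):
    """Toy stand-in for VideoDataset: items carry a (frames, height, width) shape."""

    def __init__(self):
        self.shapes = [(49, 480, 720), (49, 480, 720), (49, 480, 720), (25, 256, 256)]

    def __len__(self):
        return len(self.shapes)

    def __getitem__(self, index):
        if isinstance(index, list):  # pass a pre-built bucket straight through
            return index
        num_frames, height, width = self.shapes[index]
        return {"num_frames": num_frames, "height": height, "width": width}

class ToyBucketSampler(Sampler):
    """Group samples by (frames, height, width); emit a bucket when it fills up."""

    def __init__(self, data_source, batch_size=2, shuffle=True, drop_last=False):
        self.data_source = data_source
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

    def __iter__(self):
        buckets = {}
        for data in self.data_source:
            key = (data["num_frames"], data["height"], data["width"])
            buckets.setdefault(key, []).append(data)
            if len(buckets[key]) == self.batch_size:
                if self.shuffle:
                    random.shuffle(buckets[key])
                yield buckets.pop(key)
        if self.drop_last:
            return
        for bucket in buckets.values():  # flush incomplete buckets
            if bucket:
                yield bucket

dataset = ToyVideoDataset()
sampler = ToyBucketSampler(dataset, batch_size=2)
# batch_size=1 because each sampler "index" is already a whole bucket.
loader = DataLoader(dataset, batch_size=1, sampler=sampler, collate_fn=lambda x: x[0])
for batch in loader:
    print([(d["num_frames"], d["height"], d["width"]) for d in batch])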
