
Commit f2a1626

improve dataset preparation (#43)
* update * update * update * update
1 parent 8c12f34 commit f2a1626

8 files changed: +129 −75 lines changed

assets/dataset.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -18,7 +18,7 @@ The framework supports resolutions and frame counts that meet the following cond
 - Any resolution as long as it is divisible by 32. For example, `720 * 480`, `1920 * 1020`, etc.
 
 - **Supported Frame Counts (Frames)**:
-  - Must satisfy (4K + 1), i.e., multiples of 4 such as 16, 24, 32, 48, 64, 80.
+  - Must be `4 * k` or `4 * k + 1` (example: 16, 32, 49, 81)
 
 It is recommended to place all videos in a single folder.
@@ -58,4 +58,4 @@ huggingface-cli download --repo-type dataset Wild-Heart/Disney-VideoGeneration-D
 
 This dataset has been prepared in the expected format and can be used directly. However, directly using the video dataset may cause Out of Memory (OOM) issues on GPUs with smaller VRAM because it requires loading the [VAE](https://huggingface.co/THUDM/CogVideoX-5b/tree/main/vae) (which encodes videos into latent space) and the large [T5-XXL](https://huggingface.co/google/t5-v1_1-xxl/) text encoder. To reduce memory usage, you can use the `training/prepare_dataset.py` script to precompute latents and embeddings.
 
-Fill or modify the parameters in `prepare_dataset.sh` and execute it to get precomputed latents and embeddings (make sure to specify `--save_tensors` to save the precomputed artifacts). When using these artifacts during training, ensure that you specify the `--load_tensors` flag, or else the videos will be used directly, requiring the text encoder and VAE to be loaded. The script also supports PyTorch DDP so that large datasets can be encoded in parallel across multiple GPUs (modify the `NUM_GPUS` parameter).
+Fill or modify the parameters in `prepare_dataset.sh` and execute it to get precomputed latents and embeddings (make sure to specify `--save_latents_and_embeddings` to save the precomputed artifacts). If preparing for image-to-video training, make sure to pass `--save_image_latents`, which encodes and saves image latents along with videos. When using these artifacts during training, ensure that you specify the `--load_tensors` flag, or else the videos will be used directly, requiring the text encoder and VAE to be loaded. The script also supports PyTorch DDP so that large datasets can be encoded in parallel across multiple GPUs (modify the `NUM_GPUS` parameter).
```
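To make the new frame-count constraint concrete, here is a minimal sketch of a validity check (not part of this commit; the helper name is hypothetical):

```python
def is_supported_frame_count(num_frames: int) -> bool:
    # Frame counts must be 4 * k or 4 * k + 1 for some integer k,
    # e.g. 16 (4 * 4), 32 (4 * 8), 49 (4 * 12 + 1), 81 (4 * 20 + 1).
    return num_frames % 4 in (0, 1)

assert is_supported_frame_count(49)      # 4 * 12 + 1 -> supported
assert not is_supported_frame_count(18)  # 4 * 4 + 2 -> not supported
```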

assets/dataset_zh.md

Lines changed: 4 additions & 3 deletions
```diff
@@ -18,7 +18,7 @@ A black and white animated sequence on a ship’s deck features a bulldog charac
 - Any resolution, as long as it is divisible by 32. For example, `720 * 480`, `1920 * 1020`, etc.
 
 - **Supported Frame Counts (Frames)**:
-  - Must satisfy (4K + 1), i.e., multiples of 4, such as 16, 24, 32, 48, 64, 80.
+  - Must be `4 * k` or `4 * k + 1` (example: 16, 32, 49, 81)
 
 It is recommended to place all videos in a single folder.
@@ -66,6 +66,7 @@ OOM (out of memory), because it requires loading the [VAE](https://huggingface.co/THUDM
 
 text encoder. To reduce memory requirements, you can use the `training/prepare_dataset.py` script to precompute latents and embeddings.
 
-Fill in or modify the parameters in `prepare_dataset.sh` and execute it to obtain the precomputed latents and embeddings (make sure to specify `--save_tensors`
-to save the precomputed artifacts). When using these artifacts during training, make sure to specify the `--load_tensors` flag; otherwise the videos will be used directly, requiring the text encoder and
+Fill in or modify the parameters in `prepare_dataset.sh` and execute it to obtain the precomputed latents and embeddings (make sure to specify `--save_latents_and_embeddings`
+to save the precomputed artifacts). If preparing for image-to-video training, make sure to pass `--save_image_latents`, which encodes image latents and saves them along with the videos.
+When using these artifacts during training, make sure to specify the `--load_tensors` flag; otherwise the videos will be used directly, requiring the text encoder and
 VAE to be loaded. The script also supports PyTorch DDP so that large datasets can be encoded in parallel across multiple GPUs (modify the `NUM_GPUS` parameter).
```

prepare_dataset.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -38,7 +38,7 @@ CMD_WITHOUT_PRE_ENCODING="\
   --dtype $DTYPE
 "
 
-CMD_WITH_PRE_ENCODING="$CMD_WITHOUT_PRE_ENCODING --save_tensors"
+CMD_WITH_PRE_ENCODING="$CMD_WITHOUT_PRE_ENCODING --save_latents_and_embeddings"
 
 # Select which you'd like to run
 CMD=$CMD_WITH_PRE_ENCODING
```
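On the shell side the change is just the renamed flag appended to the command string. As a hedged sketch only (the actual definitions in `training/prepare_dataset.py` are not shown in this diff), flags like these are typically declared as boolean switches:

```python
import argparse

parser = argparse.ArgumentParser()
# Assumed store_true switches; the flag names come from this commit, but
# their exact declarations in prepare_dataset.py may differ.
parser.add_argument("--save_latents_and_embeddings", action="store_true",
                    help="Precompute and save video latents and text embeddings.")
parser.add_argument("--save_image_latents", action="store_true",
                    help="Also save image latents for image-to-video training.")

args = parser.parse_args(["--save_latents_and_embeddings"])
assert args.save_latents_and_embeddings and not args.save_image_latents
```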

training/cogvideox_image_to_video_lora.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -26,7 +26,6 @@
 import diffusers
 import torch
 import transformers
-import wandb
 from accelerate import Accelerator, DistributedType
 from accelerate.logging import get_logger
 from accelerate.utils import (
@@ -53,6 +52,8 @@
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, T5EncoderModel
 
+import wandb
+
 
 from args import get_args  # isort:skip
 from dataset import BucketSampler, VideoDatasetWithResizing, VideoDatasetWithResizeAndRectangleCrop  # isort:skip
@@ -523,7 +524,7 @@ def load_model_hook(models, input_dir):
 
     # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataset) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
         overrode_max_train_steps = True
@@ -560,7 +561,7 @@ def load_model_hook(models, input_dir):
     )
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataset) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if overrode_max_train_steps:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
     # Afterwards we recalculate our number of training epochs
@@ -582,6 +583,7 @@ def load_model_hook(models, input_dir):
     accelerator.print("***** Running training *****")
     accelerator.print(f"  Num trainable parameters = {num_trainable_parameters}")
     accelerator.print(f"  Num examples = {len(train_dataset)}")
+    accelerator.print(f"  Num batches each epoch = {len(train_dataloader)}")
     accelerator.print(f"  Num epochs = {args.num_train_epochs}")
     accelerator.print(f"  Instantaneous batch size per device = {args.train_batch_size}")
     accelerator.print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
```

training/cogvideox_text_to_video_lora.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -25,7 +25,6 @@
 import diffusers
 import torch
 import transformers
-import wandb
 from accelerate import Accelerator, DistributedType
 from accelerate.logging import get_logger
 from accelerate.utils import (
@@ -52,6 +51,8 @@
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, T5EncoderModel
 
+import wandb
+
 
 from args import get_args  # isort:skip
 from dataset import BucketSampler, VideoDatasetWithResizing, VideoDatasetWithResizeAndRectangleCrop  # isort:skip
@@ -507,7 +508,7 @@ def collate_fn(data):
 
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataset) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
         overrode_max_train_steps = True
@@ -544,7 +545,7 @@ def collate_fn(data):
     )
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataset) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if overrode_max_train_steps:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
     # Afterwards we recalculate our number of training epochs
@@ -566,6 +567,7 @@ def collate_fn(data):
     accelerator.print("***** Running training *****")
     accelerator.print(f"  Num trainable parameters = {num_trainable_parameters}")
     accelerator.print(f"  Num examples = {len(train_dataset)}")
+    accelerator.print(f"  Num batches each epoch = {len(train_dataloader)}")
     accelerator.print(f"  Num epochs = {args.num_train_epochs}")
     accelerator.print(f"  Instantaneous batch size per device = {args.train_batch_size}")
     accelerator.print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
```

training/cogvideox_text_to_video_sft.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -25,7 +25,6 @@
 import diffusers
 import torch
 import transformers
-import wandb
 from accelerate import Accelerator, DistributedType
 from accelerate.logging import get_logger
 from accelerate.utils import (
@@ -51,6 +50,8 @@
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, T5EncoderModel
 
+import wandb
+
 
 from args import get_args  # isort:skip
 from dataset import BucketSampler, VideoDatasetWithResizing, VideoDatasetWithResizeAndRectangleCrop  # isort:skip
@@ -471,7 +472,7 @@ def collate_fn(data):
 
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataset) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
         overrode_max_train_steps = True
@@ -508,7 +509,7 @@ def collate_fn(data):
     )
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataset) / args.gradient_accumulation_steps)
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if overrode_max_train_steps:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
     # Afterwards we recalculate our number of training epochs
@@ -530,6 +531,7 @@ def collate_fn(data):
     accelerator.print("***** Running training *****")
     accelerator.print(f"  Num trainable parameters = {num_trainable_parameters}")
     accelerator.print(f"  Num examples = {len(train_dataset)}")
+    accelerator.print(f"  Num batches each epoch = {len(train_dataloader)}")
     accelerator.print(f"  Num epochs = {args.num_train_epochs}")
     accelerator.print(f"  Instantaneous batch size per device = {args.train_batch_size}")
     accelerator.print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
```

training/dataset.py

Lines changed: 11 additions & 1 deletion
```diff
@@ -375,7 +375,7 @@ class BucketSampler(Sampler):
         be yielded. If set to False, it is guaranteed that all data in the dataset will be processed
         and batches that do not have `batch_size` number of entries will also be yielded.
     """
-    
+
     def __init__(
         self, data_source: VideoDataset, batch_size: int = 8, shuffle: bool = True, drop_last: bool = False
     ) -> None:
@@ -386,6 +386,16 @@ def __init__(
 
         self.buckets = {resolution: [] for resolution in data_source.resolutions}
 
+        self._raised_warning_for_drop_last = False
+
+    def __len__(self):
+        if self.drop_last and not self._raised_warning_for_drop_last:
+            self._raised_warning_for_drop_last = True
+            logger.warning(
+                "Calculating the length for bucket sampler is not possible when `drop_last` is set to True. This may cause problems when setting the number of epochs used for training."
+            )
+        return (len(self.data_source) + self.batch_size - 1) // self.batch_size
+
     def __iter__(self):
         for index, data in enumerate(self.data_source):
             video_metadata = data["video_metadata"]
```
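The added `__len__` reports the ceiling of `len(data_source) / batch_size`, which is exact when `drop_last=False`; with `drop_last=True`, partially filled resolution buckets are discarded, so the true batch count can be smaller, hence the warning. A quick check of the arithmetic, with assumed sizes:

```python
# Ceiling division, as in the new BucketSampler.__len__:
num_videos, batch_size = 103, 8  # assumed sizes for illustration
reported_len = (num_videos + batch_size - 1) // batch_size
print(reported_len)  # 13: twelve full batches plus one partial batch of 7

# With drop_last=True, incomplete per-resolution buckets are dropped at the
# end of iteration, so fewer than `reported_len` batches may be yielded.
```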
