
Commit be9d99a

a-r-r-o-w and sayakpaul authored
DeepSpeed and DDP Configs (#10)
* add configs
* remove compiled ddp config
* add coauthor Co-Authored-By: Sayak Paul <[email protected]>
* update
* deepspeed numbers and fixes

--------

Co-authored-by: Sayak Paul <[email protected]>
1 parent 795f2c2 commit be9d99a


6 files changed: +97 −13 lines


README.md

Lines changed: 45 additions & 1 deletion
@@ -85,7 +85,7 @@ TODO: Add a section on creating and using precomputed embeddings.
 
 We provide training script for both text-to-video and image-to-video generation which are compatible with the [Cog family of models](https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce).
 
-Take a look at `training/*.sh`
+Take a look at `*.sh`
 
 Note: Untested on MPS
@@ -282,11 +282,55 @@ ValueError: Expected a cuda device, but got: cpu
 
 </details>
 
+<details>
+<summary> DeepSpeed (AdamW + CPU/Parameter offloading) </summary>
+
+> [!NOTE]
+> Results are for `lora_rank=256` with `gradient_checkpointing` enabled, 2x RTX 4090.
+
+With `train_batch_size = 1`:
+
+| model | memory_before_training | memory_before_validation | memory_after_validation | memory_after_testing |
+|:------------------:|:----------------------:|:------------------------:|:-----------------------:|:--------------------:|
+| THUDM/CogVideoX-2b | 13.141 | 13.141 | 21.070 | 24.602 |
+| THUDM/CogVideoX-5b | 20.170 | 20.170 | 28.662 | 38.957 |
+
+With `train_batch_size = 4`:
+
+| model | memory_before_training | memory_before_validation | memory_after_validation | memory_after_testing |
+|:------------------:|:----------------------:|:------------------------:|:-----------------------:|:--------------------:|
+| THUDM/CogVideoX-2b | 13.141 | 19.854 | 20.836 | 24.709 |
+| THUDM/CogVideoX-5b | 20.170 | 40.635 | 40.699 | 39.027 |
+
+</details>
+
 ### Full finetuning
 
 > [!NOTE]
 > `memory_after_validation` is indicative of the peak memory required for training. This is because apart from the activations, parameters and gradients stored for training, you also need to load the vae and text encoder in memory and spend some memory to perform inference. In order to reduce total memory required to perform training, one can choose to not perform validation/testing as part of the training script.
 
+<details>
+<summary> DeepSpeed (AdamW + CPU/Parameter offloading) </summary>
+
+> [!NOTE]
+> Results with `gradient_checkpointing` enabled, 2x RTX 4090.
+
+With `train_batch_size = 1`:
+
+| model | memory_before_training | memory_before_validation | memory_after_validation | memory_after_testing |
+|:------------------:|:----------------------:|:------------------------:|:-----------------------:|:--------------------:|
+| THUDM/CogVideoX-2b | 13.111 | 13.111 | 20.328 | 23.867 |
+| THUDM/CogVideoX-5b | 19.762 | 19.998 | 27.697 | 38.018 |
+
+With `train_batch_size = 4`:
+
+| model | memory_before_training | memory_before_validation | memory_after_validation | memory_after_testing |
+|:------------------:|:----------------------:|:------------------------:|:-----------------------:|:--------------------:|
+| THUDM/CogVideoX-2b | 13.111 | 21.188 | 21.254 | 23.869 |
+| THUDM/CogVideoX-5b | 19.762 | 43.465 | 43.531 | 38.082 |
+
+</details>
+
 - [ ] Make scripts compatible with DDP
 - [ ] Make scripts compatible with FSDP
 - [x] Make scripts compatible with DeepSpeed
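The DeepSpeed numbers above were collected on 2x RTX 4090 using the accelerate config added in this commit. As a rough sketch of how such a run would be launched — note that the trainer flags after the script path are illustrative placeholders and may not match the scripts' actual argument names; see the `*.sh` launchers for the real invocation:

# Sketch only: --config_file and the script path come from this commit;
# --gradient_checkpointing appears in the launcher diff below, and
# --train_batch_size is an illustrative placeholder.
accelerate launch --config_file accelerate_configs/deepspeed.yaml \
  training/cogvideox_text_to_video_lora.py \
  --gradient_checkpointing \
  --train_batch_size 1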

accelerate_configs/deepspeed.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  offload_optimizer_device: cpu
+  offload_param_device: cpu
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
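This config runs two processes on a single machine with bf16 mixed precision and DeepSpeed ZeRO stage 2, offloading optimizer state and parameters to CPU — presumably the setup behind the "AdamW + CPU/Parameter offloading" tables in the README. It is selected at launch time via `--config_file`; a minimal usage sketch:

# Minimal sketch; the trailing "..." stands for the training script's own arguments.
accelerate launch --config_file accelerate_configs/deepspeed.yaml \
  training/cogvideox_text_to_video_sft.py ...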
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+gpu_ids: 0,1
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
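This second config (its filename is not visible in this view) is a plain two-GPU DDP setup: two processes pinned to GPUs 0 and 1, bf16 mixed precision, and no DeepSpeed or CPU offloading. Usage mirrors the DeepSpeed case; the config path below is a hypothetical placeholder for wherever the file was saved:

# 'accelerate_configs/ddp.yaml' is a hypothetical placeholder path, not the real filename.
accelerate launch --config_file accelerate_configs/ddp.yaml \
  training/cogvideox_text_to_video_sft.py ...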

train_text_to_video_sft.sh

Lines changed: 2 additions & 2 deletions
@@ -55,7 +55,7 @@ for learning_rate in "${LEARNING_RATES[@]}"; do
     --gradient_checkpointing \
     --learning_rate $learning_rate \
     --lr_scheduler $lr_schedule \
-    --lr_warmup_steps 200 \
+    --lr_warmup_steps 800 \
     --lr_num_cycles 1 \
     --enable_slicing \
     --enable_tiling \
@@ -65,7 +65,7 @@ for learning_rate in "${LEARNING_RATES[@]}"; do
     --weight_decay 0.001 \
    --max_grad_norm 1.0 \
     --allow_tf32 \
-    --report_to wandb
+    --report_to wandb \
     --nccl_timeout 1800"
 
     echo "Running command: $cmd"

training/cogvideox_text_to_video_lora.py

Lines changed: 5 additions & 5 deletions
@@ -26,7 +26,7 @@
 import torch
 import transformers
 import wandb
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
 from accelerate.logging import get_logger
 from accelerate.utils import (
     DistributedDataParallelKwargs,
@@ -315,7 +315,7 @@ def main(args):
 "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config
 and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"]
 ):
-    weight_dtype = torch.float16
+    weight_dtype = torch.bfloat16
 else:
     if accelerator.mixed_precision == "fp16":
         weight_dtype = torch.float16
@@ -631,7 +631,7 @@ def collate_fn(data):
 
 videos = latent_dist.sample() * VAE_SCALING_FACTOR
 videos = videos.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
-videos = videos.to(memory_format=torch.contiguous_format).float()
+videos = videos.to(memory_format=torch.contiguous_format, dtype=weight_dtype)
 model_input = videos
 
 # Encode prompts
@@ -646,7 +646,7 @@ def collate_fn(data):
 requires_grad=False,
 )
 else:
-    prompt_embeds = prompts
+    prompt_embeds = prompts.to(dtype=weight_dtype)
 
 # Sample noise that will be added to the latents
 noise = torch.randn_like(model_input)
@@ -721,7 +721,7 @@ def collate_fn(data):
 progress_bar.update(1)
 global_step += 1
 
-if accelerator.is_main_process:
+if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED:
     if global_step % args.checkpointing_steps == 0:
         # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
         if args.checkpoints_total_limit is not None:

training/cogvideox_text_to_video_sft.py

Lines changed: 5 additions & 5 deletions
@@ -26,7 +26,7 @@
 import torch
 import transformers
 import wandb
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
 from accelerate.logging import get_logger
 from accelerate.utils import (
     DistributedDataParallelKwargs,
@@ -271,7 +271,7 @@ def main(args):
 "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config
 and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"]
 ):
-    weight_dtype = torch.float16
+    weight_dtype = torch.bfloat16
 else:
     if accelerator.mixed_precision == "fp16":
         weight_dtype = torch.float16
@@ -562,7 +562,7 @@ def collate_fn(data):
 
 videos = latent_dist.sample() * VAE_SCALING_FACTOR
 videos = videos.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
-videos = videos.to(memory_format=torch.contiguous_format).float()
+videos = videos.to(memory_format=torch.contiguous_format, dtype=weight_dtype)
 model_input = videos
 
 # Encode prompts
@@ -577,7 +577,7 @@ def collate_fn(data):
 requires_grad=False,
 )
 else:
-    prompt_embeds = prompts
+    prompt_embeds = prompts.to(dtype=weight_dtype)
 
 # Sample noise that will be added to the latents
 noise = torch.randn_like(model_input)
@@ -652,7 +652,7 @@ def collate_fn(data):
 progress_bar.update(1)
 global_step += 1
 
-if accelerator.is_main_process:
+if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED:
     if global_step % args.checkpointing_steps == 0:
         # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
         if args.checkpoints_total_limit is not None:
