
Commit 1639171

Rename vae folder and update multinode training in readme (#56)
* rename vqvae
* multi node training
1 parent ff2210e commit 1639171

File tree

10 files changed: +24 −5 lines changed


README.md

Lines changed: 18 additions & 0 deletions
@@ -97,6 +97,7 @@ torchrun --standalone --nproc_per_node=2 train.py \
 ```
 
 We disable all speedup methods by default. Here are details of some key arguments for training:
+- `--nproc_per_node`: The number of GPUs to use on the current node.
 - `--plugin`: The booster plugin used by ColossalAI; `zero2` and `ddp` are supported. The default value is `zero2`. We recommend enabling `zero2`.
 - `--mixed_precision`: The data type for mixed-precision training. The default value is `fp16`.
 - `--grad_checkpoint`: Whether to enable gradient checkpointing, which saves memory during training. The default value is `False`. We recommend disabling it when memory is sufficient.
@@ -107,6 +108,23 @@ We disable all speedup methods by default. Here are details of some key argument
 
 For more details on the configuration of the training process, please visit our code.
 
+<b>Multi-Node Training.</b>
+
+To train OpenDiT on multiple nodes, you can use the following command:
+
+```
+colossalai run --nproc_per_node 8 --hostfile hostfile train.py \
+    --model DiT-XL/2 \
+    --batch_size 2
+```
+
+You also need to create a `hostfile` in the current directory. It should contain the IP addresses of all your nodes, and every node must be reachable over SSH without a password. An example hostfile:
+
+```
+111.111.111.111 # ip of node1
+222.222.222.222 # ip of node2
+```
+
 <b>Inference.</b> You can perform inference using the DiT model as follows. You need to replace the checkpoint path with your own trained model, or you can download the [official](https://github.com/facebookresearch/DiT?tab=readme-ov-file#sampling--) or [our](https://drive.google.com/file/d/1P4t2V3RDNcoCiEkbVWAjNetm3KC_4ueI/view?usp=drive_link) checkpoint for inference.
 
 ```shell
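The multi-node instructions added above require passwordless SSH from the launch node to every host in `hostfile`. Below is a minimal sketch of one common way to arrange that with standard OpenSSH tooling; the account name `user` is a placeholder, the IPs mirror the example hostfile, and none of this is part of the OpenDiT codebase:

```shell
# Generate a key pair on the launch node if one does not exist yet.
ssh-keygen -t ed25519 -N "" -f ~/.ssh/id_ed25519

# Copy the public key to every node listed in hostfile
# ("user" is a placeholder account name).
ssh-copy-id user@111.111.111.111   # node1
ssh-copy-id user@222.222.222.222   # node2

# Verify that each node is reachable without a password prompt.
ssh user@111.111.111.111 hostname
```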
Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from torchvision.io import write_video
 from torchvision.utils import save_image
 
-from opendit.vqvae.wrapper import AutoencoderKLWrapper
+from opendit.vae.wrapper import AutoencoderKLWrapper
 
 
 def t2v(x):

sample.py

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@
 from opendit.models.diffusion import create_diffusion
 from opendit.models.dit import DiT_models
 from opendit.utils.download import find_model
-from opendit.vqvae.reconstruct import save_sample
-from opendit.vqvae.wrapper import AutoencoderKLWrapper
+from opendit.vae.reconstruct import save_sample
+from opendit.vae.wrapper import AutoencoderKLWrapper
 
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True

train.py

Lines changed: 3 additions & 2 deletions
@@ -32,7 +32,7 @@
 from opendit.utils.pg_utils import ProcessGroupManager
 from opendit.utils.train_utils import all_reduce_mean, format_numel_str, get_model_numel, requires_grad, update_ema
 from opendit.utils.video_utils import DatasetFromCSV, get_transforms_image, get_transforms_video
-from opendit.vqvae.wrapper import AutoencoderKLWrapper
+from opendit.vae.wrapper import AutoencoderKLWrapper
 
 # the first flag below was False when we tested this script but True makes A100 training a lot faster:
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -113,8 +113,9 @@ def main(args):
     assert args.image_size % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)."
     if args.use_video:
         # Wrap the VAE in a wrapper that handles video data
-        # Use 3d patch size that is divisible by the input size
+        # We use the 2D VAE from Stability AI instead of the 3D VQ-VAE from VideoGPT because it gives better results
         vae = AutoencoderKLWrapper(vae)
+        # Use 3d patch size that is divisible by the input size
         input_size = (args.num_frames, args.image_size, args.image_size)
         for i in range(3):
             assert input_size[i] % vae.patch_size[i] == 0, "Input size must be divisible by patch size"

0 commit comments
