Commit bab4016

oahzxl and KKZ20 authored
support latte and reorganize dir (#77)
* refactor dir and add latte
* Add copyright and license information to source files
* Add VDiT-XL/2x2x2 video model and Latte-XL/2x2x2 model
* Update model and load checkpoint from sharded state dict
* Remove unused import statement
* polish
* Update class labels for video conditioning
* Add colossalai and LowLevelZeroPlugin imports, update MASTER_PORT
* move dir
* update train
* Update model configuration in sample_video.sh and train_video.sh
* Update OpenDiT README with latest news
* Update module imports and model types
* update sequence parallel test
* update readme and train example
* fix bugs on num_heads
* Disable modulate kernel optimization due to NaN issues
* Update model options in training and sampling scripts

---------

Co-authored-by: KKZ20 <[email protected]>
1 parent 0a3843b · commit bab4016

31 files changed: +1068 −794 lines changed

README.md

Lines changed: 24 additions & 8 deletions
@@ -6,9 +6,14 @@
 <p align="center"><a href="https://github.com/NUS-HPC-AI-Lab/OpenDiT">[Homepage]</a> | <a href="https://discord.gg/yXF4n8Et">[Discord]</a> | <a href="./figure/wechat.jpg">[WeChat]</a> | <a href="https://twitter.com/YangYou1991/status/1762447718105170185">[Twitter]</a> | <a href="https://zhuanlan.zhihu.com/p/684457582">[Zhihu]</a> | <a href="https://mp.weixin.qq.com/s/IBb9vlo8hfYKrj9ztxkhjg">[Media]</a></p>
 </p>
 
+### Latest News 🔥
+
+* [2024/03/01] Support DiT-based Latte for text-to-video generation.
+* [2024/02/27] Officially release OpenDiT: An Easy, Fast and Memory-Efficient System for DiT Training and Inference.
+
 # About
 
-OpenDiT is an open-source project that provides a high-performance implementation of Diffusion Transformer(DiT) powered by Colossal-AI, specifically designed to enhance the efficiency of training and inference for DiT applications, including text-to-video generation and text-to-image generation.
+OpenDiT is an open-source project that provides a high-performance implementation of Diffusion Transformer (DiT) powered by Colossal-AI, specifically designed to enhance the efficiency of training and inference for DiT applications, including text-to-video generation and text-to-image generation.
 
 OpenDiT boasts the performance by the following techniques:

@@ -87,26 +92,30 @@ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation -
 
 ### Image
 
-<b>Training.</b> You can train the DiT model by executing the following command:
+<b>Training.</b> You can train the DiT model on CIFAR10 by executing the following command:
 
 ```shell
 # Use script
 bash train_img.sh
 # Use command line
 torchrun --standalone --nproc_per_node=2 train.py \
     --model DiT-XL/2 \
-    --batch_size 2
+    --batch_size 2 \
+    --num_classes 10
 ```
 
 We disable all speedup methods by default. Here are details of some key arguments for training:
 - `--nproc_per_node`: The GPU number you want to use for the current node.
 - `--plugin`: The booster plugin used by ColossalAI, `zero2` and `ddp` are supported. The default value is `zero2`. Recommend to enable `zero2`.
 - `--mixed_precision`: The data type for mixed precision training. The default value is `fp16`.
 - `--grad_checkpoint`: Whether enable the gradient checkpointing. This saves the memory cost during training process. The default value is `False`. Recommend to disable it when memory is enough.
-- `--enable_modulate_kernel`: Whether enable the modulate kernel optimization. This speeds up the training process. The default value is `False`. Recommend to enable it for GPU < H100.
 - `--enable_layernorm_kernel`: Whether enable the layernorm kernel optimization. This speeds up the training process. The default value is `False`. Recommend to enable it.
 - `--enable_flashattn`: Whether enable the FlashAttention. This speeds up the training process. The default value is `False`. Recommend to enable.
+- `--enable_modulate_kernel`: Whether enable the modulate kernel optimization. This speeds up the training process. The default value is `False`. This kernel can cause NaN under some circumstances, so we recommend disabling it for now.
 - `--sequence_parallel_size`: The sequence parallelism size. Will enable sequence parallelism when setting a value > 1. The default value is 1. Recommend to disable it if memory is enough.
+- `--load`: Load a previously saved checkpoint directory and continue training.
+- `--num_classes`: The number of label classes. Only used for label-to-image generation.
+
 
 For more details on the configuration of the training process, please visit our code.

@@ -137,14 +146,17 @@ python sample.py --model DiT-XL/2 --image_size 256 --ckpt ./model.pt
 ```
 
 ### Video
-<b>Training.</b> Our video training pipeline is a faithful implementation, and we encourage you to explore your own strategies using OpenDiT. You can train the video DiT model by executing the following command:
+<b>Training.</b> We currently support `VDiT` and `Latte` for video generation. VDiT adopts the DiT structure and uses video as its input data. Latte further uses more efficient spatial & temporal blocks on top of VDiT (not exactly aligned with the original [Latte](https://github.com/Vchitect/Latte)).
+
+Our video training pipeline is a faithful implementation, and we encourage you to explore your own strategies using OpenDiT. You can train the video DiT model by executing the following command:
 
 ```shell
 # train with scipt
 bash train_video.sh
 # train with command line
+# model can also be Latte-XL/1x2x2
 torchrun --standalone --nproc_per_node=2 train.py \
-    --model vDiT-XL/222 \
+    --model VDiT-XL/1x2x2 \
     --use_video \
     --data_path ./videos/demo.csv \
     --batch_size 1 \
@@ -166,15 +178,18 @@ This script shares the same speedup methods as we have shown in the image traini
 # Use script
 bash sample_video.sh
 # Use command line
+# model can also be Latte-XL/1x2x2
 python sample.py \
-    --model vDiT-XL/222 \
+    --model VDiT-XL/1x2x2 \
     --use_video \
     --ckpt ckpt_path \
     --num_frames 16 \
     --image_size 256 \
     --frame_interval 3
 ```
 
+Inference tips: 1) The EMA model requires quite a long time to converge and produce meaningful results, so you can sample the base model (`--ckpt /epochXX-global_stepXX/model`) instead of the EMA model (`--ckpt /epochXX-global_stepXX/ema.pt`) to check your training progress. 2) Modifying the text condition in `sample.py` so that it aligns with your dataset helps produce better results in the early stage of training.
+
 ## FastSeq
 
 ![fastseq_overview](./figure/fastseq_overview.png)
@@ -210,7 +225,8 @@ torchrun --standalone --nproc_per_node=8 train.py \
     --batch_size 180 \
     --enable_layernorm_kernel \
     --enable_flashattn \
-    --mixed_precision fp16
+    --mixed_precision fp16 \
+    --num_classes 1000
 ```

File renamed without changes.
File renamed without changes.
File renamed without changes.

opendit/embed/label_emb.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Modified from Meta DiT
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# DiT: https://github.com/facebookresearch/DiT/tree/main
+# GLIDE: https://github.com/openai/glide-text2im
+# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
+# --------------------------------------------------------
+
+
+import torch
+from torch import nn
+
+
+class LabelEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    """
+
+    def __init__(self, num_classes, hidden_size, dropout_prob):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+
+    def token_drop(self, labels, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+
+    def forward(self, labels, train, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        embeddings = self.embedding_table(labels)
+        return embeddings
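For context, a minimal usage sketch of this embedder (not part of the commit; the class count, hidden size, and dropout probability below are arbitrary placeholders):

```python
import torch

from opendit.embed.label_emb import LabelEmbedder

# Hypothetical setup: 10 classes (e.g. CIFAR10), 1152-dim hidden size,
# 10% label dropout for classifier-free guidance.
embedder = LabelEmbedder(num_classes=10, hidden_size=1152, dropout_prob=0.1)

labels = torch.randint(0, 10, (4,))       # a batch of 4 class labels
train_emb = embedder(labels, train=True)  # some labels may be replaced by the CFG index (10)
eval_emb = embedder(labels, train=False)  # no dropout at evaluation time
print(train_emb.shape)                    # torch.Size([4, 1152])
```

Because `dropout_prob > 0`, the embedding table holds `num_classes + 1` rows, and dropped labels are mapped to the extra row so the model can also be run unconditionally for classifier-free guidance.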

opendit/embed/patch_emb.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn.functional as F
+from torch import nn
+
+
+class PatchEmbed3D(nn.Module):
+    """Video to Patch Embedding.
+
+    Args:
+        patch_size (tuple[int]): Patch token size. Default: (2,4,4).
+        in_chans (int): Number of input video channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(
+        self,
+        patch_size=(2, 4, 4),
+        in_chans=3,
+        embed_dim=96,
+        norm_layer=None,
+        flatten=True,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.flatten = flatten
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, D, H, W = x.size()
+        if W % self.patch_size[2] != 0:
+            x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
+        if H % self.patch_size[1] != 0:
+            x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
+        if D % self.patch_size[0] != 0:
+            x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
+
+        x = self.proj(x)  # (B C T H W)
+        if self.norm is not None:
+            D, Wh, Ww = x.size(2), x.size(3), x.size(4)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCTHW -> BNC
+        return x
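As a quick illustration of the shapes involved (a minimal sketch, not part of the commit; the tensor sizes and embedding dimension are arbitrary, and mapping the `1x2x2` model-name suffix to a `(1, 2, 2)` patch size is an assumption based on the README changes above):

```python
import torch

from opendit.embed.patch_emb import PatchEmbed3D

# Hypothetical video latent: batch 2, 4 channels, 16 frames, 32x32 spatial grid.
x = torch.randn(2, 4, 16, 32, 32)

# A (1, 2, 2) patch keeps the temporal resolution and halves each spatial side.
patch_embed = PatchEmbed3D(patch_size=(1, 2, 2), in_chans=4, embed_dim=1152)
tokens = patch_embed(x)  # flatten=True, so BCTHW -> BNC
print(tokens.shape)      # torch.Size([2, 4096, 1152]); 4096 = 16 * 16 * 16 patches
```

Inputs whose depth, height, or width are not divisible by the patch size are zero-padded first, so arbitrary frame counts and resolutions are accepted.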

opendit/embed/pos_emb.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+
+def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
+    """
+    grid_size: int of the grid height and width
+    t_size: int of the temporal size
+    return:
+    pos_embed: [t_size*grid_size*grid_size, embed_dim] or [1+t_size*grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    assert embed_dim % 4 == 0
+    embed_dim_spatial = embed_dim // 4 * 3
+    embed_dim_temporal = embed_dim // 4
+
+    # spatial
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed_spatial = get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
+
+    # temporal
+    grid_t = np.arange(t_size, dtype=np.float32)
+    pos_embed_temporal = get_1d_sincos_pos_embed_from_grid(embed_dim_temporal, grid_t)
+
+    # concatenate: [T, H, W] order
+    pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
+    pos_embed_temporal = np.repeat(pos_embed_temporal, grid_size**2, axis=1)  # [T, H*W, D // 4]
+    pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
+    pos_embed_spatial = np.repeat(pos_embed_spatial, t_size, axis=0)  # [T, H*W, D // 4 * 3]
+
+    pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)
+    pos_embed = pos_embed.reshape([-1, embed_dim])  # [T*H*W, D]
+
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token and extra_tokens > 0:
+        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+
+
+def get_1d_sincos_pos_embed(embed_dim, length):
+    pos = np.arange(0, length)[..., None]
+    return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
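For reference, a minimal sketch of how these helpers might be called (the embedding dimension, grid size, and frame count below are arbitrary, not taken from the commit):

```python
from opendit.embed.pos_emb import get_2d_sincos_pos_embed, get_3d_sincos_pos_embed

# Hypothetical sizes: 1152-dim embedding, 16x16 spatial grid, 16 frames.
pos_3d = get_3d_sincos_pos_embed(embed_dim=1152, grid_size=16, t_size=16)
print(pos_3d.shape)  # (4096, 1152): one row per (t, h, w) position

pos_2d = get_2d_sincos_pos_embed(embed_dim=1152, grid_size=16)
print(pos_2d.shape)  # (256, 1152): one row per (h, w) position
```

In the 3D variant, a quarter of the channels encode the temporal index and the remaining three quarters encode the spatial position, which is why `embed_dim` must be divisible by 4.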
