
Commit 816daad

support openai/adm with minimal code change
1 parent 8581d9b commit 816daad

4 files changed, +248 -47 lines


src/diffusers/models/attention_processor.py

Lines changed: 56 additions & 0 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import math
 from importlib import import_module
 from typing import Callable, Optional, Union

@@ -707,6 +708,61 @@ def fuse_projections(self, fuse=True):
         self.fused_projections = fuse


+class QKVAttentionADM(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v)
+        return a.reshape(bs, -1, length)
+
+
+class AttentionADM(nn.Module):
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+    ):
+        super().__init__()
+        self.channels = channels
+        if num_head_channels == -1:
+            self.num_heads = num_heads
+        else:
+            assert (
+                channels % num_head_channels == 0
+            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            self.num_heads = channels // num_head_channels
+        self.norm = nn.GroupNorm(32, channels)
+        self.qkv = nn.Conv1d(channels, channels * 3, 1, 1)
+        self.attention = QKVAttentionADM(self.num_heads)
+        self.proj_out = nn.Conv1d(channels, channels, 1, 1)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        **cross_attention_kwargs,
+    ):
+        # ignore temb and kwargs for now
+        b, c, *spatial = hidden_states.shape
+        hidden_states = hidden_states.reshape(b, c, -1)
+        qkv = self.qkv(self.norm(hidden_states))
+        h = self.attention(qkv)
+        h = self.proj_out(h)
+        return (hidden_states + h).reshape(b, c, *spatial)
+
+
 class AttnProcessor:
     r"""
     Default processor for performing attention-related computations.

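For reference, a minimal shape check of the module added above. This is a sketch that assumes a build containing this commit; the channel and head values are illustrative, not from the diff.

# Sketch: round-trip an NCHW feature map through the new ADM attention block.
# AttentionADM flattens (B, C, H, W) to (B, C, H*W), applies GroupNorm and a 1x1
# Conv1d QKV projection, runs QKVAttentionADM, projects out, and adds the residual
# before reshaping back to the original spatial layout.
import torch
from diffusers.models.attention_processor import AttentionADM

attn = AttentionADM(channels=64, num_head_channels=32)  # 64 // 32 = 2 heads
x = torch.randn(1, 64, 8, 8)
out = attn(x)
assert out.shape == x.shape  # residual connection preserves the input shape
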
src/diffusers/models/embeddings.py

Lines changed: 32 additions & 0 deletions
@@ -254,6 +254,38 @@ def forward(self, timesteps):
         return t_emb


+def timestep_embedding_adm(timesteps, dim, max_period=10000):
+    """
+    ADM order embedding from https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py#L103
+    """
+    half = dim // 2
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
+        device=timesteps.device
+    )
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+
+
+class TimestepsADM(nn.Module):
+    """
+    ADM order embedding from https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py#L103
+    """
+
+    def __init__(self, num_channels: int):
+        super().__init__()
+        self.num_channels = num_channels
+
+    def forward(self, timesteps):
+        t_emb = timestep_embedding_adm(
+            timesteps,
+            self.num_channels,
+        )
+        return t_emb
+
+
 class GaussianFourierProjection(nn.Module):
     """Gaussian Fourier embeddings for noise levels."""

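For intuition, a small sketch of the new timestep projection (again assuming a build containing this commit). The ADM embedding is cos-first, which is the ordering detail that motivates a separate path so that converted guided-diffusion checkpoints keep their weights aligned.

# Sketch: the ADM embedding concatenates [cos, sin] with frequencies
# exp(-log(10000) * i / half) for i in range(half).
import torch
from diffusers.models.embeddings import TimestepsADM

time_proj = TimestepsADM(num_channels=128)
t_emb = time_proj(torch.tensor([0, 10, 500]))
print(t_emb.shape)  # torch.Size([3, 128])
# For comparison, get_timestep_embedding(t, 128, flip_sin_to_cos=True,
# downscale_freq_shift=0) should give the same cos-first layout and frequency scaling.
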
src/diffusers/models/unets/unet_2d.py

Lines changed: 39 additions & 19 deletions
@@ -19,9 +19,9 @@

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import BaseOutput
-from ..embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from ..embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps, TimestepsADM
 from ..modeling_utils import ModelMixin
-from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
+from .unet_2d_blocks import UNetMidBlock2D, UNetMidBlock2DADM, get_down_block, get_up_block


 @dataclass
@@ -58,7 +58,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
         down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
             Tuple of downsample block types.
         mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`):
-            Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`.
+            Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UNetMidBlock2DADM`.
         up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
             Tuple of upsample block types.
         block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
@@ -72,6 +72,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
             The upsample type for upsampling layers. Choose between "conv" and "resnet"
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        attention_type (`str`, *optional*, defaults to `"default"`): The attention type. Choose between `"default"` and `"adm"`.
         attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
         norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
         attn_norm_num_groups (`int`, *optional*, defaults to `None`):
@@ -100,6 +101,7 @@ def __init__(
         freq_shift: int = 0,
         flip_sin_to_cos: bool = True,
         down_block_types: Tuple[str] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
+        mid_block_type: str = "UNetMidBlock2D",
         up_block_types: Tuple[str] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
         block_out_channels: Tuple[int] = (224, 448, 672, 896),
         layers_per_block: int = 2,
@@ -109,6 +111,7 @@ def __init__(
         upsample_type: str = "conv",
         dropout: float = 0.0,
         act_fn: str = "silu",
+        attention_type: str = "default",
         attention_head_dim: Optional[int] = 8,
         norm_num_groups: int = 32,
         attn_norm_num_groups: Optional[int] = None,
@@ -148,7 +151,9 @@ def __init__(
         elif time_embedding_type == "learned":
             self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
             timestep_input_dim = block_out_channels[0]
-
+        elif time_embedding_type == "adm":
+            self.time_proj = TimestepsADM(block_out_channels[0])
+            timestep_input_dim = block_out_channels[0]
         self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

         # class embedding
@@ -182,6 +187,7 @@ def __init__(
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
+                attention_type=attention_type,
                 attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
                 downsample_padding=downsample_padding,
                 resnet_time_scale_shift=resnet_time_scale_shift,
@@ -191,20 +197,34 @@ def __init__(
             self.down_blocks.append(down_block)

         # mid
-        self.mid_block = UNetMidBlock2D(
-            in_channels=block_out_channels[-1],
-            temb_channels=time_embed_dim,
-            dropout=dropout,
-            resnet_eps=norm_eps,
-            resnet_act_fn=act_fn,
-            output_scale_factor=mid_block_scale_factor,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
-            resnet_groups=norm_num_groups,
-            attn_groups=attn_norm_num_groups,
-            add_attention=add_attention,
-        )
-
+        if mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
+                resnet_groups=norm_num_groups,
+                attn_groups=attn_norm_num_groups,
+                add_attention=add_attention,
+            )
+        elif mid_block_type == "UNetMidBlock2DADM":
+            self.mid_block = UNetMidBlock2DADM(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
+                resnet_groups=norm_num_groups,
+            )
+        else:
+            raise ValueError
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
         output_channel = reversed_block_out_channels[0]
@@ -214,7 +234,6 @@ def __init__(
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            is_final_block = i == len(block_out_channels) - 1
-
            up_block = get_up_block(
                up_block_type,
                num_layers=layers_per_block + 1,
@@ -226,6 +245,7 @@ def __init__(
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
+                attention_type=attention_type,
                attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
                resnet_time_scale_shift=resnet_time_scale_shift,
                upsample_type=upsample_type,

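Taken together, the new options are selected purely through the model config. A usage sketch follows; it assumes a build containing this commit, the block sizes and head dim are illustrative, and the attention_type plumbing into the down/up blocks lives in the fourth changed file, which is not shown in this view.

# Sketch: instantiate UNet2DModel with the ADM-style timestep embedding,
# mid block, and attention, then run a dummy forward pass.
import torch
from diffusers import UNet2DModel

model = UNet2DModel(
    sample_size=64,
    in_channels=3,
    out_channels=3,
    time_embedding_type="adm",           # TimestepsADM projection
    mid_block_type="UNetMidBlock2DADM",  # ADM mid block instead of UNetMidBlock2D
    attention_type="adm",                # ADM attention in the Attn down/up blocks
    down_block_types=("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
    block_out_channels=(128, 256, 384, 512),
    attention_head_dim=64,
)
out = model(torch.randn(1, 3, 64, 64), timestep=torch.tensor([10])).sample
print(out.shape)  # torch.Size([1, 3, 64, 64])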