from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
-from ..normalization import AllegroAdaLayerNormSingle
+from ..normalization import AdaLayerNormSingle


logger = logging.get_logger(__name__)


@maybe_allow_in_graph
class AllegroTransformerBlock(nn.Module):
    r"""
-    TODO(aryan): docs
+    Transformer block used in [Allegro](https://github.com/rhymes-ai/Allegro) model.
+    Args:
+        dim (`int`):
+            The number of channels in the input and output.
+        num_attention_heads (`int`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`):
+            The number of channels in each head.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability to use.
+        cross_attention_dim (`int`, defaults to `2304`):
+            The dimension of the cross attention features.
+        activation_fn (`str`, defaults to `"gelu-approximate"`):
+            Activation function to be used in feed-forward.
+        attention_bias (`bool`, defaults to `False`):
+            Whether or not to use bias in attention projection layers.
+        only_cross_attention (`bool`, defaults to `False`):
+        norm_elementwise_affine (`bool`, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_eps (`float`, defaults to `1e-5`):
+            Epsilon value for normalization layers.
+        final_dropout (`bool` defaults to `False`):
+            Whether to apply a final dropout after the last feed-forward layer.
    """

    def __init__(
@@ -48,11 +70,8 @@ def __init__(
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        attention_bias: bool = False,
-        only_cross_attention: bool = False,
-        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
-        final_dropout: bool = False,
    ):
        super().__init__()

@@ -65,8 +84,7 @@ def __init__(
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
-            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-            upcast_attention=upcast_attention,
+            cross_attention_dim=None,
            processor=AllegroAttnProcessor2_0(),
        )

@@ -79,9 +97,8 @@ def __init__(
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
-            upcast_attention=upcast_attention,
            processor=AllegroAttnProcessor2_0(),
-        )  # is self-attn if encoder_hidden_states is none
+        )

        # 3. Feed Forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
@@ -90,7 +107,6 @@ def __init__(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
-            final_dropout=final_dropout,
        )

        # 4. Scale-shift
@@ -159,37 +175,55 @@ class AllegroTransformer3DModel(ModelMixin, ConfigMixin):
    _supports_gradient_checkpointing = True

    """
-    A 2D Transformer model for image-like data.
-
-    Parameters:
-        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
-        in_channels (`int`, *optional*):
-            The number of channels in the input and output (specify if the input is **continuous**).
-        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
-        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
-            This is fixed during training since it is used to learn a number of position embeddings.
-        num_vector_embeds (`int`, *optional*):
-            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
-            Includes the class for the masked latent pixel.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
-        num_embeds_ada_norm ( `int`, *optional*):
-            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
-            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
-            added to the hidden states.
-
-            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
-        attention_bias (`bool`, *optional*):
-            Configure if the `TransformerBlocks` attention should contain a bias parameter.
+    A 3D Transformer model for video-like data.
+    Args:
+        patch_size (`int`, defaults to `2`):
+            The size of spatial patches to use in the patch embedding layer.
+        patch_size_t (`int`, defaults to `1`):
+            The size of temporal patches to use in the patch embedding layer.
+        num_attention_heads (`int`, defaults to `24`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `96`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `4`):
+            The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `4`):
+            The number of channels in the output.
+        num_layers (`int`, defaults to `32`):
+            The number of layers of Transformer blocks to use.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability to use.
+        cross_attention_dim (`int`, defaults to `2304`):
+            The dimension of the cross attention features.
+        attention_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in the attention projection layers.
+        sample_height (`int`, defaults to `90`):
+            The height of the input latents.
+        sample_width (`int`, defaults to `160`):
+            The width of the input latents.
+        sample_frames (`int`, defaults to `22`):
+            The number of frames in the input latents.
+        activation_fn (`str`, defaults to `"gelu-approximate"`):
+            Activation function to use in feed-forward.
+        norm_elementwise_affine (`bool`, defaults to `True`):
+            Whether or not to use elementwise affine in normalization layers.
+        norm_eps (`float`, defaults to `1e-5`):
+            The epsilon value to use in normalization layers.
+        caption_channels (`int`, defaults to `4096`):
+            Number of channels to use for projecting the caption embeddings.
+        interpolation_scale_h (`float`, defaults to `2.0`):
+            Scaling factor to apply in 3D positional embeddings across height dimension.
+        interpolation_scale_w (`float`, defaults to `2.0`):
+            Scaling factor to apply in 3D positional embeddings across width dimension.
+        interpolation_scale_t (`float`, defaults to `2.2`):
+            Scaling factor to apply in 3D positional embeddings across time dimension.
    """

    @register_to_config
    def __init__(
        self,
        patch_size: int = 2,
-        patch_size_temporal: int = 1,
+        patch_size_t: int = 1,
        num_attention_heads: int = 24,
        attention_head_dim: int = 96,
        in_channels: int = 4,
@@ -202,7 +236,6 @@ def __init__(
        sample_width: int = 160,
        sample_frames: int = 22,
        activation_fn: str = "gelu-approximate",
-        upcast_attention: bool = False,
        norm_elementwise_affine: bool = False,
        norm_eps: float = 1e-6,
        caption_channels: int = 4096,
@@ -245,7 +278,6 @@ def __init__(
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
-                    upcast_attention=upcast_attention,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                )
@@ -259,7 +291,7 @@ def __init__(
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * out_channels)

        # 4. Timestep embeddings
-        self.adaln_single = AllegroAdaLayerNormSingle(self.inner_dim, use_additional_conditions=False)
+        self.adaln_single = AdaLayerNormSingle(self.inner_dim, use_additional_conditions=False)

        # 5. Caption projection
        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=self.inner_dim)
@@ -280,9 +312,13 @@ def forward(
        return_dict: bool = True,
    ):
        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        p_t = self.config.patch_size_temporal
+        p_t = self.config.patch_size_t
        p = self.config.patch_size

+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p
+        post_patch_width = width // p
+
        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
@@ -317,22 +353,20 @@ def forward(
            encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

-        # 1. Input
-        post_patch_num_frames = num_frames // self.config.patch_size_temporal
-        post_patch_height = height // self.config.patch_size
-        post_patch_width = width // self.config.patch_size
-
+        # 1. Timestep embeddings
        timestep, embedded_timestep = self.adaln_single(
            timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
        )

+        # 2. Patch embeddings
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).flatten(0, 1)
        hidden_states = self.pos_embed(hidden_states)
        hidden_states = hidden_states.unflatten(0, (batch_size, -1)).flatten(1, 2)

        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, encoder_hidden_states.shape[-1])

+        # 3. Transformer blocks
        for i, block in enumerate(self.transformer_blocks):
            # TODO(aryan): Implement gradient checkpointing
            if self.gradient_checkpointing:
@@ -364,7 +398,7 @@ def custom_forward(*inputs):
                    image_rotary_emb=image_rotary_emb,
                )

-        # 3. Output
+        # 4. Output normalization & projection
        shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
        hidden_states = self.norm_out(hidden_states)

@@ -373,7 +407,7 @@ def custom_forward(*inputs):
        hidden_states = self.proj_out(hidden_states)
        hidden_states = hidden_states.squeeze(1)

-        # unpatchify
+        # 5. Unpatchify
        hidden_states = hidden_states.reshape(
            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p, p, -1
        )
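
The patchify/unpatchify bookkeeping introduced above can be sanity-checked with a short, hypothetical shape sketch (not part of this commit). It only uses the defaults documented in the `AllegroTransformer3DModel` docstring; the `inner_dim` relation and the `post_patch_*` divisions mirror the code in the diff.

# Hypothetical shape sketch using the documented defaults (illustration only, not in the commit)
patch_size, patch_size_t = 2, 1
num_attention_heads, attention_head_dim = 24, 96
sample_frames, sample_height, sample_width = 22, 90, 160

inner_dim = num_attention_heads * attention_head_dim    # 2304, matches the cross_attention_dim default
post_patch_num_frames = sample_frames // patch_size_t   # 22
post_patch_height = sample_height // patch_size         # 45
post_patch_width = sample_width // patch_size           # 80

# Sequence length each AllegroTransformerBlock sees after patch embedding
num_tokens = post_patch_num_frames * post_patch_height * post_patch_width
print(num_tokens, inner_dim)  # 79200 2304

# proj_out emits patch_size * patch_size * out_channels features per token, and the
# final unpatchify reshape folds those back into video-shaped latents.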