 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import FeedForward
-from ..attention_processor import (
-    AllegroAttnProcessor2_0,
-    Attention,
-)
-from ..embeddings import PixArtAlphaTextProjection
+from ..attention_processor import AllegroAttnProcessor2_0, Attention
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import AllegroAdaLayerNormSingle
 logger = logging.get_logger(__name__)


-class PatchEmbed2D(nn.Module):
-    """2D Image to Patch Embedding"""
-
-    def __init__(
-        self,
-        num_frames=1,
-        height=224,
-        width=224,
-        patch_size_t=1,
-        patch_size=16,
-        in_channels=3,
-        embed_dim=768,
-        layer_norm=False,
-        flatten=True,
-        bias=True,
-        use_abs_pos=False,
-    ):
-        super().__init__()
-        self.use_abs_pos = use_abs_pos
-        self.flatten = flatten
-        self.layer_norm = layer_norm
-
-        self.proj = nn.Conv2d(
-            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=bias
-        )
-        if layer_norm:
-            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
-        else:
-            self.norm = None
-
-        self.patch_size_t = patch_size_t
-        self.patch_size = patch_size
-
-    def forward(self, latent):
-        b, _, _, _, _ = latent.shape
-        video_latent = None
-
-        latent = rearrange(latent, "b c t h w -> (b t) c h w")
-
-        latent = self.proj(latent)
-        if self.flatten:
-            latent = latent.flatten(2).transpose(1, 2)  # BT C H W -> BT N C
-        if self.layer_norm:
-            latent = self.norm(latent)
-
-        latent = rearrange(latent, "(b t) n c -> b (t n) c", b=b)
-        video_latent = latent
-
-        return video_latent
-
-
 @maybe_allow_in_graph
 class AllegroTransformerBlock(nn.Module):
     r"""
@@ -280,13 +225,13 @@ def __init__(
         interpolation_scale_w = interpolation_scale_w if interpolation_scale_w is not None else sample_width / 40

         # 1. Patch embedding
-        self.pos_embed = PatchEmbed2D(
+        self.pos_embed = PatchEmbed(
             height=sample_height,
             width=sample_width,
             patch_size=patch_size,
             in_channels=in_channels,
             embed_dim=self.inner_dim,
-            # pos_embed_type=None,
+            pos_embed_type=None,
         )

         # 2. Transformer blocks
@@ -327,8 +272,8 @@ def _set_gradient_checkpointing(self, module, value=False):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
@@ -368,13 +313,9 @@ def forward(
             )

         # convert encoder_attention_mask to a bias the same way we do for attention_mask
-        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 3:
-            # b, 1+use_image_num, l -> a video with images
-            # b, 1, l -> only images
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
             encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
-            encoder_attention_mask = (
-                rearrange(encoder_attention_mask, "b 1 l -> (b 1) 1 l") if encoder_attention_mask.numel() > 0 else None
-            )
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

         # 1. Input
         post_patch_num_frames = num_frames // self.config.patch_size_temporal
@@ -385,9 +326,9 @@ def forward(
             timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
         )

-        hidden_states = self.pos_embed(
-            hidden_states
-        )  # TODO(aryan): remove dtype conversion here and move to pipeline if needed
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).flatten(0, 1)
+        hidden_states = self.pos_embed(hidden_states)
+        hidden_states = hidden_states.unflatten(0, (batch_size, -1)).flatten(1, 2)

         encoder_hidden_states = self.caption_projection(encoder_hidden_states)
         encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, encoder_hidden_states.shape[-1])
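Note on the reshapes in the last hunk: the removed einops rearranges and the plain-torch permute/flatten/unflatten calls that replace them are intended to produce identical layouts, with the shared PatchEmbed (constructed with pos_embed_type=None, so it only does the convolutional projection and flattening) applied per frame in between. The snippet below is a minimal, illustrative sketch with made-up tensor sizes that checks the two shape paths agree; it is not part of the diff.

import torch

# Hypothetical sizes for illustration only (not taken from the model config).
b, c, t, h, w = 2, 4, 3, 8, 8
latent = torch.randn(b, c, t, h, w)

# Old path: rearrange(latent, "b c t h w -> (b t) c h w")
# New path: move the frame axis next to batch, then merge the two dims.
merged = latent.permute(0, 2, 1, 3, 4).flatten(0, 1)  # (b*t, c, h, w)
assert merged.shape == (b * t, c, h, w)

# After patch embedding each frame is a token sequence of shape (b*t, n, d).
n, d = 16, 32
tokens = torch.randn(b * t, n, d)

# Old path: rearrange(tokens, "(b t) n c -> b (t n) c", b=b)
# New path: split batch/frame back out, then fold frames into the token axis.
joined = tokens.unflatten(0, (b, -1)).flatten(1, 2)  # (b, t*n, d)
assert joined.shape == (b, t * n, d)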