# See the License for the specific language governing permissions and
# limitations under the License.

- from typing import Dict, Optional, Union
+ from typing import Dict, Optional, Tuple, Union

import torch
from torch import nn
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


- # Modified from diffusers.models.autoencoders.autoencoder_dc.GLUMBConv
- @maybe_allow_in_graph
- class SanaGLUMBConv(nn.Module):
-     def __init__(self, in_channels: int, out_channels: int, mlp_ratio: float = 2.5) -> None:
+ class GLUMBConv(nn.Module):
+     def __init__(self, in_channels: int, out_channels: int, expand_ratio: float = 4, norm_type: Optional[str] = None, residual_connection: bool = True) -> None:
        super().__init__()

-         hidden_channels = int(mlp_ratio * in_channels)
+         hidden_channels = int(expand_ratio * in_channels)
+         self.norm_type = norm_type
+         self.residual_connection = residual_connection

        self.nonlinearity = nn.SiLU()

        self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
        self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
        self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)

-     def forward(self, hidden_states: torch.Tensor, HW: Optional[tuple[int]] = None) -> torch.Tensor:
-         B, N, C = hidden_states.shape
-         if HW is None:
-             H = W = int(N**0.5)
-         else:
-             H, W = HW
+         self.norm = None
+         if norm_type == "rms_norm":
+             self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)

-         hidden_states = hidden_states.reshape(B, H, W, C).permute(0, 3, 1, 2)
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         if self.residual_connection:
+             residual = hidden_states

        hidden_states = self.conv_inverted(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)
@@ -66,23 +65,22 @@ def forward(self, hidden_states: torch.Tensor, HW: Optional[tuple[int]] = None)
        hidden_states = hidden_states * self.nonlinearity(gate)

        hidden_states = self.conv_point(hidden_states)
-         hidden_states = hidden_states.reshape(B, C, N).permute(0, 2, 1)
-
+
+         if self.norm_type == "rms_norm":
+             # move channel to the last dimension so we apply RMSnorm across channel dimension
+             hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
+
+         if self.residual_connection:
+             hidden_states = hidden_states + residual
+
        return hidden_states
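For orientation, a minimal usage sketch of the refactored GLUMBConv on the added side of this hunk: it now consumes a channel-first feature map directly instead of a flattened token sequence plus an HW tuple. The sizes below are illustrative assumptions, and the snippet presumes GLUMBConv (and its RMSNorm dependency) can be imported from this module.

import torch

# Hypothetical import; after this change the class is defined in this file.
# from diffusers.models.transformers.sana_transformer import GLUMBConv

block = GLUMBConv(in_channels=32, out_channels=32, expand_ratio=4,
                  norm_type="rms_norm", residual_connection=True)

x = torch.randn(2, 32, 16, 16)  # (batch, channels, height, width)
y = block(x)                    # channel-first layout preserved
assert y.shape == x.shape

With residual_connection=True, in_channels and out_channels must match so the skip addition is well defined; the transformer block below constructs it with residual_connection=False and no norm.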


class SanaTransformerBlock(nn.Module):
    r"""
-     A Transformer block following the Linear Transformer architecture, introduced in Sana
-
-     Reference: https://arxiv.org/abs/2410.10629
-
-     Parameters:
-         dim (`int`): The number of channels in the input and output.
-         num_attention_heads (`int`): The number of heads to use for multi-head attention.
-         attention_head_dim (`int`): The number of channels in each head.
+     Transformer block introduced in [Sana](https://huggingface.co/papers/2410.10629).
    """
-
+
    def __init__(
        self,
        dim: int = 2240,
@@ -127,11 +125,7 @@ def __init__(
        )

        # 3. Feed-forward
-         self.ff = SanaGLUMBConv(
-             in_channels=dim,
-             out_channels=dim,
-             mlp_ratio=mlp_ratio,
-         )
+         self.ff = GLUMBConv(dim, dim, mlp_ratio, norm_type=None, residual_connection=False)

        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

@@ -142,7 +136,8 @@ def forward(
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        timestep: Optional[torch.LongTensor] = None,
-         HW: Optional[tuple[int]] = None,
+         height: int = None,
+         width: int = None,
    ) -> torch.Tensor:
        batch_size = hidden_states.shape[0]

@@ -171,15 +166,17 @@
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

-         ff_output = self.ff(norm_hidden_states, HW=HW)
+         norm_hidden_states = norm_hidden_states.unflatten(1, (height, width)).permute(0, 3, 1, 2)
+         ff_output = self.ff(norm_hidden_states)
+         ff_output = ff_output.flatten(2, 3).permute(0, 2, 1)
        hidden_states = hidden_states + gate_mlp * ff_output

        return hidden_states
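The explicit height/width arguments exist so the block can hop between the attention-friendly token layout and the convolution-friendly spatial layout around the feed-forward. A small sketch of that round trip, with illustrative (assumed) sizes, to show it is lossless:

import torch

batch, height, width, dim = 2, 16, 16, 2240
tokens = torch.randn(batch, height * width, dim)                    # (B, N, C)

spatial = tokens.unflatten(1, (height, width)).permute(0, 3, 1, 2)  # (B, C, H, W) for the conv FF
# ... the GLUMBConv feed-forward would run here ...
tokens_back = spatial.flatten(2, 3).permute(0, 2, 1)                # back to (B, N, C)

assert torch.equal(tokens, tokens_back)  # token ordering is preserved by the reshape pair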


class SanaTransformer2DModel(ModelMixin, ConfigMixin):
    r"""
-     A 2D Transformer model as introduced in [Sana](https://arxiv.org/abs/2410.10629) family of models.
+     A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models.

    Args:
        in_channels (`int`, defaults to `32`):
@@ -204,7 +201,7 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin):
            The expansion ratio to use in the GLUMBConv layer.
        dropout (`float`, defaults to `0.0`):
            The dropout probability.
-         attention_bias (`bool`, defaults to `True`):
+         attention_bias (`bool`, defaults to `False`):
            Whether to use bias in the attention layer.
        sample_size (`int`, defaults to `32`):
            The base size of the input latent.
@@ -233,7 +230,7 @@ def __init__(
        caption_channels: int = 2304,
        mlp_ratio: float = 2.5,
        dropout: float = 0.0,
-         attention_bias: bool = True,
+         attention_bias: bool = False,
        sample_size: int = 32,
        patch_size: int = 1,
        norm_elementwise_affine: bool = False,
@@ -245,7 +242,7 @@
        inner_dim = num_attention_heads * attention_head_dim

        # 1. Patch Embedding
-         self.pos_embed = PatchEmbed(
+         self.patch_embed = PatchEmbed(
            height=sample_size,
            width=sample_size,
            patch_size=patch_size,
@@ -255,7 +252,9 @@ def __init__(
            pos_embed_type=None,
        )

-         # 2. Caption Embedding
+         # 2. Additional condition embeddings
+         self.time_embed = AdaLayerNormSingle(inner_dim)
+
        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
        self.caption_norm = RMSNorm(inner_dim, eps=1e-5)

@@ -285,8 +284,6 @@ def __init__(
        self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)

-         self.adaln_single = AdaLayerNormSingle(inner_dim)
-
        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
@@ -361,7 +358,7 @@ def forward(
        encoder_attention_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
-     ):
+     ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
@@ -387,11 +384,12 @@

        # 1. Input
        batch_size, num_channels, height, width = hidden_states.shape
-         post_patch_height = height // self.config.patch_size
-         post_patch_width = width // self.config.patch_size
-         hidden_states = self.pos_embed(hidden_states)
+         p = self.config.patch_size
+         post_patch_height, post_patch_width = height // p, width // p
+
+         hidden_states = self.patch_embed(hidden_states)

-         timestep, embedded_timestep = self.adaln_single(
+         timestep, embedded_timestep = self.time_embed(
            timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
        )

@@ -418,7 +416,8 @@ def create_block_forward(block):
                    encoder_hidden_states,
                    encoder_attention_mask,
                    timestep,
-                     (post_patch_height, post_patch_width),
+                     post_patch_height,
+                     post_patch_width,
                )

        # 3. Normalization
@@ -436,14 +435,7 @@ def create_block_forward(block):
            batch_size, post_patch_height, post_patch_width, self.config.patch_size, self.config.patch_size, -1
        )
        hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
-         output = hidden_states.reshape(
-             shape=(
-                 batch_size,
-                 -1,
-                 post_patch_height * self.config.patch_size,
-                 post_patch_width * self.config.patch_size,
-             )
-         )
+         output = hidden_states.reshape(batch_size, -1, post_patch_height * p, post_patch_width * p)

        if not return_dict:
            return (output,)
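The collapsed reshape at the end of forward() is the usual unpatchify step: tokens of length p * p * out_channels are folded back into a (B, C, H, W) latent. A worked sketch with small, assumed sizes (Sana's default patch_size is 1; p=2 is used here only to make the fold visible):

import torch

batch, out_channels, p = 2, 32, 2
post_patch_height, post_patch_width = 8, 8
tokens = torch.randn(batch, post_patch_height * post_patch_width, p * p * out_channels)

x = tokens.reshape(batch, post_patch_height, post_patch_width, p, p, -1)    # (B, ph, pw, p, p, C)
x = x.permute(0, 5, 1, 3, 2, 4)                                             # (B, C, ph, p, pw, p)
output = x.reshape(batch, -1, post_patch_height * p, post_patch_width * p)  # (B, C, H, W)

assert output.shape == (batch, out_channels, post_patch_height * p, post_patch_width * p)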