
Commit 4ba374a

Refactor of image_proj (testing)
1 parent a87895e commit 4ba374a

File tree

src/diffusers/models/attention_processor.py
src/diffusers/models/embeddings.py
src/diffusers/models/transformers/transformer_sd3.py

3 files changed (+70, -110 lines)

src/diffusers/models/attention_processor.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -3927,9 +3927,8 @@ def __call__(
         key = attn.norm_k(key)

         # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
         hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False, scale=attn.scale
         )

         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
```
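This change drops the old TODO and forwards the processor's `attn.scale` straight to `F.scaled_dot_product_attention`, whose `scale` keyword exists from PyTorch 2.1 onward, presumably so a non-default scale set on the module reaches the fused kernel instead of being replaced by the implicit `1 / sqrt(head_dim)`. A minimal sketch of what that keyword does; the shapes and the `custom_scale` value are illustrative, not taken from the diff:

```python
import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, num_heads, seq_len, head_dim)
query = torch.randn(2, 8, 16, 64)
key = torch.randn(2, 8, 16, 64)
value = torch.randn(2, 8, 16, 64)

custom_scale = 1 / 64**0.25  # stand-in for a custom attn.scale on the Attention module

# PyTorch >= 2.1: the softmax scale can be passed directly
out_sdpa = F.scaled_dot_product_attention(query, key, value, scale=custom_scale)

# Reference formulation it replaces: softmax(Q @ K^T * scale) @ V
weight = torch.softmax(query @ key.transpose(-2, -1) * custom_scale, dim=-1)
out_ref = weight @ value

assert torch.allclose(out_sdpa, out_ref, atol=1e-5)
```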

src/diffusers/models/embeddings.py

Lines changed: 44 additions & 98 deletions
```diff
@@ -21,7 +21,7 @@

 from ..utils import deprecate
 from .activations import FP32SiLU, get_activation
-from .attention_processor import Attention
+from .attention_processor import Attention, FusedAttnProcessor2_0


 def get_timestep_embedding(
```
```diff
@@ -2104,76 +2104,55 @@ def forward(self, id_embeds: torch.Tensor) -> torch.Tensor:
         return out


-# Modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
-class TimePerceiverAttention(nn.Module):
+class IPAdapterTimeImageProjectionBlock(nn.Module):
     def __init__(
         self,
-        *,
-        dim: int,
+        hidden_dim: int = 768,
         dim_head: int = 64,
-        heads: int = 8,
+        heads: int = 16,
+        ffn_ratio: float = 4,
     ) -> None:
         super().__init__()
+        from .attention import FeedForward

-        self.scale = dim_head**-0.5
-        self.dim_head = dim_head
-        self.heads = heads
-        inner_dim = dim_head * heads
-
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
-        self.to_out = nn.Linear(inner_dim, dim, bias=False)
-
-    def forward(self, x, latents, shift=None, scale=None):
-        """
-        Args:
-            x (torch.Tensor): image features
-                shape (b, n1, D)
-            latent (torch.Tensor): latent features
-                shape (b, n2, D)
-        """
-
-        def reshape_tensor(x, heads):
-            bs, length, _ = x.shape
-            # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
-            x = x.view(bs, length, heads, -1)
-            # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
-            x = x.transpose(1, 2)
-            # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
-            return x.reshape(bs, heads, length, -1)
-
-        x = self.norm1(x)
-        latents = self.norm2(latents)
-
-        if shift is not None and scale is not None:
-            latents = latents * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-
-        b, l, _ = latents.shape
+        self.ln0 = nn.LayerNorm(hidden_dim)
+        self.ln1 = nn.LayerNorm(hidden_dim)
+        self.attn = Attention(
+            query_dim=hidden_dim,
+            cross_attention_dim=hidden_dim,
+            dim_head=dim_head,
+            heads=heads,
+            bias=False,
+            out_bias=False,
+            processor=FusedAttnProcessor2_0(),
+        )
+        self.ff = FeedForward(hidden_dim, hidden_dim, activation_fn="gelu", mult=ffn_ratio, bias=False)

-        q = self.to_q(latents)
-        kv_input = torch.cat((x, latents), dim=-2)
-        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+        # AdaLayerNorm
+        self.adaln_silu = nn.SiLU()
+        self.adaln_proj = nn.Linear(hidden_dim, 4 * hidden_dim)
+        self.adaln_norm = nn.LayerNorm(hidden_dim)

-        q = reshape_tensor(q, self.heads)
-        k = reshape_tensor(k, self.heads)
-        v = reshape_tensor(v, self.heads)
+        # Custom scale cannot be passed in constructor
+        self.attn.scale = 1 / math.sqrt(math.sqrt(dim_head))
+        self.attn.fuse_projections()
+        self.attn.to_k = None
+        self.attn.to_v = None

-        # attention
-        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
-        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
-        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
-        out = weight @ v
+    def forward(self, x, latents, timestep_emb):
+        shift_msa, scale_msa, shift_mlp, scale_mlp = self.adaln_proj(self.adaln_silu(timestep_emb)).chunk(4, dim=1)

-        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+        x = self.ln0(x)
+        latents = self.ln1(latents) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        latents = self.attn(x, latents) + latents

-        return self.to_out(out)
+        residual = latents
+        latents = self.adaln_norm(latents) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        return self.ff(latents) + residual


 # Modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
-class TimePerceiverResampler(nn.Module):
+class IPAdapterTimeImageProjection(nn.Module):
     def __init__(
         self,
         embed_dim: int = 1152,
```
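The hand-rolled `TimePerceiverAttention` is folded into `IPAdapterTimeImageProjectionBlock`, which keeps the `1 / sqrt(sqrt(dim_head))` softmax scale but routes attention through diffusers' `Attention` with fused `to_kv` projections, and applies the timestep conditioning as an AdaLayerNorm-style shift/scale on both the attention and feed-forward branches. A standalone sketch of that modulation pattern only, with illustrative sizes (it is not the block itself):

```python
import torch
import torch.nn as nn

hidden_dim, batch, num_latents = 768, 2, 64

# Timestep embedding -> four modulation tensors: shift/scale for the attention
# branch (msa) and for the feed-forward branch (mlp)
adaln_silu = nn.SiLU()
adaln_proj = nn.Linear(hidden_dim, 4 * hidden_dim)
norm = nn.LayerNorm(hidden_dim)

timestep_emb = torch.randn(batch, hidden_dim)
latents = torch.randn(batch, num_latents, hidden_dim)

shift_msa, scale_msa, shift_mlp, scale_mlp = adaln_proj(adaln_silu(timestep_emb)).chunk(4, dim=1)

# AdaLayerNorm: normalize, then modulate per sample; [:, None] broadcasts the
# (batch, hidden_dim) modulation over the latent tokens
modulated = norm(latents) * (1 + scale_msa[:, None]) + shift_msa[:, None]
print(modulated.shape)  # torch.Size([2, 64, 768])
```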
```diff
@@ -2189,65 +2168,32 @@ def __init__(
         timestep_freq_shift: int = 0,
     ) -> None:
         super().__init__()
-
         self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dim) / hidden_dim**0.5)
         self.proj_in = nn.Linear(embed_dim, hidden_dim)
         self.proj_out = nn.Linear(hidden_dim, output_dim)
         self.norm_out = nn.LayerNorm(output_dim)
-
-        ff_inner_dim = int(hidden_dim * ffn_ratio)
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(
-                nn.ModuleList(
-                    [
-                        # msa
-                        TimePerceiverAttention(dim=hidden_dim, dim_head=dim_head, heads=heads),
-                        # ff
-                        nn.Sequential(
-                            nn.LayerNorm(hidden_dim),
-                            nn.Linear(hidden_dim, ff_inner_dim, bias=False),
-                            nn.GELU(),
-                            nn.Linear(ff_inner_dim, hidden_dim, bias=False),
-                        ),
-                        # adaLN
-                        nn.Sequential(nn.SiLU(), nn.Linear(hidden_dim, ff_inner_dim, bias=True)),
-                    ]
-                )
-            )
-
-        # Time
+        self.layers = nn.ModuleList(
+            [IPAdapterTimeImageProjectionBlock(hidden_dim, dim_head, heads, ffn_ratio) for _ in range(depth)]
+        )
         self.time_proj = Timesteps(timestep_in_dim, timestep_flip_sin_to_cos, timestep_freq_shift)
         self.time_embedding = TimestepEmbedding(timestep_in_dim, hidden_dim, act_fn="silu")

-    def forward(self, x, timestep, need_temb=False):
+    def forward(self, x, timestep):
         timestep_emb = self.time_proj(timestep).to(dtype=x.dtype)
-        timestep_emb = self.time_embedding(timestep_emb, None)
+        timestep_emb = self.time_embedding(timestep_emb)

         latents = self.latents.repeat(x.size(0), 1, 1)

         x = self.proj_in(x)
         x = x + timestep_emb[:, None]

-        for attn, ff, adaLN_modulation in self.layers:
-            shift_msa, scale_msa, shift_mlp, scale_mlp = adaLN_modulation(timestep_emb).chunk(4, dim=1)
-            latents = attn(x, latents, shift_msa, scale_msa) + latents
-
-            res = latents
-            for idx_ff in range(len(ff)):
-                layer_ff = ff[idx_ff]
-                latents = layer_ff(latents)
-                if idx_ff == 0 and isinstance(layer_ff, nn.LayerNorm):  # adaLN
-                    latents = latents * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
-            latents = latents + res
+        for block in self.layers:
+            latents = block(x, latents, timestep_emb)

         latents = self.proj_out(latents)
         latents = self.norm_out(latents)

-        if need_temb:
-            return latents, timestep_emb
-        else:
-            return latents
+        return latents, timestep_emb


 class MultiIPAdapterImageProjection(nn.Module):
```
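With the refactor, the per-layer `nn.ModuleList` triples become a list of `IPAdapterTimeImageProjectionBlock`s, and `forward` always returns `(latents, timestep_emb)` now that the `need_temb` flag is gone. A hedged usage sketch, assuming a diffusers build that contains this refactor; the batch size, token count, and timestep values are illustrative:

```python
import torch
from diffusers.models.embeddings import IPAdapterTimeImageProjection

proj = IPAdapterTimeImageProjection(embed_dim=1152)  # other arguments left at their defaults

image_embeds = torch.randn(2, 729, 1152)  # (batch, num_image_tokens, embed_dim), illustrative
timestep = torch.tensor([10, 10])

# The refactored forward unconditionally returns the projected latents and the
# timestep embedding that conditioned the blocks
latents, timestep_emb = proj(image_embeds, timestep)
print(latents.shape, timestep_emb.shape)
```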

src/diffusers/models/transformers/transformer_sd3.py

Lines changed: 25 additions & 10 deletions
```diff
@@ -31,7 +31,7 @@
 from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero
 from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed, TimePerceiverResampler
+from ..embeddings import CombinedTimestepTextProjEmbeddings, IPAdapterTimeImageProjection, PatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput


@@ -363,16 +363,31 @@ def _load_ip_adapter_weights(self, state_dict: Dict, low_cpu_mem_usage: bool):

         self.set_attn_processor(attn_procs)

+        # Convert image_proj state dict to diffusers
+        image_proj_state_dict = {}
+        for key, value in state_dict["image_proj"].items():
+            for idx in range(4):
+                key = key.replace(f"layers.{idx}.0.norm1", f"layers.{idx}.ln0")
+                key = key.replace(f"layers.{idx}.0.norm2", f"layers.{idx}.ln1")
+                key = key.replace(f"layers.{idx}.0.to_q", f"layers.{idx}.attn.to_q")
+                key = key.replace(f"layers.{idx}.0.to_kv", f"layers.{idx}.attn.to_kv")
+                key = key.replace(f"layers.{idx}.0.to_out", f"layers.{idx}.attn.to_out.0")
+                key = key.replace(f"layers.{idx}.1.0", f"layers.{idx}.adaln_norm")
+                key = key.replace(f"layers.{idx}.1.1", f"layers.{idx}.ff.net.0.proj")
+                key = key.replace(f"layers.{idx}.1.3", f"layers.{idx}.ff.net.2")
+                key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
+            image_proj_state_dict[key] = value
+
         # Image projetion parameters
-        embed_dim = state_dict["image_proj"]["proj_in.weight"].shape[1]
-        output_dim = state_dict["image_proj"]["proj_out.weight"].shape[0]
-        hidden_dim = state_dict["image_proj"]["latents"].shape[2]
-        heads = state_dict["image_proj"]["layers.0.0.to_q.weight"].shape[0] // 64
-        num_queries = state_dict["image_proj"]["latents"].shape[1]
-        timestep_in_dim = state_dict["image_proj"]["time_embedding.linear_1.weight"].shape[1]
+        embed_dim = image_proj_state_dict["proj_in.weight"].shape[1]
+        output_dim = image_proj_state_dict["proj_out.weight"].shape[0]
+        hidden_dim = image_proj_state_dict["proj_in.weight"].shape[0]
+        heads = image_proj_state_dict["layers.0.attn.to_q.weight"].shape[0] // 64
+        num_queries = image_proj_state_dict["latents"].shape[1]
+        timestep_in_dim = image_proj_state_dict["time_embedding.linear_1.weight"].shape[1]

         # Image projection
-        self.image_proj = TimePerceiverResampler(
+        self.image_proj = IPAdapterTimeImageProjection(
             embed_dim=embed_dim,
             output_dim=output_dim,
             hidden_dim=hidden_dim,
@@ -382,9 +397,9 @@ def _load_ip_adapter_weights(self, state_dict: Dict, low_cpu_mem_usage: bool):
         ).to(device=self.device, dtype=self.dtype)

         if not low_cpu_mem_usage:
-            self.image_proj.load_state_dict(state_dict["image_proj"], strict=True)
+            self.image_proj.load_state_dict(image_proj_state_dict, strict=True)
         else:
-            load_model_dict_into_meta(self.image_proj, state_dict["image_proj"], device=self.device, dtype=self.dtype)
+            load_model_dict_into_meta(self.image_proj, image_proj_state_dict, device=self.device, dtype=self.dtype)

     def forward(
         self,
```
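The loader now remaps the original checkpoint's `image_proj` keys onto the refactored module layout before calling `load_state_dict`. A small standalone sketch of the same renaming applied to a few sample keys; the sample keys are illustrative, not taken from a real checkpoint:

```python
# Same substitution table as in _load_ip_adapter_weights, applied to made-up keys
sample_keys = [
    "layers.0.0.norm1.weight",  # old block 0 attention norm
    "layers.0.0.to_q.weight",   # old block 0 query projection
    "layers.0.1.1.weight",      # old block 0 first feed-forward linear
    "layers.2.2.1.bias",        # old block 2 adaLN linear
    "proj_in.weight",           # untouched
]

renamed = []
for key in sample_keys:
    for idx in range(4):
        key = key.replace(f"layers.{idx}.0.norm1", f"layers.{idx}.ln0")
        key = key.replace(f"layers.{idx}.0.norm2", f"layers.{idx}.ln1")
        key = key.replace(f"layers.{idx}.0.to_q", f"layers.{idx}.attn.to_q")
        key = key.replace(f"layers.{idx}.0.to_kv", f"layers.{idx}.attn.to_kv")
        key = key.replace(f"layers.{idx}.0.to_out", f"layers.{idx}.attn.to_out.0")
        key = key.replace(f"layers.{idx}.1.0", f"layers.{idx}.adaln_norm")
        key = key.replace(f"layers.{idx}.1.1", f"layers.{idx}.ff.net.0.proj")
        key = key.replace(f"layers.{idx}.1.3", f"layers.{idx}.ff.net.2")
        key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
    renamed.append(key)

print(renamed)
# -> ['layers.0.ln0.weight', 'layers.0.attn.to_q.weight',
#     'layers.0.ff.net.0.proj.weight', 'layers.2.adaln_proj.bias', 'proj_in.weight']
```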
