Commit 5567438

Added support for single IPAdapter on SD3.5 pipeline
1 parent 0af910b

4 files changed: 583 additions, 13 deletions

src/diffusers/models/attention.py
Lines changed: 3 additions & 3 deletions

@@ -189,7 +189,7 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):

     def forward(
         self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor,
-        joint_attention_kwargs=None,
+        joint_attention_kwargs: Dict[str, Any] = {}
     ):
         if self.use_dual_attention:
             norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(

@@ -208,15 +208,15 @@ def forward(
         # Attention.
         attn_output, context_attn_output = self.attn(
             hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states,
-            **({} if joint_attention_kwargs is None else joint_attention_kwargs),
+            **joint_attention_kwargs
         )

         # Process attention outputs for the `hidden_states`.
         attn_output = gate_msa.unsqueeze(1) * attn_output
         hidden_states = hidden_states + attn_output

         if self.use_dual_attention:
-            attn_output2 = self.attn2(hidden_states=norm_hidden_states2, **({} if joint_attention_kwargs is None else joint_attention_kwargs),)
+            attn_output2 = self.attn2(hidden_states=norm_hidden_states2, **joint_attention_kwargs)
             attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
             hidden_states = hidden_states + attn_output2

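Note on the signature change: `joint_attention_kwargs` is now treated as a plain dict and unpacked straight into the attention call, so the SD3.5 pipeline can thread IP-Adapter inputs through `JointTransformerBlock` without the block inspecting them. A minimal sketch of the resulting contract (the stub and all shapes are hypothetical; the pipeline code that actually builds this dict belongs to the part of the commit not shown in this excerpt):

import torch

# Hypothetical IP-Adapter inputs; in the pipeline they would come from an image
# projection model and the timestep embedding (see the processor added below).
joint_attention_kwargs = {
    "ip_hidden_states": torch.randn(1, 64, 2432),
    "temb": torch.randn(1, 1280),
}

def attn_stub(hidden_states, encoder_hidden_states=None, ip_hidden_states=None, temb=None):
    # Stands in for self.attn(...); the extra kwargs are what reach the attention processor.
    return hidden_states, encoder_hidden_states

attn_output, context_attn_output = attn_stub(
    hidden_states=torch.randn(1, 4096, 2432),
    encoder_hidden_states=torch.randn(1, 333, 2432),
    **joint_attention_kwargs,
)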
src/diffusers/models/attention_processor.py
Lines changed: 140 additions & 0 deletions

@@ -18,6 +18,7 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
+from einops import rearrange

 from ..image_processor import IPAdapterMaskProcessor
 from ..utils import deprecate, logging

@@ -4800,6 +4801,144 @@ def __call__(
         hidden_states = hidden_states / attn.rescale_output_factor

         return hidden_states
+
+
+class IPAdapterJointAttnProcessor2_0(torch.nn.Module):
+    """Attention processor for IP-Adapter used typically in processing the SD3-like self-attention projections."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        ip_hidden_states_dim: int,
+        head_dim: int,
+        timesteps_emb_dim: int = 1280,
+        scale: float = 0.5
+    ):
+        super().__init__()
+
+        # To prevent circular import
+        from .normalization import RMSNorm, AdaLayerNorm
+
+        self.norm_ip = AdaLayerNorm(timesteps_emb_dim, output_dim=ip_hidden_states_dim * 2,
+                                    norm_eps=1e-6, chunk_dim=1)
+        self.to_k_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False)
+        self.norm_q = RMSNorm(head_dim, 1e-6)
+        self.norm_k = RMSNorm(head_dim, 1e-6)
+        self.norm_ip_k = RMSNorm(head_dim, 1e-6)
+        self.scale = scale
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        ip_hidden_states: torch.FloatTensor = None,
+        temb: torch.FloatTensor = None
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        batch_size = hidden_states.shape[0]
+
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+        img_query = query
+        img_key = key
+        img_value = value
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # `context` projections.
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)

+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+            query = torch.cat([query, encoder_hidden_states_query_proj], dim=2)
+            key = torch.cat([key, encoder_hidden_states_key_proj], dim=2)
+            value = torch.cat([value, encoder_hidden_states_value_proj], dim=2)
+
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if encoder_hidden_states is not None:
+            # Split the attention outputs.
+            hidden_states, encoder_hidden_states = (
+                hidden_states[:, : residual.shape[1]],
+                hidden_states[:, residual.shape[1] :],
+            )
+            if not attn.context_pre_only:
+                encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        # IP Adapter
+        if self.scale != 0 and ip_hidden_states is not None:
+            # Norm image features
+            norm_ip_hidden_states = self.norm_ip(ip_hidden_states, temb=temb)
+
+            # To k and v
+            ip_key = self.to_k_ip(norm_ip_hidden_states)
+            ip_value = self.to_v_ip(norm_ip_hidden_states)
+
+            # Reshape
+            img_query = rearrange(img_query, 'b l (h d) -> b h l d', h=attn.heads)
+            img_key = rearrange(img_key, 'b l (h d) -> b h l d', h=attn.heads)
+            img_value = rearrange(img_value, 'b l (h d) -> b h l d', h=attn.heads)
+            ip_key = rearrange(ip_key, 'b l (h d) -> b h l d', h=attn.heads)
+            ip_value = rearrange(ip_value, 'b l (h d) -> b h l d', h=attn.heads)
+
+            # Norm
+            img_query = self.norm_q(img_query)
+            img_key = self.norm_k(img_key)
+            ip_key = self.norm_ip_k(ip_key)
+
+            # cat img
+            img_key = torch.cat([img_key, ip_key], dim=2)
+            img_value = torch.cat([img_value, ip_value], dim=2)
+
+            ip_hidden_states = F.scaled_dot_product_attention(img_query, img_key, img_value, dropout_p=0.0, is_causal=False)
+            ip_hidden_states = rearrange(ip_hidden_states, 'b h l d -> b l (h d)')
+            ip_hidden_states = ip_hidden_states.to(img_query.dtype)
+
+            hidden_states = hidden_states + ip_hidden_states * self.scale
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if encoder_hidden_states is not None:
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states


 class PAGIdentitySelfAttnProcessor2_0:

@@ -5089,6 +5228,7 @@ def __init__(self):
     IPAdapterAttnProcessor,
     IPAdapterAttnProcessor2_0,
     IPAdapterXFormersAttnProcessor,
+    IPAdapterJointAttnProcessor2_0,
     PAGIdentitySelfAttnProcessor2_0,
     PAGCFGIdentitySelfAttnProcessor2_0,
     LoRAAttnProcessor,

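The heart of `IPAdapterJointAttnProcessor2_0.__call__` is a decoupled attention branch: the latent-image queries attend over the concatenation of the regular image keys/values and an extra key/value pair projected from the timestep-normalized IP-Adapter tokens, and the result is blended into the joint-attention output with `scale` (0.5 by default; 0 disables image conditioning). A standalone, torch-only sketch of that branch; the head count and sequence lengths are illustrative assumptions, not the diffusers API:

import torch
import torch.nn.functional as F

b, heads, head_dim = 2, 38, 64       # 38 heads of dim 64 mimics SD3.5-Large's attention width (assumed)
img_len, ip_len = 4096, 64           # latent tokens, resampled IP-Adapter tokens (assumed)

img_query = torch.randn(b, heads, img_len, head_dim)   # stands in for rearranged to_q(hidden_states)
img_key = torch.randn(b, heads, img_len, head_dim)     # stands in for rearranged to_k(hidden_states)
img_value = torch.randn(b, heads, img_len, head_dim)   # stands in for rearranged to_v(hidden_states)
ip_key = torch.randn(b, heads, ip_len, head_dim)       # stands in for rearranged to_k_ip(norm_ip(ip_hidden_states, temb))
ip_value = torch.randn(b, heads, ip_len, head_dim)     # stands in for rearranged to_v_ip(norm_ip(ip_hidden_states, temb))

# Image queries attend jointly over the image keys and the IP-Adapter keys.
key = torch.cat([img_key, ip_key], dim=2)
value = torch.cat([img_value, ip_value], dim=2)
ip_out = F.scaled_dot_product_attention(img_query, key, value, dropout_p=0.0, is_causal=False)
ip_out = ip_out.transpose(1, 2).reshape(b, img_len, heads * head_dim)

scale = 0.5
# hidden_states = hidden_states + scale * ip_out   # as in the processor above

Presumably the pipeline/loader file of this commit (not shown in this excerpt) installs the processor on each joint attention layer, e.g. via the transformer's set_attn_processor hook.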
src/diffusers/models/embeddings.py
Lines changed: 148 additions & 0 deletions

@@ -1999,6 +1999,154 @@ def forward(self, id_embeds: torch.Tensor) -> torch.Tensor:
         return out


+# Modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+class TimePerceiverAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim: int,
+        dim_head: int = 64,
+        heads: int = 8,
+    ) -> None:
+        super().__init__()
+
+        self.scale = dim_head ** -0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents, shift=None, scale=None):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        def reshape_tensor(x, heads):
+            bs, length, _ = x.shape
+            # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+            x = x.view(bs, length, heads, -1)
+            # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+            x = x.transpose(1, 2)
+            # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+            return x.reshape(bs, heads, length, -1)
+
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        if shift is not None and scale is not None:
+            latents = latents * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+# Modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+class TimePerceiverResampler(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int = 1152,
+        output_dim: int = 2432,
+        hidden_dim: int = 1280,
+        depth: int = 4,
+        dim_head: int = 64,
+        heads: int = 20,
+        num_queries: int = 64,
+        ffn_ratio: int = 4,
+        timestep_in_dim: int = 320,
+        timestep_flip_sin_to_cos: bool = True,
+        timestep_freq_shift: int = 0,
+    ) -> None:
+        super().__init__()
+
+        self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dim) / hidden_dim ** 0.5)
+        self.proj_in = nn.Linear(embed_dim, hidden_dim)
+        self.proj_out = nn.Linear(hidden_dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        ff_inner_dim = int(hidden_dim * ffn_ratio)
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        # msa
+                        TimePerceiverAttention(dim=hidden_dim, dim_head=dim_head, heads=heads),
+                        # ff
+                        nn.Sequential(
+                            nn.LayerNorm(hidden_dim),
+                            nn.Linear(hidden_dim, ff_inner_dim, bias=False),
+                            nn.GELU(),
+                            nn.Linear(ff_inner_dim, hidden_dim, bias=False),
+                        ),
+                        # adaLN
+                        nn.Sequential(
+                            nn.SiLU(),
+                            nn.Linear(hidden_dim, ff_inner_dim, bias=True)
+                        )
+                    ]
+                )
+            )
+
+        # Time
+        self.time_proj = Timesteps(timestep_in_dim, timestep_flip_sin_to_cos, timestep_freq_shift)
+        self.time_embedding = TimestepEmbedding(timestep_in_dim, hidden_dim, act_fn="silu")
+
+    def forward(self, x, timestep, need_temb=False):
+        timestep_emb = self.time_proj(timestep).to(dtype=x.dtype)
+        timestep_emb = self.time_embedding(timestep_emb, None)
+
+        latents = self.latents.repeat(x.size(0), 1, 1)
+
+        x = self.proj_in(x)
+        x = x + timestep_emb[:, None]
+
+        for attn, ff, adaLN_modulation in self.layers:
+            shift_msa, scale_msa, shift_mlp, scale_mlp = adaLN_modulation(timestep_emb).chunk(4, dim=1)
+            latents = attn(x, latents, shift_msa, scale_msa) + latents
+
+            res = latents
+            for idx_ff in range(len(ff)):
+                layer_ff = ff[idx_ff]
+                latents = layer_ff(latents)
+                if idx_ff == 0 and isinstance(layer_ff, nn.LayerNorm):  # adaLN
+                    latents = latents * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
+            latents = latents + res
+
+        latents = self.proj_out(latents)
+        latents = self.norm_out(latents)
+
+        if need_temb:
+            return latents, timestep_emb
+        else:
+            return latents
+
+
 class MultiIPAdapterImageProjection(nn.Module):
     def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[nn.Module]]):
         super().__init__()

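TimePerceiverResampler is a timestep-conditioned perceiver resampler: learned query latents cross-attend to the projected image features, with each block's adaLN shift/scale derived from the timestep embedding, and that same embedding can be returned for the attention processor's norm_ip. A rough usage sketch against this commit's defaults; the 729-token, 1152-dim input is an assumption (it only has to match embed_dim), and the import works only with this branch of diffusers installed:

import torch
from diffusers.models.embeddings import TimePerceiverResampler  # added by this commit

resampler = TimePerceiverResampler()          # embed_dim=1152, output_dim=2432, num_queries=64
image_embeds = torch.randn(2, 729, 1152)      # assumed patch features from a 1152-dim image encoder
timestep = torch.tensor([999, 999])

ip_hidden_states, temb = resampler(image_embeds, timestep, need_temb=True)
print(ip_hidden_states.shape)   # torch.Size([2, 64, 2432]) -> ip_hidden_states for IPAdapterJointAttnProcessor2_0
print(temb.shape)               # torch.Size([2, 1280])     -> temb for the processor's AdaLayerNorm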