Commit 3bd6d30

[WIP][cogview4]: implement CogView4 attention processor
Add a CogView4AttnProcessor class implementing scaled dot-product attention with rotary embeddings for the CogView4 model. The processor concatenates the encoder and hidden states and applies the QKV projections and RoPE, but does not include spatial normalization.

TODO:
- Fix the incorrect QKV projection weights
- Resolve the ~25% error in the RoPE implementation compared to Megatron
1 parent 310da29 · commit 3bd6d30
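For orientation, below is a minimal, self-contained sketch of the data flow described above: concatenate the text and image tokens, project to query/key/value, apply RoPE to the image tokens only, run scaled dot-product attention, and split the joint sequence back into the two streams. The shapes, the toy_rope placeholder, and the standalone projections are illustrative stand-ins, not the diffusers Attention API.

import torch
import torch.nn.functional as F


def toy_rope(x):
    # Placeholder for the real rotary embedding; the identity keeps the sketch runnable.
    return x


def cogview4_attention_sketch(image_tokens, text_tokens, to_q, to_k, to_v, heads):
    # image_tokens: [batch, image_seq, dim], text_tokens: [batch, text_seq, dim]
    text_seq = text_tokens.size(1)
    x = torch.cat([text_tokens, image_tokens], dim=1)  # joint sequence, text first
    b, s, _ = x.shape

    q, k, v = to_q(x), to_k(x), to_v(x)
    head_dim = q.shape[-1] // heads
    q, k, v = (t.view(b, s, heads, head_dim).transpose(1, 2) for t in (q, k, v))

    # RoPE is applied to the image tokens only; the text tokens are left untouched.
    q[:, :, text_seq:, :] = toy_rope(q[:, :, text_seq:, :])
    k[:, :, text_seq:, :] = toy_rope(k[:, :, text_seq:, :])

    out = F.scaled_dot_product_attention(q, k, v)
    out = out.transpose(1, 2).reshape(b, s, heads * head_dim)

    # Split the joint sequence back into image and text streams.
    return out[:, text_seq:], out[:, :text_seq]


dim, heads = 64, 4
to_q, to_k, to_v = (torch.nn.Linear(dim, dim) for _ in range(3))
with torch.no_grad():
    img_out, txt_out = cogview4_attention_sketch(
        torch.randn(1, 16, dim), torch.randn(1, 4, dim), to_q, to_k, to_v, heads
    )
print(img_out.shape, txt_out.shape)  # torch.Size([1, 16, 64]) torch.Size([1, 4, 64])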

2 files changed: +135 −3 lines changed


src/diffusers/models/attention_processor.py

Lines changed: 101 additions & 3 deletions
@@ -2802,6 +2802,105 @@ def __call__(
         return hidden_states
 
 
+class CogView4AttnProcessor:
+    """
+    Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary embedding on
+    the query and key vectors, but does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CogView4AttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        ###############################################
+        # TODO: compute qkv directly from the fused qkv weight (split out num_heads and head_dim
+        # first), then split q, k, v along the head dimension.
+        linear_qkv_weight = torch.load("/home/lhy/code/cogview/linear_qkv_weight.pt")
+        linear_qkv_bias = torch.load("/home/lhy/code/cogview/linear_qkv_bias.pt")
+
+        qkv = torch.matmul(hidden_states, linear_qkv_weight.T) + linear_qkv_bias
+        qkv = qkv.view(batch_size, -1, attn.heads, head_dim * 3)
+        query, key, value = qkv.chunk(3, dim=-1)
+
+        # TODO: verify that RoPE is applied correctly (currently ~25% error vs. Megatron).
+        ###############################################
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        # Apply RoPE if needed (image tokens only; text tokens are left untouched)
+        if image_rotary_emb is not None:
+            from .embeddings import apply_rotary_emb_megatron
+
+            query[:, :, text_seq_length:, :] = apply_rotary_emb_megatron(
+                query[:, :, text_seq_length:, :], image_rotary_emb
+            )
+            key[:, :, text_seq_length:, :] = apply_rotary_emb_megatron(
+                key[:, :, text_seq_length:, :], image_rotary_emb
+            )
+
+        ##########################################
+        # Debug: override q/k/v with reference tensors dumped after RoPE for comparison.
+        query = torch.load("/home/lhy/code/cogview/query_after_rope.pt")
+        key = torch.load("/home/lhy/code/cogview/key_after_rope.pt")
+        value = torch.load("/home/lhy/code/cogview/value_after_rope.pt")
+        query = query[None, :16 + 4096, ...]
+        key = key[None, :16 + 4096, ...]
+        value = value[None, :16 + 4096, ...]
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+        ##########################################
+
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+        )
+        return hidden_states, encoder_hidden_states
+
+
 class CogVideoXAttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
@@ -2824,9 +2923,7 @@ def __call__(
 
         hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
+        batch_size, sequence_length, _ = hidden_states.shape
 
         if attention_mask is not None:
             attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
@@ -6174,6 +6271,7 @@ def __call__(
                 FusedFluxAttnProcessor2_0,
                 FusedFluxAttnProcessor2_0_NPU,
                 CogVideoXAttnProcessor2_0,
+                CogView4AttnProcessor,
                 FusedCogVideoXAttnProcessor2_0,
                 XFormersAttnAddedKVProcessor,
                 XFormersAttnProcessor,
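The first TODO above concerns the fused linear_qkv weight loaded from the Megatron checkpoint. As a point of reference, here is a small sketch of how such a fused weight could be split back into separate q/k/v projection weights, assuming Megatron's per-head [q | k | v] row packing; the layout assumption and the split_fused_qkv_weight helper are hypothetical, not taken from the checkpoint itself.

import torch


def split_fused_qkv_weight(qkv_weight: torch.Tensor, num_heads: int):
    # Assumes rows are packed per head as [q_head0 | k_head0 | v_head0 | q_head1 | ...],
    # i.e. a fused weight of shape [3 * hidden_dim, hidden_dim].
    three_hidden, hidden = qkv_weight.shape
    head_dim = three_hidden // (3 * num_heads)
    w = qkv_weight.view(num_heads, 3, head_dim, hidden)
    wq, wk, wv = w.unbind(dim=1)  # each [num_heads, head_dim, hidden]
    return wq.reshape(-1, hidden), wk.reshape(-1, hidden), wv.reshape(-1, hidden)


# Toy check with random weights: applying wq is the q-part of the fused projection.
hidden_dim, num_heads = 32, 4
fused = torch.randn(3 * hidden_dim, hidden_dim)
wq, wk, wv = split_fused_qkv_weight(fused, num_heads)
x = torch.randn(2, 5, hidden_dim)
print((x @ wq.T).shape)  # torch.Size([2, 5, 32])

Under that layout assumption, de-interleaving the weights this way should make plain to_q/to_k/to_v projections agree with the per-head view(..., heads, head_dim * 3).chunk(3, dim=-1) used in the processor.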

src/diffusers/models/embeddings.py

Lines changed: 34 additions & 0 deletions
@@ -1283,6 +1283,40 @@ def apply_1d_rope(tokens, pos, cos, sin):
     x = torch.cat([t, h, w], dim=-1)
     return x
 
+def apply_rotary_emb_megatron(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    """Apply rotary position embeddings to the input tensor.
+
+    Args:
+        x: Input tensor of shape [batch_size, n_heads, seq_len, head_dim]
+        freqs: Frequency tensor of shape [seq_len, 1, 1, head_dim // 2]
+
+    Returns:
+        Tensor with rotary position embeddings applied
+    """
+    batch_size, n_heads, seq_len, rot_dim = x.shape
+
+    # Split x into the half that gets rotated and the half that passes through unchanged
+    x_rot, x_pass = x.chunk(2, dim=-1)
+
+    # Compute cos and sin from the frequencies
+    cos, sin = freqs.chunk(2, dim=-1)
+    cos, sin = torch.cos(cos), torch.sin(sin)
+
+    # Rotate x_rot
+    x_rot_cos = x_rot * cos
+    # Create a rotated version of x_rot by shifting rot_dim // 2 positions
+    x_rot_shifted = torch.cat([-x_rot[..., rot_dim // 2:], x_rot[..., :rot_dim // 2]], dim=-1)
+    x_rot_sin = x_rot_shifted * sin
+
+    # Combine the cos and sin terms
+    x_rot = x_rot_cos + x_rot_sin
+
+    # Concatenate back with the pass-through half
+    x_out = torch.cat([x_rot, x_pass], dim=-1)
+
+    return x_out
+
 
 class FluxPosEmbed(nn.Module):
     # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
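For the second TODO (the ~25% mismatch against Megatron), a common reference point is the standard Megatron-style rotate-half formulation, sketched below with toy shapes. This is only one possible convention for the frequency layout, not necessarily the one CogView4's checkpoint uses; apply_rope_reference and rotate_half are illustrative names.

import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # (x1, x2) -> (-x2, x1) over the last dimension
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rope_reference(x: torch.Tensor, angles: torch.Tensor) -> torch.Tensor:
    # x: [batch, heads, seq, head_dim]; angles: [seq, head_dim // 2], one angle per rotary pair
    angles = torch.cat((angles, angles), dim=-1)  # [seq, head_dim]
    cos = angles.cos()[None, None, :, :]          # broadcast over batch and heads
    sin = angles.sin()[None, None, :, :]
    return x * cos + rotate_half(x) * sin


x = torch.randn(1, 2, 8, 16)
angles = torch.randn(8, 8)
print(apply_rope_reference(x, angles).shape)  # torch.Size([1, 2, 8, 16])

Comparing apply_rotary_emb_megatron against a reference like this on the dumped query_after_rope.pt / key_after_rope.pt tensors would help localize whether the mismatch comes from the frequency layout or from the rotation itself.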
