@@ -124,6 +124,7 @@ def __init__(
         context_pre_only=None,
         pre_only=False,
         elementwise_affine: bool = True,
+        is_causal: bool = False,
     ):
         super().__init__()

@@ -146,6 +147,7 @@ def __init__(
         self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim
         self.context_pre_only = context_pre_only
         self.pre_only = pre_only
+        self.is_causal = is_causal

         # we make use of this private variable to know whether this class is loaded
         # with an deprecated state dict so that we can convert it on the fly
@@ -195,8 +197,8 @@ def __init__(
             self.norm_q = RMSNorm(dim_head, eps=eps)
             self.norm_k = RMSNorm(dim_head, eps=eps)
         elif qk_norm == "l2":
-            self.norm_q = LpNorm(p=2, eps=eps)
-            self.norm_k = LpNorm(p=2, eps=eps)
+            self.norm_q = LpNorm(p=2, dim=-1)
+            self.norm_k = LpNorm(p=2, dim=-1)
         else:
             raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None,'layer_norm','fp32_layer_norm','rms_norm'")

@@ -2720,6 +2722,91 @@ def __call__(
         return hidden_states


+class MochiVaeAttnProcessor2_0:
+    r"""
+    Attention processor used in Mochi VAE.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("MochiVaeAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        is_single_frame = hidden_states.shape[1] == 1
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if is_single_frame:
+            hidden_states = attn.to_v(hidden_states)
+
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+            if attn.residual_connection:
+                hidden_states = hidden_states + residual
+
+            hidden_states = hidden_states / attn.rescale_output_factor
+            return hidden_states
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=attn.is_causal
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
 class StableAudioAttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
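Usage sketch (not part of the commit above): a minimal example of wiring the new processor and the is_causal flag into an Attention module. It assumes MochiVaeAttnProcessor2_0 is exported from diffusers.models.attention_processor next to the other processors; the dimensions and the qk_norm choice are illustrative only.

import torch
from diffusers.models.attention_processor import Attention, MochiVaeAttnProcessor2_0

# Illustrative configuration: qk_norm="rms_norm" exercises the norm_q/norm_k path,
# and is_causal=True is forwarded to F.scaled_dot_product_attention by the processor.
attn = Attention(
    query_dim=128,
    heads=4,
    dim_head=32,
    qk_norm="rms_norm",
    is_causal=True,
    processor=MochiVaeAttnProcessor2_0(),
)

hidden_states = torch.randn(2, 16, 128)  # (batch, sequence_length, channels)
out = attn(hidden_states)  # self-attention, no attention_mask
print(out.shape)  # torch.Size([2, 16, 128])

A single-frame input (sequence_length == 1) would instead take the value-projection shortcut at the top of __call__, which is equivalent because softmax attention over a single key always returns that key's value.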