@@ -3507,6 +3507,109 @@ def __call__(
         return hidden_states
 
 
+class EasyAnimateAttnProcessor2_0:
+    r"""
+    Attention processor used in EasyAnimate. It computes scaled dot-product attention over the concatenated text and
+    image token sequences. When a second `Attention` module is passed as `attn2`, the text tokens use its separate
+    QKV and output projections; otherwise both modalities share the projections of `attn`.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "EasyAnimateAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        attn2: Optional[Attention] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # returns (hidden_states, encoder_hidden_states); assumes `Tuple` is imported
+        text_seq_length = encoder_hidden_states.size(1)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
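+        # `F.scaled_dot_product_attention` expects a mask broadcastable to
+        # (batch, heads, query_len, key_len), so the prepared mask is reshaped below.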
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
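+        # Shared-projection path: without a separate text attention module, prepend the text
+        # tokens and run a single set of QKV projections over the joint sequence.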
+        if attn2 is None:
+            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
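+        # Separate-projection path: the text tokens get their own QKV projections from `attn2`,
+        # then text and image are concatenated along the sequence dimension (dim=2 after the
+        # transpose above) so attention is still computed jointly over both streams.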
+        if attn2 is not None:
+            query_txt = attn2.to_q(encoder_hidden_states)
+            key_txt = attn2.to_k(encoder_hidden_states)
+            value_txt = attn2.to_v(encoder_hidden_states)
+
+            inner_dim = key_txt.shape[-1]
+            head_dim = inner_dim // attn.heads
+
+            query_txt = query_txt.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key_txt = key_txt.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value_txt = value_txt.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+            if attn2.norm_q is not None:
+                query_txt = attn2.norm_q(query_txt)
+            if attn2.norm_k is not None:
+                key_txt = attn2.norm_k(key_txt)
+
+            query = torch.cat([query_txt, query], dim=2)
+            key = torch.cat([key_txt, key], dim=2)
+            value = torch.cat([value_txt, value], dim=2)
+
+        # Apply RoPE to the image tokens only; the text tokens carry no spatial position.
+        if image_rotary_emb is not None:
+            from .embeddings import apply_rotary_emb
+
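+            # For reference, `apply_rotary_emb` rotates each channel pair by a position-dependent
+            # angle, roughly x_out = x * cos + rotate_half(x) * sin, where `image_rotary_emb`
+            # supplies the cos/sin tables (a sketch of the helper's behavior, not its exact code).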
+            query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+            if not attn.is_cross_attention:
+                key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+        if attn2 is None:
+            # Shared projections: project the joint sequence first, then split it back into
+            # text and image streams.
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+            encoder_hidden_states, hidden_states = hidden_states.split(
+                [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+            )
+        else:
+            # Separate projections: split first so each stream goes through its own output layer.
+            encoder_hidden_states, hidden_states = hidden_states.split(
+                [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+            )
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            encoder_hidden_states = attn2.to_out[0](encoder_hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+            encoder_hidden_states = attn2.to_out[1](encoder_hidden_states)
+        return hidden_states, encoder_hidden_states
+
+
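+# A minimal usage sketch (module names and sizes below are illustrative, not taken from the
+# EasyAnimate model code): the processor is attached to the image-stream `Attention` module,
+# and the text-stream module is passed through as `attn2` so both streams attend jointly
+# while keeping separate projections.
+#
+#     attn = Attention(query_dim=1536, heads=12, dim_head=128, bias=True,
+#                      qk_norm="layer_norm", processor=EasyAnimateAttnProcessor2_0())
+#     attn2 = Attention(query_dim=1536, heads=12, dim_head=128, bias=True, qk_norm="layer_norm")
+#     hidden_states, encoder_hidden_states = attn(
+#         hidden_states,                      # image tokens, (batch, image_seq_len, dim)
+#         encoder_hidden_states,              # text tokens, (batch, text_seq_len, dim)
+#         image_rotary_emb=image_rotary_emb,  # cos/sin tables for the image positions
+#         attn2=attn2,
+#     )
+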
 class StableAudioAttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is