Commit 4f1653c

refactor part 4; modeling tests
1 parent fd18f9a commit 4f1653c

File tree

5 files changed: +81 −46 lines


src/diffusers/models/transformers/transformer_allegro.py

Lines changed: 2 additions & 44 deletions
@@ -239,47 +239,7 @@ class AllegroTransformer3DModel(ModelMixin, ConfigMixin):
         attention_bias (`bool`, *optional*):
             Configure if the `TransformerBlocks` attention should contain a bias parameter.
     """
-
-    # {
-    #   "_class_name": "AllegroTransformer3DModel",
-    #   "_diffusers_version": "0.30.3",
-    #   "_name_or_path": "/cpfs/data/user/larrytsai/Projects/Yi-VG/allegro/transformer",
-    #   "activation_fn": "gelu-approximate",
-    #   "attention_bias": true,
-    #   "attention_head_dim": 96,
-    #   "ca_attention_mode": "xformers",
-    #   "caption_channels": 4096,
-    #   "cross_attention_dim": 2304,
-    #   "double_self_attention": false,
-    #   "downsampler": null,
-    #   "dropout": 0.0,
-    #   "in_channels": 4,
-    #   "interpolation_scale_h": 2.0,
-    #   "interpolation_scale_t": 2.2,
-    #   "interpolation_scale_w": 2.0,
-    #   "model_max_length": 300,
-    #   "norm_elementwise_affine": false,
-    #   "norm_eps": 1e-06,
-    #   "norm_type": "ada_norm_single",
-    #   "num_attention_heads": 24,
-    #   "num_embeds_ada_norm": 1000,
-    #   "num_layers": 32,
-    #   "only_cross_attention": false,
-    #   "out_channels": 4,
-    #   "patch_size": 2,
-    #   "patch_size_t": 1,
-    #   "sa_attention_mode": "flash",
-    #   "sample_size": [
-    #     90,
-    #     160
-    #   ],
-    #   "sample_size_t": 22,
-    #   "upcast_attention": false,
-    #   "use_additional_conditions": null,
-    #   "use_linear_projection": false,
-    #   "use_rope": true
-    # }
-
+
     @register_to_config
     def __init__(
         self,
@@ -304,8 +264,6 @@ def __init__(
         interpolation_scale_h: float = 2.0,
         interpolation_scale_w: float = 2.0,
         interpolation_scale_t: float = 2.2,
-        use_rotary_positional_embeddings: bool = True,
-        model_max_length: int = 300,
     ):
         super().__init__()
 
@@ -369,8 +327,8 @@ def _set_gradient_checkpointing(self, module, value=False):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        timestep: Optional[torch.LongTensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
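
The last hunk moves `timestep` after `encoder_hidden_states` in the `forward` signature. A minimal sketch of why this only affects positional callers; `old_forward` and `new_forward` are throwaway stand-ins written here for illustration, not the model's actual methods:

import torch


# Stand-in signatures mirroring the old and new parameter order from the hunk above.
def old_forward(hidden_states, timestep=None, encoder_hidden_states=None, **_):
    return hidden_states


def new_forward(hidden_states, encoder_hidden_states=None, timestep=None, **_):
    return hidden_states


hidden_states = torch.randn(2, 4, 8, 8, 8)
encoder_hidden_states = torch.randn(2, 16, 8)
timestep = torch.randint(0, 1000, (2,))

# Keyword-argument callers are unaffected by the reorder:
new_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, timestep=timestep)

# A positional caller written against the old order, e.g.
#   model(hidden_states, timestep, encoder_hidden_states)
# would now feed `timestep` into the `encoder_hidden_states` slot, so keywords are the safer call style.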

src/diffusers/pipelines/allegro/pipeline_allegro.py

Lines changed: 0 additions & 2 deletions
@@ -193,7 +193,6 @@ def __init__(
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
@@ -207,7 +206,6 @@ def encode_prompt(
         negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
         clean_caption: bool = False,
         max_sequence_length: int = 300,
-        **kwargs,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
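
Removing `**kwargs` from `encode_prompt` means unsupported or misspelled keyword arguments now raise a `TypeError` instead of being silently swallowed. A toy illustration of that Python behavior, using stand-in functions rather than the pipeline's actual code:

def encode_lenient(prompt, clean_caption=False, **kwargs):
    # Unknown keywords disappear into **kwargs; a typo such as `clean_cation=True` goes unnoticed.
    return prompt


def encode_strict(prompt, clean_caption=False):
    return prompt


encode_lenient("a scenic mountain video", clean_cation=True)  # runs silently despite the typo

try:
    encode_strict("a scenic mountain video", clean_cation=True)
except TypeError as err:
    print(err)  # encode_strict() got an unexpected keyword argument 'clean_cation'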

New file: tests for AllegroTransformer3DModel

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import AllegroTransformer3DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class AllegroTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AllegroTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 8
+        height = 8
+        width = 8
+        embedding_dim = 16
+        sequence_length = 16
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim // 2)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 8, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (4, 8, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
+            "num_attention_heads": 2,
+            "attention_head_dim": 8,
+            "in_channels": 4,
+            "out_channels": 4,
+            "num_layers": 1,
+            "cross_attention_dim": 16,
+            "sample_width": 8,
+            "sample_height": 8,
+            "sample_frames": 8,
+            "caption_channels": 8,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
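
For reference, a minimal sketch of what `ModelTesterMixin` effectively exercises with these helpers: build the model from `init_dict` and run a forward pass on the dummy inputs. The config values, tensor shapes, and call pattern are copied from the test above; running on CPU and reading the result via `.sample` are assumptions made here, not something this diff shows.

import torch

from diffusers import AllegroTransformer3DModel

# Config copied from prepare_init_args_and_inputs_for_common above.
init_dict = {
    "num_attention_heads": 2,
    "attention_head_dim": 8,
    "in_channels": 4,
    "out_channels": 4,
    "num_layers": 1,
    "cross_attention_dim": 16,
    "sample_width": 8,
    "sample_height": 8,
    "sample_frames": 8,
    "caption_channels": 8,
}
model = AllegroTransformer3DModel(**init_dict).eval()

# Shapes copied from dummy_input above: (batch, channels, frames, height, width), etc.
inputs = {
    "hidden_states": torch.randn(2, 4, 8, 8, 8),
    "encoder_hidden_states": torch.randn(2, 16, 8),
    "timestep": torch.randint(0, 1000, (2,)),
}

with torch.no_grad():
    output = model(**inputs)

# The mixin compares the result against `output_shape` of (4, 8, 8, 8) per sample;
# accessing `.sample` on the returned output object is an assumption.
print(output.sample.shape)  # expected: torch.Size([2, 4, 8, 8, 8])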

tests/pipelines/allegro/__init__.py

Whitespace-only changes.

tests/pipelines/allegro/test_allegro.py

Whitespace-only changes.
