@@ -76,6 +76,7 @@ def prepare_init_args_and_inputs_for_common(self):
7676 "sample_height" : 8 ,
7777 "sample_frames" : 8 ,
7878 "patch_size" : 2 ,
79+ "patch_size_t" : None ,
7980 "temporal_compression_ratio" : 4 ,
8081 "max_text_seq_length" : 8 ,
8182 }
@@ -85,3 +86,63 @@ def prepare_init_args_and_inputs_for_common(self):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogVideoXTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class CogVideoX1_5TransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = CogVideoXTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 1
+        height = 8
+        width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (1, 4, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (1, 4, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
+            "num_attention_heads": 2,
+            "attention_head_dim": 8,
+            "in_channels": 4,
+            "out_channels": 4,
+            "time_embed_dim": 2,
+            "text_embed_dim": 8,
+            "num_layers": 1,
+            "sample_width": 8,
+            "sample_height": 8,
+            "sample_frames": 8,
+            "patch_size": 2,
+            "patch_size_t": 2,
+            "temporal_compression_ratio": 4,
+            "max_text_seq_length": 8,
+            "use_rotary_positional_embeddings": True,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"CogVideoXTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
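
Reviewer note: the new CogVideoX1_5TransformerTests class exercises the CogVideoX 1.5 configuration. The two differences from the 1.0 test above are patch_size_t=2, which (as I understand the 1.5 changes) patchifies the latents along the temporal axis in addition to the spatial patch_size=2, and use_rotary_positional_embeddings=True. The first hunk adds an explicit patch_size_t=None to the 1.0 test so that config keeps the original, purely spatial patching now that the argument exists. The inner dimension here is num_attention_heads * attention_head_dim = 2 * 8 = 16, which satisfies the divisible-by-16 constraint called out in the init_dict comment.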
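
For reference, a minimal standalone sketch of the forward pass these tests exercise, mirroring the shapes from dummy_input and the init_dict above; it assumes the public diffusers export CogVideoXTransformer3DModel and its usual forward signature (hidden_states, encoder_hidden_states, timestep):

    import torch
    from diffusers import CogVideoXTransformer3DModel

    # Same init_dict as CogVideoX1_5TransformerTests above.
    model = CogVideoXTransformer3DModel(
        num_attention_heads=2,  # inner dim = 2 * 8 = 16, divisible by 16
        attention_head_dim=8,
        in_channels=4,
        out_channels=4,
        time_embed_dim=2,
        text_embed_dim=8,
        num_layers=1,
        sample_width=8,
        sample_height=8,
        sample_frames=8,
        patch_size=2,
        patch_size_t=2,  # temporal patching, the CogVideoX 1.5 addition
        temporal_compression_ratio=4,
        max_text_seq_length=8,
        use_rotary_positional_embeddings=True,
    )

    # Shapes mirror dummy_input: (batch, frames, channels, height, width).
    hidden_states = torch.randn(2, 1, 4, 8, 8)
    encoder_hidden_states = torch.randn(2, 8, 8)  # (batch, seq_len, text_embed_dim)
    timestep = torch.randint(0, 1000, (2,))

    out = model(
        hidden_states=hidden_states,
        encoder_hidden_states=encoder_hidden_states,
        timestep=timestep,
    ).sample
    print(out.shape)

This is the same call the ModelTesterMixin common tests make with the dicts returned by prepare_init_args_and_inputs_for_common.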