
Commit 7debcec

revert changes to tests
1 parent cbc086f commit 7debcec


4 files changed: +13 -143 lines changed
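For context, the reverted tests exercised pyramid attention broadcast (PAB) roughly as in the sketch below. It is a minimal reconstruction from the removed lines, not a finished API: the pyramid_attention_broadcast_utils module and its two helpers exist only on this PR branch (not in released diffusers), and the checkpoint id and prompt are illustrative assumptions; the tests themselves used tiny dummy components on CPU.

import torch

from diffusers import CogVideoXPipeline
# PR-branch module; treat this import path as an assumption.
from diffusers.pipelines.pyramid_attention_broadcast_utils import (
    PyramidAttentionBroadcastConfig,
    apply_pyramid_attention_broadcast,
)

# Illustrative checkpoint; the reverted tests built small random components instead.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")

# Reuse (broadcast) spatial attention outputs every 2nd transformer block,
# but only for timesteps inside the (100, 800) range.
config = PyramidAttentionBroadcastConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(100, 800),
)
apply_pyramid_attention_broadcast(pipe, config)

video = pipe("a panda playing guitar", num_inference_steps=50).frames[0]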

tests/pipelines/cogvideo/test_cogvideox.py

Lines changed: 2 additions & 35 deletions
@@ -21,10 +21,6 @@
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
-)
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     numpy_cosine_similarity_distance,
@@ -63,7 +59,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     )
     test_xformers_attention = False

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = CogVideoXTransformer3DModel(
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
@@ -75,7 +71,7 @@ def get_dummy_components(self, num_layers: int = 1):
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
-            num_layers=num_layers,
+            num_layers=1,
             sample_width=2,  # latent width: 2 -> final width: 16
             sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
@@ -323,35 +319,6 @@ def test_fused_qkv_projections(self):
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."

-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."
-

 @slow
 @require_torch_gpu

tests/pipelines/cogvideo/test_cogvideox_image2video.py

Lines changed: 2 additions & 35 deletions
@@ -22,10 +22,6 @@
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
-)
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
@@ -65,7 +61,7 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
     )
     test_xformers_attention = False

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = CogVideoXTransformer3DModel(
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
@@ -80,7 +76,7 @@ def get_dummy_components(self, num_layers: int = 1):
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
-            num_layers=num_layers,
+            num_layers=1,
             sample_width=2,  # latent width: 2 -> final width: 16
             sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
@@ -346,35 +342,6 @@ def test_fused_qkv_projections(self):
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."

-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."
-

 @slow
 @require_torch_gpu

tests/pipelines/cogvideo/test_cogvideox_video2video.py

Lines changed: 2 additions & 35 deletions
@@ -21,10 +21,6 @@
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
-)
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -57,7 +53,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
     )
     test_xformers_attention = False

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = CogVideoXTransformer3DModel(
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
@@ -69,7 +65,7 @@ def get_dummy_components(self, num_layers: int = 1):
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
-            num_layers=num_layers,
+            num_layers=1,
             sample_width=2,  # latent width: 2 -> final width: 16
             sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
@@ -327,32 +323,3 @@ def test_fused_qkv_projections(self):
         assert np.allclose(
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."
-
-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."

tests/pipelines/latte/test_latte.py

Lines changed: 7 additions & 38 deletions
@@ -22,10 +22,11 @@
 import torch
 from transformers import AutoTokenizer, T5EncoderModel

-from diffusers import AutoencoderKL, DDIMScheduler, LattePipeline, LatteTransformer3DModel
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LattePipeline,
+    LatteTransformer3DModel,
 )
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
@@ -52,11 +53,11 @@ class LattePipelineFastTests(PipelineTesterMixin, unittest.TestCase):

     required_optional_params = PipelineTesterMixin.required_optional_params

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = LatteTransformer3DModel(
             sample_size=8,
-            num_layers=num_layers,
+            num_layers=1,
             patch_size=2,
             attention_head_dim=8,
             num_attention_heads=3,
@@ -263,38 +264,6 @@ def test_save_load_optional_components(self):
     def test_xformers_attention_forwardGenerator_pass(self):
         super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)

-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2,
-            temporal_attention_block_skip_range=3,
-            spatial_attention_timestep_skip_range=(100, 800),
-            temporal_attention_timestep_skip_range=(100, 800),
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."
-

 @slow
 @require_torch_gpu

0 commit comments
