Commit 9d452dc: update tests

Parent: 0ea904e

File tree

3 files changed: 21 additions, 69 deletions

tests/pipelines/cogvideo/test_cogvideox.py

Lines changed: 7 additions & 23 deletions
@@ -21,7 +21,10 @@
 from transformers import AutoTokenizer, T5EncoderModel
 
 from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import PyramidAttentionBroadcastAttentionProcessorWrapper
+from diffusers.pipelines.pyramid_attention_broadcast_utils import (
+    PyramidAttentionBroadcastConfig,
+    apply_pyramid_attention_broadcast,
+)
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     numpy_cosine_similarity_distance,
@@ -333,40 +336,21 @@ def test_pyramid_attention_broadcast(self):
         frames = pipe(**inputs).frames  # [B, F, C, H, W]
         original_image_slice = frames[0, -2:, -1, -3:, -3:]
 
-        pipe.enable_pyramid_attention_broadcast(spatial_attn_skip_range=2, spatial_attn_timestep_range=(100, 800))
-        assert pipe.pyramid_attention_broadcast_enabled
-
-        num_pab_processors = sum(
-            [
-                isinstance(processor, PyramidAttentionBroadcastAttentionProcessorWrapper)
-                for processor in pipe.transformer.attn_processors.values()
-            ]
+        config = PyramidAttentionBroadcastConfig(
+            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
         )
-        assert num_pab_processors == num_layers
+        apply_pyramid_attention_broadcast(pipe, config)
 
         inputs = self.get_dummy_inputs(device)
         inputs["num_inference_steps"] = 4
         frames = pipe(**inputs).frames
         image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
 
-        pipe.disable_pyramid_attention_broadcast()
-        assert not pipe.pyramid_attention_broadcast_enabled
-
-        inputs = self.get_dummy_inputs(device)
-        frames = pipe(**inputs).frames
-        image_slice_pab_disabled = frames[0, -2:, -1, -3:, -3:]
-
         # We need to use higher tolerance because we are using a random model. With a converged/trained
         # model, the tolerance can be lower.
         assert np.allclose(
             original_image_slice, image_slice_pab_enabled, atol=0.2
         ), "PAB outputs should not differ much in specified timestep range."
-        assert np.allclose(
-            image_slice_pab_enabled, image_slice_pab_disabled, atol=0.2
-        ), "Outputs, with PAB enabled, shouldn't differ much when PAB is disabled in specified timestep range."
-        assert np.allclose(
-            original_image_slice, image_slice_pab_disabled, atol=0.2
-        ), "Original outputs should match when PAB is disabled."
 
 
 @slow
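
For reference, the config-plus-helper API these tests migrate to can be used roughly as follows. This is a minimal sketch, not part of the commit: the checkpoint name, dtype, device, and prompt are illustrative assumptions, and only PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast, and the two spatial-attention parameters are taken from the diff above.

import torch

from diffusers import CogVideoXPipeline
from diffusers.pipelines.pyramid_attention_broadcast_utils import (
    PyramidAttentionBroadcastConfig,
    apply_pyramid_attention_broadcast,
)

# Illustrative checkpoint; the tests above use a small random model instead.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
pipe.to("cuda")

# Recompute spatial attention only every 2nd invocation and broadcast the
# cached output otherwise, restricted to denoising timesteps in (100, 800).
config = PyramidAttentionBroadcastConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(100, 800),
)

# Replaces the old pipe.enable_pyramid_attention_broadcast(...) method:
# the helper takes the pipeline plus a config object.
apply_pyramid_attention_broadcast(pipe, config)

frames = pipe("a panda playing a guitar", num_inference_steps=50).frames

The same two-step pattern (build a config, then apply it to the pipeline) is exercised identically in all three test files in this commit.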

tests/pipelines/cogvideo/test_cogvideox_image2video.py

Lines changed: 7 additions & 23 deletions
@@ -22,7 +22,10 @@
 from transformers import AutoTokenizer, T5EncoderModel
 
 from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import PyramidAttentionBroadcastAttentionProcessorWrapper
+from diffusers.pipelines.pyramid_attention_broadcast_utils import (
+    PyramidAttentionBroadcastConfig,
+    apply_pyramid_attention_broadcast,
+)
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
@@ -356,40 +359,21 @@ def test_pyramid_attention_broadcast(self):
         frames = pipe(**inputs).frames  # [B, F, C, H, W]
         original_image_slice = frames[0, -2:, -1, -3:, -3:]
 
-        pipe.enable_pyramid_attention_broadcast(spatial_attn_skip_range=2, spatial_attn_timestep_range=(100, 800))
-        assert pipe.pyramid_attention_broadcast_enabled
-
-        num_pab_processors = sum(
-            [
-                isinstance(processor, PyramidAttentionBroadcastAttentionProcessorWrapper)
-                for processor in pipe.transformer.attn_processors.values()
-            ]
+        config = PyramidAttentionBroadcastConfig(
+            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
         )
-        assert num_pab_processors == num_layers
+        apply_pyramid_attention_broadcast(pipe, config)
 
         inputs = self.get_dummy_inputs(device)
         inputs["num_inference_steps"] = 4
         frames = pipe(**inputs).frames
         image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
 
-        pipe.disable_pyramid_attention_broadcast()
-        assert not pipe.pyramid_attention_broadcast_enabled
-
-        inputs = self.get_dummy_inputs(device)
-        frames = pipe(**inputs).frames
-        image_slice_pab_disabled = frames[0, -2:, -1, -3:, -3:]
-
         # We need to use higher tolerance because we are using a random model. With a converged/trained
         # model, the tolerance can be lower.
         assert np.allclose(
             original_image_slice, image_slice_pab_enabled, atol=0.2
         ), "PAB outputs should not differ much in specified timestep range."
-        assert np.allclose(
-            image_slice_pab_enabled, image_slice_pab_disabled, atol=0.2
-        ), "Outputs, with PAB enabled, shouldn't differ much when PAB is disabled in specified timestep range."
-        assert np.allclose(
-            original_image_slice, image_slice_pab_disabled, atol=0.2
-        ), "Original outputs should match when PAB is disabled."
 
 
 @slow

tests/pipelines/cogvideo/test_cogvideox_video2video.py

Lines changed: 7 additions & 23 deletions
@@ -21,7 +21,10 @@
 from transformers import AutoTokenizer, T5EncoderModel
 
 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import PyramidAttentionBroadcastAttentionProcessorWrapper
+from diffusers.pipelines.pyramid_attention_broadcast_utils import (
+    PyramidAttentionBroadcastConfig,
+    apply_pyramid_attention_broadcast,
+)
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -338,37 +341,18 @@ def test_pyramid_attention_broadcast(self):
         frames = pipe(**inputs).frames  # [B, F, C, H, W]
         original_image_slice = frames[0, -2:, -1, -3:, -3:]
 
-        pipe.enable_pyramid_attention_broadcast(spatial_attn_skip_range=2, spatial_attn_timestep_range=(100, 800))
-        assert pipe.pyramid_attention_broadcast_enabled
-
-        num_pab_processors = sum(
-            [
-                isinstance(processor, PyramidAttentionBroadcastAttentionProcessorWrapper)
-                for processor in pipe.transformer.attn_processors.values()
-            ]
+        config = PyramidAttentionBroadcastConfig(
+            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
         )
-        assert num_pab_processors == num_layers
+        apply_pyramid_attention_broadcast(pipe, config)
 
         inputs = self.get_dummy_inputs(device)
         inputs["num_inference_steps"] = 4
         frames = pipe(**inputs).frames
         image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
 
-        pipe.disable_pyramid_attention_broadcast()
-        assert not pipe.pyramid_attention_broadcast_enabled
-
-        inputs = self.get_dummy_inputs(device)
-        frames = pipe(**inputs).frames
-        image_slice_pab_disabled = frames[0, -2:, -1, -3:, -3:]
-
         # We need to use higher tolerance because we are using a random model. With a converged/trained
         # model, the tolerance can be lower.
         assert np.allclose(
             original_image_slice, image_slice_pab_enabled, atol=0.2
         ), "PAB outputs should not differ much in specified timestep range."
-        assert np.allclose(
-            image_slice_pab_enabled, image_slice_pab_disabled, atol=0.2
-        ), "Outputs, with PAB enabled, shouldn't differ much when PAB is disabled in specified timestep range."
-        assert np.allclose(
-            original_image_slice, image_slice_pab_disabled, atol=0.2
-        ), "Original outputs should match when PAB is disabled."
