     DDIMScheduler,
     DiffusionPipeline,
     KolorsPipeline,
+    PyramidAttentionBroadcastConfig,
     StableDiffusionPipeline,
     StableDiffusionXLPipeline,
     UNet2DConditionModel,
+    apply_pyramid_attention_broadcast,
 )
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import FluxIPAdapterMixin, IPAdapterMixin
...
 from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
 from diffusers.models.unets.unet_motion_model import UNetMotionModel
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.pipelines.pyramid_attention_broadcast_utils import PyramidAttentionBroadcastHook
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_xformers_available
@@ -2271,6 +2274,109 @@ def _test_save_load_optional_components(self, expected_max_difference=1e-4):
         self.assertLess(max_diff, expected_max_difference)
 
 
+class PyramidAttentionBroadcastTesterMixin:
+    pab_config = PyramidAttentionBroadcastConfig(
+        spatial_attention_block_skip_range=2,
+        spatial_attention_timestep_skip_range=(100, 800),
+        spatial_attention_block_identifiers=["transformer_blocks"],
+    )
+
+    def test_pyramid_attention_broadcast_layers(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        num_layers = 2
+        components = self.get_dummy_components(num_layers=num_layers)
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        apply_pyramid_attention_broadcast(pipe, self.pab_config)
+
+        expected_hooks = 0
+        if self.pab_config.spatial_attention_block_skip_range is not None:
+            expected_hooks += num_layers
+        if self.pab_config.temporal_attention_block_skip_range is not None:
+            expected_hooks += num_layers
+        if self.pab_config.cross_attention_block_skip_range is not None:
+            expected_hooks += num_layers
+
+        denoiser = pipe.transformer if hasattr(pipe, "transformer") else pipe.unet
+        count = 0
+        for module in denoiser.modules():
+            if hasattr(module, "_diffusers_hook"):
+                count += 1
+                self.assertTrue(
+                    isinstance(module._diffusers_hook, PyramidAttentionBroadcastHook),
+                    "Hook should be of type PyramidAttentionBroadcastHook.",
+                )
+                self.assertTrue(
+                    hasattr(module, "_pyramid_attention_broadcast_state"),
+                    "PAB state should be initialized when enabled.",
+                )
+                self.assertTrue(
+                    module._pyramid_attention_broadcast_state.cache is None, "Cache should be None at initialization."
+                )
+        self.assertEqual(count, expected_hooks, "Number of hooks should match the expected number.")
+
+        # Perform a dummy inference run to check that the hook state is updated at each step
+        def pab_state_check_callback(pipe, i, t, kwargs):
+            for module in denoiser.modules():
+                if hasattr(module, "_diffusers_hook"):
+                    self.assertTrue(
+                        module._pyramid_attention_broadcast_state.cache is not None,
+                        "Cache should have updated during inference.",
+                    )
+                    self.assertTrue(
+                        module._pyramid_attention_broadcast_state.iteration == i + 1,
+                        "Hook iteration state should have updated during inference.",
+                    )
+            return {}
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 2
+        inputs["callback_on_step_end"] = pab_state_check_callback
+        pipe(**inputs)[0]
+
+        # After inference, reset_stateful_hooks is called within the pipeline, which should have reset the states
+        for module in denoiser.modules():
+            if hasattr(module, "_diffusers_hook"):
+                self.assertTrue(
+                    module._pyramid_attention_broadcast_state.cache is None,
+                    "Cache should be reset to None after inference.",
+                )
+                self.assertTrue(
+                    module._pyramid_attention_broadcast_state.iteration == 0,
+                    "Iteration should be reset to 0 after inference.",
+                )
+
+    def test_pyramid_attention_broadcast_inference(self, expected_atol: float = 0.2):
+        # We need to use a higher tolerance because we are using a random model. With a converged/trained
+        # model, the tolerance can be lower.
+
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        num_layers = 2
+        components = self.get_dummy_components(num_layers=num_layers)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 4
+        output = pipe(**inputs)[0]
+        original_image_slice = output.flatten()
+        original_image_slice = np.concatenate((original_image_slice[:8], original_image_slice[-8:]))
+
+        apply_pyramid_attention_broadcast(pipe, self.pab_config)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["num_inference_steps"] = 4
+        output = pipe(**inputs)[0]
+        image_slice_pab_enabled = output.flatten()
+        image_slice_pab_enabled = np.concatenate((image_slice_pab_enabled[:8], image_slice_pab_enabled[-8:]))
+
+        assert np.allclose(
+            original_image_slice, image_slice_pab_enabled, atol=expected_atol
+        ), "PAB outputs should not differ much in the specified timestep range."
+
+
 # Some models (e.g. unCLIP) are extremely likely to significantly deviate depending on which hardware is used.
 # This helper function is used to check that the image doesn't deviate on average more than 10 pixels from a
 # reference image.
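
For context on how the new mixin is meant to be consumed: a pipeline-specific fast-test class in this file would inherit it alongside the existing PipelineTesterMixin, and its get_dummy_components must accept the num_layers argument that the PAB tests pass. Below is a minimal sketch; the pipeline and class names are illustrative assumptions, not part of this diff.

# Hypothetical usage sketch (names below are illustrative, not from this PR)
import unittest

class MyTransformerPipelineFastTests(PyramidAttentionBroadcastTesterMixin, PipelineTesterMixin, unittest.TestCase):
    pipeline_class = MyTransformerPipeline  # assumed pipeline under test

    def get_dummy_components(self, num_layers: int = 2):
        # Return small, randomly initialized components; num_layers controls how many
        # attention blocks the denoiser has, which the PAB tests use to compute the
        # expected number of hooks.
        ...

    def get_dummy_inputs(self, device, seed=0):
        # Return a minimal dict of prompt, generator, and output settings for a fast forward pass.
        ...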