huggingface
diff --git a/src/diffusers/models/hooks.py b/src/diffusers/models/hooks.py
Lines changed: 45 additions & 18 deletions
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
Lines changed: 10 additions & 0 deletions
diff --git a/src/diffusers/pipelines/allegro/pipeline_allegro.py b/src/diffusers/pipelines/allegro/pipeline_allegro.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py
Lines changed: 4 additions & 1 deletion
diff --git a/src/diffusers/pipelines/latte/pipeline_latte.py b/src/diffusers/pipelines/latte/pipeline_latte.py
Lines changed: 1 addition & 2 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import functools
-from typing import Any, Callable, Dict, Tuple, Union
+from typing import Any, Callable, Dict, Tuple
 
 import torch
 
@@ -117,45 +117,72 @@ def reset_state(self, module):
 class PyramidAttentionBroadcastHook(ModelHook):
     def __init__(
         self,
-        skip_range: int,
-        timestep_range: Tuple[int, int],
-        timestep_callback: Callable[[], Union[torch.LongTensor, int]],
+        skip_callback: Callable[[torch.nn.Module], bool],  # predicate called with the hooked module; truthy => reuse the cached output
+        # skip_range: int,
+        # timestep_range: Tuple[int, int],
+        # timestep_callback: Callable[[], Union[torch.LongTensor, int]],
     ) -> None:
         super().__init__()
 
-        self.skip_range = skip_range
-        self.timestep_range = timestep_range
-        self.timestep_callback = timestep_callback
+        # self.skip_range = skip_range
+        # self.timestep_range = timestep_range
+        # self.timestep_callback = timestep_callback
+        self.skip_callback = skip_callback  # consulted in new_forward, and only once a cached output exists
 
-        self.attention_cache = None
+        self.cache = None  # last output recorded by post_forward; None until one has been stored
         self._iteration = 0
 
     def new_forward(self, module: torch.nn.Module, *args, **kwargs) -> Any:
         args, kwargs = module._diffusers_hook.pre_forward(module, *args, **kwargs)
 
-        current_timestep = self.timestep_callback()
-        is_within_timestep_range = self.timestep_range[0] < current_timestep < self.timestep_range[1]
-        should_compute_attention = self._iteration % self.skip_range == 0
+        # current_timestep = self.timestep_callback()
+        # is_within_timestep_range = self.timestep_range[0] < current_timestep < self.timestep_range[1]
+        # should_compute_attention = self._iteration % self.skip_range == 0
 
-        if not is_within_timestep_range or should_compute_attention:
-            output = module._old_forward(*args, **kwargs)
-        else:
-            output = self.attention_cache
+        # if not is_within_timestep_range or should_compute_attention:
+        #     output = module._old_forward(*args, **kwargs)
+        # else:
+        #     output = self.attention_cache
 
-        self._iteration = self._iteration + 1
+        if self.cache is not None and self.skip_callback(module):  # cache is tested first, so the callback is never called while the cache is empty
+            output = self.cache  # reuse the stored output instead of running the wrapped forward
+        else:
+            output = module._old_forward(*args, **kwargs)  # presumably the module's original forward saved when the hook was attached - TODO confirm
 
         return module._diffusers_hook.post_forward(module, output)
 
     def post_forward(self, module: torch.nn.Module, output: Any) -> Any:
-        self.attention_cache = output
+        self.cache = output  # remember the latest output so a later new_forward call can reuse it
         return output
 
     def reset_state(self, module: torch.nn.Module) -> torch.nn.Module:
-        self.attention_cache = None
+        self.cache = None  # drop the stored output; NOTE(review): `_iteration` is still initialised and reset, but nothing visible in this class increments it any more
         self._iteration = 0
         return module
 
 
+class LayerSkipHook(ModelHook):  # hook that bypasses the wrapped forward whenever skip_callback(module) is truthy
+    def __init__(self, skip_: Callable[[torch.nn.Module], bool]) -> None:  # skip_: predicate called with the hooked module
+        super().__init__()
+
+        self.skip_callback = skip_  # evaluated on every new_forward call
+
+    def new_forward(self, module: torch.nn.Module, *args, **kwargs) -> Any:
+        args, kwargs = module._diffusers_hook.pre_forward(module, *args, **kwargs)
+
+        if self.skip_callback(module):
+            # We want to skip this layer, which means handing this layer's input on unchanged
+            # as the input of the next layer. But at this point, we don't have information about
+            # the arguments required by the next layer. Even if we did, order matters unless we
+            # always pass kwargs, which is usually not the case with hidden_states,
+            # encoder_hidden_states, temb, etc. TODO(aryan): implement correctly later
+            output = None  # NOTE(review): placeholder - a skipped layer currently yields None, not a pass-through of its inputs
+        else:
+            output = module._old_forward(*args, **kwargs)  # not skipped: run the wrapped forward as usual
+
+        return module._diffusers_hook.post_forward(module, output)
+
+
 def add_hook_to_module(module: torch.nn.Module, hook: ModelHook, append: bool = False):
     r"""
     Adds a hook to a given module. This will rewrite the `forward` method of the module to include the hook, to remove
 
@@ -58,6 +58,11 @@
         "StableDiffusionMixin",
         "ImagePipelineOutput",
     ]
+    _import_structure["pyramid_attention_broadcast_utils"] = [
+        "PyramidAttentionBroadcastConfig",
+        "apply_pyramid_attention_broadcast",
+        "apply_pyramid_attention_broadcast_on_module",
+    ]
     _import_structure["deprecated"].extend(
         [
             "PNDMPipeline",
@@ -447,6 +452,11 @@
             ImagePipelineOutput,
             StableDiffusionMixin,
         )
+        from .pyramid_attention_broadcast_utils import (
+            PyramidAttentionBroadcastConfig,
+            apply_pyramid_attention_broadcast,
+            apply_pyramid_attention_broadcast_on_module,
+        )
 
     try:
         if not (is_torch_available() and is_librosa_available()):
 
@@ -38,7 +38,6 @@
 )
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
-from ..pyramid_broadcast_utils import PyramidAttentionBroadcastMixin
 from .pipeline_output import AllegroPipelineOutput
 
 
@@ -132,7 +131,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class AllegroPipeline(DiffusionPipeline, PyramidAttentionBroadcastMixin):
+class AllegroPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-video generation using Allegro.
 
 
@@ -29,7 +29,6 @@
 from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
-from ..pyramid_broadcast_utils import PyramidAttentionBroadcastMixin
 from .pipeline_output import CogVideoXPipelineOutput
 
 
@@ -138,7 +137,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin, PyramidAttentionBroadcastMixin):
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
 
 
@@ -30,7 +30,6 @@
 from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
-from ..pyramid_broadcast_utils import PyramidAttentionBroadcastMixin
 from .pipeline_output import CogVideoXPipelineOutput
 
 
@@ -145,7 +144,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class CogVideoXFunControlPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin, PyramidAttentionBroadcastMixin):
+class CogVideoXFunControlPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for controlled text-to-video generation using CogVideoX Fun.
 
 
@@ -34,7 +34,6 @@
 )
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
-from ..pyramid_broadcast_utils import PyramidAttentionBroadcastMixin
 from .pipeline_output import CogVideoXPipelineOutput
 
 
@@ -154,7 +153,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin, PyramidAttentionBroadcastMixin):
+class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using CogVideoX.
 
 
@@ -30,7 +30,6 @@
 from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
-from ..pyramid_broadcast_utils import PyramidAttentionBroadcastMixin
 from .pipeline_output import CogVideoXPipelineOutput
 
 
@@ -160,7 +159,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin, PyramidAttentionBroadcastMixin):
+class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for video-to-video generation using CogVideoX.
 
 
@@ -655,6 +655,7 @@ def __call__(
 
         self._guidance_scale = guidance_scale
         self._joint_attention_kwargs = joint_attention_kwargs
+        self._current_timestep = None
         self._interrupt = False
 
         # 2. Define call parameters
@@ -731,6 +732,7 @@ def __call__(
                 if self.interrupt:
                     continue
 
+                self._current_timestep = t
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
 
@@ -771,9 +773,10 @@ def __call__(
                 if XLA_AVAILABLE:
                     xm.mark_step()
 
+        self._current_timestep = None
+
         if output_type == "latent":
             image = latents
-
         else:
             latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
             latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
 
@@ -37,7 +37,6 @@
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ...video_processor import VideoProcessor
-from ..pyramid_broadcast_utils import PyramidAttentionBroadcastMixin
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -133,7 +132,7 @@ class LattePipelineOutput(BaseOutput):
     frames: torch.Tensor
 
 
-class LattePipeline(DiffusionPipeline, PyramidAttentionBroadcastMixin):
+class LattePipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-video generation using Latte.