@@ -106,6 +106,14 @@ def __init__(self) -> None:
     def reset(self):
         self.iteration = 0
         self.cache = None
+
+    def __repr__(self):
+        cache_repr = ""
+        if self.cache is None:
+            cache_repr = "None"
+        else:
+            cache_repr = f"Tensor(shape={self.cache.shape}, dtype={self.cache.dtype})"
+        return f"PyramidAttentionBroadcastState(iteration={self.iteration}, cache={cache_repr})"


 class PyramidAttentionBroadcastHook(ModelHook):
@@ -120,21 +128,21 @@ def __init__(self, skip_callback: Callable[[torch.nn.Module], bool]) -> None:

     def initialize_hook(self, module):
         self.state = PyramidAttentionBroadcastState()
+        return module

     def new_forward(self, module: torch.nn.Module, *args, **kwargs) -> Any:
-        args, kwargs = module._diffusers_hook.pre_forward(module, *args, **kwargs)
-
         if self.skip_callback(module):
-            output = module._pyramid_attention_broadcast_state.cache
+            output = self.state.cache
         else:
             output = module._old_forward(*args, **kwargs)

         self.state.cache = output
         self.state.iteration += 1
-        return module._diffusers_hook.post_forward(module, output)
+        return output

     def reset_state(self, module: torch.nn.Module) -> None:
-        module.state.reset()
+        self.state.reset()
+        return module


 def apply_pyramid_attention_broadcast(
@@ -168,7 +176,7 @@ def apply_pyramid_attention_broadcast(
     >>> config = PyramidAttentionBroadcastConfig(
     ...     spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
     ... )
-    >>> apply_pyramid_attention_broadcast(pipe, config)
+    >>> apply_pyramid_attention_broadcast(pipe.transformer, config)
     ```
     """
     if config.current_timestep_callback is None:
@@ -192,9 +200,9 @@ def apply_pyramid_attention_broadcast(
         if not isinstance(submodule, _ATTENTION_CLASSES):
             continue
         if isinstance(submodule, Attention):
-            _apply_pyramid_attention_broadcast_on_attention_class(name, module, config)
+            _apply_pyramid_attention_broadcast_on_attention_class(name, submodule, config)
         if isinstance(submodule, MochiAttention):
-            _apply_pyramid_attention_broadcast_on_mochi_attention_class(name, module, config)
+            _apply_pyramid_attention_broadcast_on_mochi_attention_class(name, submodule, config)


 def _apply_pyramid_attention_broadcast_on_attention_class(
@@ -241,7 +249,9 @@ def _apply_pyramid_attention_broadcast_on_attention_class(
         return False

     def skip_callback(module: torch.nn.Module) -> bool:
-        pab_state = module._pyramid_attention_broadcast_state
+        hook: PyramidAttentionBroadcastHook = module._diffusers_hook.get_hook("pyramid_attention_broadcast")
+        pab_state: PyramidAttentionBroadcastState = hook.state
+
         if pab_state.cache is None:
             return False

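
As a usage sketch (not part of this commit), the entry point from the docstring hunk above could be wired up as follows. The pipeline class, the checkpoint name, and the `lambda: pipe.current_timestep` callback are illustrative assumptions; only the config fields and the `apply_pyramid_attention_broadcast(pipe.transformer, config)` call are taken from the diff itself.

```python
import torch
from diffusers import CogVideoXPipeline  # hypothetical pipeline choice for illustration
from diffusers.hooks import PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.to("cuda")

config = PyramidAttentionBroadcastConfig(
    # Re-use cached spatial attention outputs for every other block,
    # but only while the denoising timestep falls inside (100, 800).
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(100, 800),
    # Assumed wiring: apply_pyramid_attention_broadcast checks
    # config.current_timestep_callback (see the hunk above), so pass a
    # callable that reports the pipeline's current denoising timestep.
    current_timestep_callback=lambda: pipe.current_timestep,
)

# Per the docstring fix above, the hook is applied to the denoiser module
# (pipe.transformer), not to the pipeline object itself.
apply_pyramid_attention_broadcast(pipe.transformer, config)
```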