update

a-r-r-o-w · a-r-r-o-w · commit 58933b55e282 · 2025-02-19T22:41:44.000+01:00
diff --git a/src/diffusers/hooks/_common.py b/src/diffusers/hooks/_common.py
@@ -3,6 +3,14 @@
 
 _ATTENTION_CLASSES = (Attention, MochiAttention)
 
-_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks")
-_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
-_CROSS_ATTENTION_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks")
+_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks", "layers")
+_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
+_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "layers")
+
+_ALL_TRANSFORMER_BLOCK_IDENTIFIERS = tuple(
+    {
+        *_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+        *_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+        *_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
+    }
+)
diff --git a/src/diffusers/hooks/enhance_a_video.py b/src/diffusers/hooks/enhance_a_video.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable
+from typing import Callable, Dict, Union
 
 import torch
 import torch.overrides
@@ -30,7 +31,7 @@
 _ENHANCE_A_VIDEO = "enhance_a_video"
 
 
-class _AttentionType(Enum):
+class _AttentionType(int, Enum):
     SELF = 0
     JOINT___LATENTS_FIRST = 1
     JOINT___LATENTS_LAST = 2
@@ -40,9 +41,30 @@ class _AttentionType(Enum):
 class EnhanceAVideoConfig:
     r"""
     Configuration for [Enhance A Video](https://huggingface.co/papers/2502.07508).
+
+    Args:
+        weight (`float` or `Dict[str, float]`, defaults to `1.0`):
+            The weighting factor for the Enhance A Video score. If a `float`, the same weight is applied to all layers.
+            If a `dict`, the keys are regex patterns that match non-overlapping layer names, and the values are the
+            corresponding weights.
+        num_frames_callback (`Callable[[], int]`, `optional`):
+            A callback function that returns the number of latent frames in the latent video stream. Since there is no
+            easy way to deduce this within the attention layers, the user must provide this information.
+        _attention_type (`_AttentionType`, defaults to `_AttentionType.SELF`):
+            The type of attention mechanism that the underlying model uses. The following options are available:
+                - `_AttentionType.SELF`:
+                    The model uses self-attention layers with only video tokens.
+                - `_AttentionType.JOINT___LATENTS_FIRST`:
+                    The model uses joint attention layers (concatenated video and text stream data) with video tokens
+                    first.
+                - `_AttentionType.JOINT___LATENTS_LAST`:
+                    The model uses joint attention layers (concatenated video and text stream data) with video tokens
+                    last.
+
+            This parameter is not backwards-compatible and may be subject to change in future versions.
     """
 
-    weight: float = 1.0
+    weight: Union[float, Dict[str, float]] = 1.0
     num_frames_callback: Callable[[], int] = None
     _attention_type: _AttentionType = _AttentionType.SELF
 
@@ -59,7 +81,7 @@ def reset(self) -> None:
         self.latents_sequence_length = None
 
     def __repr__(self):
-        return f"EnhanceAVideoAttentionState(scores={self.scores}, latents_sequence_length={self.latents_sequence_length})"
+        return f"EnhanceAVideoAttentionState(latents_sequence_length={self.latents_sequence_length})"
 
 
 class EnhanceAVideoCaptureSDPAInputsFunctionMode(torch.overrides.TorchFunctionMode):
@@ -71,6 +93,7 @@ def __init__(self, query_key_save_callback: Callable[[torch.Tensor, torch.Tensor
     def __torch_function__(self, func, types, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
+        # TODO(aryan): revisit for torch.compile -- can trace into ATen but not triton, so this branch is never hit
         if func is torch.nn.functional.scaled_dot_product_attention:
             query = kwargs.get("query", None) or args[0]
             key = kwargs.get("key", None) or args[1]
@@ -97,7 +120,7 @@ def new_forward(self, module, *args, **kwargs):
         #   3. [batch_size, attention_heads, encoder_sequence_length + latents_sequence_length, head_dim]
         kwargs_hidden_states = kwargs.get("hidden_states", None)
         hidden_states = kwargs_hidden_states if kwargs_hidden_states is not None else args[0]
-        self.state.latents_sequence_length = hidden_states.size(2)
+        self.state.latents_sequence_length = hidden_states.size(1)
 
         # Capture query and key tensors to compute EnhanceAVideo scores
         with EnhanceAVideoCaptureSDPAInputsFunctionMode(self._query_key_capture_callback):
@@ -130,7 +153,6 @@ def reshape_for_framewise_attention(tensor: torch.Tensor) -> torch.Tensor:
         query = reshape_for_framewise_attention(query)
         key = reshape_for_framewise_attention(key)
         scores = enhance_a_video_score(query, key, num_frames, self.weight)
-        print("Applying scores:", scores)
         hidden_states = hidden_states * scores
 
         return (hidden_states, *output[1:]) if isinstance(output, tuple) else hidden_states
@@ -176,24 +198,105 @@ def enhance_a_video_score(
 
 
 def apply_enhance_a_video(module: torch.nn.Module, config: EnhanceAVideoConfig) -> None:
+    r"""
+    Applies [Enhance A Video](https://huggingface.co/papers/2502.07508) on a model.
+
+    This function applies a Diffusers Hook to all/user-configured self-attention lyaers of the model. The hook captures
+    the inputs entering `torch.nn.functional.scaled_dot_product_attention` and operates on them.
+
+    Args:
+        module (`torch.nn.Module`):
+            The model to apply Enhance A Video to. It must be a video generation model.
+        config (`EnhanceAVideoConfig`):
+            The configuration for Enhance A Video.
+
+    Example:
+
+    ```python
+    >>> import torch
+    >>> from diffusers import LTXPipeline
+    >>> from diffusers.hooks import apply_enhance_a_video, remove_enhance_a_video, EnhanceAVideoConfig
+    >>> from diffusers.utils import export_to_video
+
+    >>> pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
+    >>> pipe.to("cuda")
+
+    >>> weight = 4
+    >>> num_frames = 161
+    >>> latent_num_frames = (num_frames - 1) // pipe.vae_temporal_compression_ratio + 1
+
+    >>> # Apply Enhance-A-Video to all layers with a weight of 4
+    >>> config = EnhanceAVideoConfig(weight=weight, num_frames_callback=lambda: latent_num_frames, _attention_type=1)
+    >>> apply_enhance_a_video(pipe.transformer, config)
+
+    >>> prompt = "A man standing in a sunlit garden, surrounded by lush greenery and colorful flowers. The man has a knife in his hand and is cutting a ripe, juicy watermelon. The watermelon is bright red and contrasts beautifully with the green foliage in the background."
+    >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+
+    >>> video = pipe(prompt=prompt, negative_prompt=negative_prompt, num_frames=161).frames[0]
+    >>> export_to_video(video, "output.mp4", fps=24)
+
+    >>> # Remove Enhance-A-Video
+    >>> remove_enhance_a_video(pipe.transformer)
+
+    >>> # Apply Enhance-A-Video to specific layers with different weights
+    >>> config = EnhanceAVideoConfig(
+    ...     weight={
+    ...         "blocks\.(0|1|2|3|4|5|6|7)\.": 5.0,
+    ...         "blocks\.(10|11|12|13|14|15)\.": 8.0,
+    ...         "blocks\.(21|22|23|24|25|26)\.": 3.0,
+    ...     },
+    ...     num_frames_callback=lambda: latent_num_frames,
+    ...     _attention_type=1,
+    ... )
+    ```
+    """
+    weight = config.weight
+    if not isinstance(weight, dict):
+        weight = {".*": config.weight}
+    _validate_weight(module, weight)
+
+    weight_keys = set(weight.keys())
     for name, submodule in module.named_modules():
+        # We cannot apply Enhance-A-Video to cross-attention layers
         is_cross_attention = getattr(submodule, "is_cross_attention", False)
         if not isinstance(submodule, _ATTENTION_CLASSES) or is_cross_attention:
             continue
+        current_weight = next(
+            (weight[identifier] for identifier in weight_keys if re.search(identifier, name) is not None), None
+        )
+        if current_weight is None:
+            continue
         logger.debug(f"Applying Enhance-A-Video to layer '{name}'")
         hook_registry = HookRegistry.check_if_exists_or_initialize(submodule)
         hook = EnhanceAVideoSDPAHook(
-            weight=config.weight,
+            weight=current_weight,
             num_frames_callback=config.num_frames_callback,
             _attention_type=config._attention_type,
         )
         hook_registry.register_hook(hook, _ENHANCE_A_VIDEO)
 
 
 def remove_enhance_a_video(module: torch.nn.Module) -> None:
+    r"""
+    Removes the Enhance A Video hook from the model.
+
+    See [`~hooks.enhance_a_video.apply_enhance_a_video`] for an example.
+    """
     for name, submodule in module.named_modules():
         if not hasattr(submodule, "_diffusers_hook"):
             continue
         hook_registry = submodule._diffusers_hook
         hook_registry.remove_hook(_ENHANCE_A_VIDEO, recurse=False)
         logger.debug(f"Removed Enhance-A-Video from layer '{name}'")
+
+
+def _validate_weight(module: torch.nn.Module, weight: Dict[str, float]) -> None:
+    if not isinstance(weight, dict):
+        raise ValueError(f"Invalid weight type: {type(weight)}")
+    weight_keys = set(weight.keys())
+    for name, _ in module.named_modules():
+        num_matches = sum(re.search(identifier, name) is not None for identifier in weight_keys)
+        if num_matches > 1:
+            raise ValueError(
+                f"The provided weight dictionary has multiple regex matches for layer '{name}'. Please provide non-overlapping regex patterns."
+            )
diff --git a/src/diffusers/hooks/pyramid_attention_broadcast.py b/src/diffusers/hooks/pyramid_attention_broadcast.py
@@ -22,9 +22,9 @@
 from ..utils import logging
 from ._common import (
     _ATTENTION_CLASSES,
-    _CROSS_ATTENTION_BLOCK_IDENTIFIERS,
-    _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS,
-    _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS,
+    _CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
+    _SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
+    _TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
 )
 from .hooks import HookRegistry, ModelHook
 
@@ -75,9 +75,9 @@ class PyramidAttentionBroadcastConfig:
     temporal_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
     cross_attention_timestep_skip_range: Tuple[int, int] = (100, 800)
 
-    spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS
-    temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS
-    cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_ATTENTION_BLOCK_IDENTIFIERS
+    spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS
+    temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS
+    cross_attention_block_identifiers: Tuple[str, ...] = _CROSS_TRANSFORMER_BLOCK_IDENTIFIERS
 
     current_timestep_callback: Callable[[], int] = None