@@ -997,6 +997,8 @@ def forward(
 class MochiAttnProcessor2_0:
     """Attention processor used in Mochi."""
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("MochiAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
@@ -1074,7 +1076,9 @@ def apply_rotary_emb(x, freqs_cos, freqs_sin):
             valid_key = torch.cat([key[idx : idx + 1], valid_encoder_key], dim=2)
             valid_value = torch.cat([value[idx : idx + 1], valid_encoder_value], dim=2)
 
-            attn_output = dispatch_attention_fn(valid_query, valid_key, valid_value, dropout_p=0.0, is_causal=False)
+            attn_output = dispatch_attention_fn(
+                valid_query, valid_key, valid_value, dropout_p=0.0, is_causal=False, backend=self._attention_backend
+            )
             valid_sequence_length = attn_output.size(2)
             attn_output = F.pad(attn_output, (0, 0, 0, total_length - valid_sequence_length))
             attn_outputs.append(attn_output)
@@ -2274,6 +2278,8 @@ def __call__(
 class FluxAttnProcessor2_0:
     """Attention processor used typically in processing the SD3-like self-attention projections."""
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -2339,7 +2345,13 @@ def __call__(
             key = apply_rotary_emb(key, image_rotary_emb)
 
         hidden_states = dispatch_attention_fn(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
         )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
@@ -2366,6 +2378,8 @@ def __call__(
 class FluxAttnProcessor2_0_NPU:
     """Attention processor used typically in processing the SD3-like self-attention projections."""
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError(
@@ -2448,7 +2462,9 @@ def __call__(
                 inner_precise=0,
             )[0]
         else:
-            hidden_states = dispatch_attention_fn(query, key, value, dropout_p=0.0, is_causal=False)
+            hidden_states = dispatch_attention_fn(
+                query, key, value, dropout_p=0.0, is_causal=False, backend=self._attention_backend
+            )
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
 
@@ -2472,6 +2488,8 @@ def __call__(
 class FusedFluxAttnProcessor2_0:
     """Attention processor used typically in processing the SD3-like self-attention projections."""
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError(
@@ -2542,7 +2560,9 @@ def __call__(
             query = apply_rotary_emb(query, image_rotary_emb)
             key = apply_rotary_emb(key, image_rotary_emb)
 
-        hidden_states = dispatch_attention_fn(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = dispatch_attention_fn(
+            query, key, value, dropout_p=0.0, is_causal=False, backend=self._attention_backend
+        )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
@@ -2567,6 +2587,8 @@ def __call__(
 class FusedFluxAttnProcessor2_0_NPU:
     """Attention processor used typically in processing the SD3-like self-attention projections."""
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError(
@@ -2653,7 +2675,9 @@ def __call__(
                 inner_precise=0,
             )[0]
         else:
-            hidden_states = dispatch_attention_fn(query, key, value, dropout_p=0.0, is_causal=False)
+            hidden_states = dispatch_attention_fn(
+                query, key, value, dropout_p=0.0, is_causal=False, backend=self._attention_backend
+            )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
@@ -2678,6 +2702,8 @@ def __call__(
 class FluxIPAdapterJointAttnProcessor2_0(torch.nn.Module):
     """Flux Attention processor for IP-Adapter."""
 
+    _attention_backend = None
+
     def __init__(
         self, hidden_size: int, cross_attention_dim: int, num_tokens=(4,), scale=1.0, device=None, dtype=None
     ):
@@ -2775,7 +2801,9 @@ def __call__(
             query = apply_rotary_emb(query, image_rotary_emb)
             key = apply_rotary_emb(key, image_rotary_emb)
 
-        hidden_states = dispatch_attention_fn(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = dispatch_attention_fn(
+            query, key, value, dropout_p=0.0, is_causal=False, backend=self._attention_backend
+        )
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
 
@@ -2806,7 +2834,13 @@ def __call__(
                 # the output of sdp = (batch, num_heads, seq_len, head_dim)
                 # TODO: add support for attn.scale when we move to Torch 2.1
                 current_ip_hidden_states = dispatch_attention_fn(
-                    ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+                    ip_query,
+                    ip_key,
+                    ip_value,
+                    attn_mask=None,
+                    dropout_p=0.0,
+                    is_causal=False,
+                    backend=self._attention_backend,
                 )
                 current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
                     batch_size, -1, attn.heads * head_dim
@@ -2825,6 +2859,8 @@ class CogVideoXAttnProcessor2_0:
     query and key vectors, but does not include spatial normalization.
     """
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -2872,7 +2908,13 @@ def __call__(
                 key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
 
         hidden_states = dispatch_attention_fn(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
         )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
@@ -2894,6 +2936,8 @@ class FusedCogVideoXAttnProcessor2_0:
     query and key vectors, but does not include spatial normalization.
     """
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -2943,7 +2987,13 @@ def __call__(
                 key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
 
         hidden_states = dispatch_attention_fn(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
         )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
@@ -3129,9 +3179,10 @@ class AttnProcessorNPU:
     Processor for implementing flash attention using torch_npu. Torch_npu supports only fp16 and bf16 data types. If
     fp32 is used, F.scaled_dot_product_attention will be used for computation, but the acceleration effect on NPU is
     not significant.
-
     """
 
+    _attention_backend = None
+
     def __init__(self):
         if not is_torch_npu_available():
             raise ImportError("AttnProcessorNPU requires torch_npu extensions and is supported only on npu devices.")
@@ -3216,7 +3267,13 @@ def __call__(
         else:
             # TODO: add support for attn.scale when we move to Torch 2.1
             hidden_states = dispatch_attention_fn(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+                query,
+                key,
+                value,
+                attn_mask=attention_mask,
+                dropout_p=0.0,
+                is_causal=False,
+                backend=self._attention_backend,
             )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
@@ -3243,6 +3300,8 @@ class AttnProcessor2_0:
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
     """
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -3310,7 +3369,13 @@ def __call__(
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         hidden_states = dispatch_attention_fn(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
         )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
@@ -3553,6 +3618,8 @@ class MochiVaeAttnProcessor2_0:
     Attention processor used in Mochi VAE.
     """
 
+    _attention_backend = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -3614,7 +3681,13 @@ def __call__(
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         hidden_states = dispatch_attention_fn(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=attn.is_causal
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=attn.is_causal,
+            backend=self._attention_backend,
        )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
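Every hunk in this diff follows one pattern: the processor class gains a class-level `_attention_backend` default of `None`, and each call to `dispatch_attention_fn` forwards it through the `backend` keyword, so an unset backend keeps the dispatcher's default behavior. The following is a minimal, self-contained sketch of that pattern only; `toy_dispatch` and the `"flash"` identifier are illustrative stand-ins, not diffusers APIs.

def toy_dispatch(query, key, value, *, dropout_p=0.0, is_causal=False, backend=None):
    # A real dispatcher would select an attention kernel based on `backend`;
    # here we only report which backend was requested.
    return f"attention computed with backend={backend!r}"


class ToyAttnProcessor:
    # Class-level default: no explicit backend, let the dispatcher decide.
    _attention_backend = None

    def __call__(self, query, key, value):
        # Forward the per-instance (or class default) backend to the dispatcher,
        # mirroring the `backend=self._attention_backend` calls in the diff.
        return toy_dispatch(
            query, key, value, dropout_p=0.0, is_causal=False, backend=self._attention_backend
        )


processor = ToyAttnProcessor()
print(processor(None, None, None))      # backend=None -> dispatcher default

processor._attention_backend = "flash"  # hypothetical backend identifier
print(processor(None, None, None))      # backend='flash'

Because the attribute lives on the class with a `None` default, existing processors keep working unchanged, while callers can opt into a specific backend per instance without touching `__call__` signatures.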