@@ -187,13 +187,16 @@ def list_backends(cls):
 
 
 @contextlib.contextmanager
-def attention_backend(backend: AttentionBackendName = AttentionBackendName.NATIVE):
+def attention_backend(backend: Union[str, AttentionBackendName] = AttentionBackendName.NATIVE):
     """
     Context manager to set the active attention backend.
     """
     if backend not in _AttentionBackendRegistry._backends:
         raise ValueError(f"Backend {backend} is not registered.")
 
+    backend = AttentionBackendName(backend)
+    _check_attention_backend_requirements(backend)
+
     old_backend = _AttentionBackendRegistry._active_backend
     _AttentionBackendRegistry._active_backend = backend
 
@@ -226,8 +229,6 @@ def dispatch_attention_fn(
     backend_name = AttentionBackendName(backend)
     backend_fn = _AttentionBackendRegistry._backends.get(backend_name)
 
-    _check_backend_requirements(backend_name)
-
     kwargs = {
         "query": query,
         "key": key,
@@ -316,10 +317,7 @@ def _check_shape(
 # ===== Helper functions =====
 
 
-# LRU cache is hack to avoid checking the backend requirements multiple times. Maybe not needed
-# because CPU is running much farther ahead of the accelerator and this will not be blocking anyway.
-@functools.lru_cache(maxsize=16)
-def _check_backend_requirements(backend: AttentionBackendName) -> None:
+def _check_attention_backend_requirements(backend: AttentionBackendName) -> None:
     if backend in [AttentionBackendName.FLASH, AttentionBackendName.FLASH_VARLEN]:
         if not _CAN_USE_FLASH_ATTN:
             raise RuntimeError(
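
A minimal usage sketch of the change above, assuming the `attention_backend` and `dispatch_attention_fn` definitions shown in the diff (the tensors and the `"flash"` backend string are illustrative placeholders, not taken from this patch). Because `_check_attention_backend_requirements` now runs once on context entry, the per-call check in `dispatch_attention_fn` and its `functools.lru_cache` workaround are no longer needed:

```python
import torch

# Illustrative tensors; assumes a (batch, heads, seq_len, head_dim) layout.
# Flash attention requires a CUDA device and half-precision dtypes.
query = key = value = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)

# Plain strings are now accepted: the context manager coerces them with
# AttentionBackendName(backend) and validates backend requirements once
# on entry, instead of on every dispatch_attention_fn call.
with attention_backend("flash"):
    out = dispatch_attention_fn(query, key, value)

# On exit, the previously active backend is restored.
```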