@@ -278,21 +278,33 @@ def __init__(
         # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
         # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
         # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
-        # If torch_xla is available with the correct version, we use pallas flash attention kernel to improve
-        # the performance.
         if processor is None:
-            if hasattr(F, "scaled_dot_product_attention") and self.scale_qk:
-                if (
-                    is_torch_xla_available
-                    and is_torch_xla_version('>', '2.2')
-                    and (not is_spmd() or is_torch_xla_version('>', '2.3'))
-                ):
-                    processor = XLAFlashAttnProcessor2_0()
-                else:
-                    processor = AttnProcessor2_0()
-            else:
-                processor = AttnProcessor()
+            processor = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_use_xla_flash_attention(self, use_xla_flash_attention: bool, partition_spec: Optional[Tuple[Optional[str], ...]] = None) -> None:
+        r"""
+        Set whether to use xla flash attention from `torch_xla` or not.

+        Args:
+            use_xla_flash_attention (`bool`):
+                Whether to use pallas flash attention kernel from `torch_xla` or not.
+            partition_spec (`Tuple[]`, *optional*):
+                Specify the partition specification if using SPMD. Otherwise None.
+        """
+        if (
+            use_xla_flash_attention
+            and is_torch_xla_available
+            and is_torch_xla_version('>', '2.2')
+            and (not is_spmd() or is_torch_xla_version('>', '2.3'))
+        ):
+            processor = XLAFlashAttnProcessor2_0(partition_spec)
+        else:
+            processor = (
+                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            )
         self.set_processor(processor)

     def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
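A minimal usage sketch of the new setter (not part of this diff), assuming a diffusers model such as `UNet2DConditionModel` whose `Attention` modules expose `set_use_xla_flash_attention`; the checkpoint id and the partition spec are placeholder values:

from diffusers import UNet2DConditionModel

# Hypothetical checkpoint; any model whose Attention modules carry the new setter works the same way.
unet = UNet2DConditionModel.from_pretrained("some/checkpoint", subfolder="unet")
for module in unet.modules():
    if hasattr(module, "set_use_xla_flash_attention"):
        # Example SPMD partition spec; pass None when not sharding.
        module.set_use_xla_flash_attention(True, partition_spec=("data", None, None, None))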
@@ -2772,16 +2784,17 @@ def __call__(

 class XLAFlashAttnProcessor2_0:
     r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using torch_xla).
+    Processor for implementing scaled dot-product attention with pallas flash attention kernel if using `torch_xla`.
     """

-    def __init__(self):
+    def __init__(self, partition_spec: Optional[Tuple[Optional[str], ...]] = None):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("XLAFlashAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
         if is_torch_xla_version("<", "2.3"):
             raise ImportError("XLA flash attention requires torch_xla version >= 2.3.")
         if is_spmd() and is_torch_xla_version("<", "2.4"):
             raise ImportError("SPMD support for XLA flash attention needs torch_xla version >= 2.4.")
+        self.partition_spec = partition_spec

     def __call__(
         self,
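Alternatively, a sketch (also not from this diff) of installing the processor explicitly rather than through `set_use_xla_flash_attention`, assuming `unet` is a model exposing the standard `set_attn_processor` helper:

from diffusers.models.attention_processor import XLAFlashAttnProcessor2_0

# The stored partition_spec is only consulted when SPMD is active; otherwise None is passed to the kernel.
processor = XLAFlashAttnProcessor2_0(partition_spec=("data", None, None, None))
unet.set_attn_processor(processor)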
@@ -2854,7 +2867,7 @@ def __call__(
                 # Apply attention mask to key
                 key = key + attention_mask
             query /= math.sqrt(query.shape[3])
-            partition_spec = ("data", None, None, None) if is_spmd() else None
+            partition_spec = self.partition_spec if is_spmd() else None
             hidden_states = flash_attention(query, key, value, causal=False, partition_spec=partition_spec)
         else:
             hidden_states = F.scaled_dot_product_attention(
@@ -5201,6 +5214,7 @@ def __init__(self):
     FusedCogVideoXAttnProcessor2_0,
     XFormersAttnAddedKVProcessor,
     XFormersAttnProcessor,
+    XLAFlashAttnProcessor2_0,
     AttnProcessorNPU,
     AttnProcessor2_0,
     MochiVaeAttnProcessor2_0,