Commit 869ed8a
[FlexAttn] Fix performance degradation (#5038)
Without this patch the tensor descriptor implementation is not used. The change in `flex_attention.py` was removed from pytorch/pytorch#143553 before merging, and the requirement in `can_use_tma` was too restrictive for using tensor descriptors.

Fixes #5036
Benchmark CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/17456013457
Signed-off-by: Whitney Tsang <[email protected]>
1 parent cfb23d7 commit 869ed8a
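A minimal sketch of the kernel-option selection the patch introduces, assuming an XPU-enabled PyTorch build: `cur_kernel_options` and `can_use_tma` are the names used in the patched Inductor files, while `choose_use_tma` below is only a hypothetical wrapper added to illustrate the branch, not actual Inductor code.

import torch

def choose_use_tma(query, key, value, cur_kernel_options, can_use_tma):
    # Sketch of the selection added to flex_attention / flex_decoding:
    # USE_TMA stays off by default, but is forced on when running on XPU
    # and can_use_tma() reports the tensors qualify for tensor descriptors.
    cur_kernel_options.setdefault("USE_TMA", False)
    if torch.xpu.is_available() and can_use_tma(query, key, value):
        cur_kernel_options["USE_TMA"] = True
    return cur_kernel_options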

File tree

3 files changed (+50 / -22 lines)


scripts/patch-pytorch.sh

Lines changed: 1 addition & 0 deletions
@@ -36,3 +36,4 @@ echo "Applying PyTorch patches in $REPO_ROOT"
 
 # put your patch applies here
 apply_patch ./patch/flex_decoding_tensor_desc.patch
+apply_patch ./patch/use_tma.patch

scripts/patch/flex_decoding_tensor_desc.patch

Lines changed: 0 additions & 22 deletions
@@ -1,25 +1,3 @@
-diff --git a/torch/_inductor/kernel/flex/flex_decoding.py b/torch/_inductor/kernel/flex/flex_decoding.py
-index 91ba941da0..a6b87212ad 100644
---- a/torch/_inductor/kernel/flex/flex_decoding.py
-+++ b/torch/_inductor/kernel/flex/flex_decoding.py
-@@ -6,6 +6,7 @@ from typing import Any
- import sympy
-
- import torch
-+from torch._inductor.utils import can_use_tma
- from torch._inductor.virtualized import V
-
- from ... import ir
-@@ -326,6 +327,9 @@ def create_flex_decoding_kernel(*args, **kwargs):
-     # Set default to False
-     cur_kernel_options.setdefault("USE_TMA", False)
-
-+    if torch.xpu.is_available() and can_use_tma(query, key, value):
-+        cur_kernel_options["USE_TMA"] = True
-+
-     # Add ROCm-specific parameters if they exist in the config
-     for attrib in ["kpack", "matrix_instr_nonkdim", "waves_per_eu"]:
-         if hasattr(conf, attrib):
 diff --git a/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja b/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja
 index 31c64055e3..a75792787a 100644
 --- a/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja
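Note: the hunk removed here reappears unchanged in scripts/patch/use_tma.patch below, which additionally patches flex_attention.py and torch/_inductor/utils.py.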

scripts/patch/use_tma.patch

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+diff --git a/torch/_inductor/kernel/flex/flex_attention.py b/torch/_inductor/kernel/flex/flex_attention.py
+index 39c8f737c7..5a8df2c3f9 100644
+--- a/torch/_inductor/kernel/flex/flex_attention.py
++++ b/torch/_inductor/kernel/flex/flex_attention.py
+@@ -311,6 +311,9 @@ def flex_attention(
+         # USE TMA = false by default
+         cur_kernel_options.setdefault("USE_TMA", False)
+
++        if torch.xpu.is_available() and can_use_tma(query, key, value):
++            cur_kernel_options["USE_TMA"] = True
++
+         if cur_kernel_options["USE_TMA"] and can_use_tma(query, key, value):
+             cur_kernel_options["USE_TMA"] = True
+
+diff --git a/torch/_inductor/kernel/flex/flex_decoding.py b/torch/_inductor/kernel/flex/flex_decoding.py
+index 91ba941da0..a6b87212ad 100644
+--- a/torch/_inductor/kernel/flex/flex_decoding.py
++++ b/torch/_inductor/kernel/flex/flex_decoding.py
+@@ -6,6 +6,7 @@ from typing import Any
+ import sympy
+
+ import torch
++from torch._inductor.utils import can_use_tma
+ from torch._inductor.virtualized import V
+
+ from ... import ir
+@@ -326,6 +327,9 @@ def create_flex_decoding_kernel(*args, **kwargs):
+     # Set default to False
+     cur_kernel_options.setdefault("USE_TMA", False)
+
++    if torch.xpu.is_available() and can_use_tma(query, key, value):
++        cur_kernel_options["USE_TMA"] = True
++
+     # Add ROCm-specific parameters if they exist in the config
+     for attrib in ["kpack", "matrix_instr_nonkdim", "waves_per_eu"]:
+         if hasattr(conf, attrib):
+diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
+index 0876f99307..4fa1c87560 100644
+--- a/torch/_inductor/utils.py
++++ b/torch/_inductor/utils.py
+@@ -1696,7 +1696,7 @@ def can_use_tma(*matrices: IRNode, add_guards: bool = False) -> bool:
+     strides_i = [V.graph.sizevars.symbolic_hint(st) for st in strides]
+
+     # Every logical size ≥ 2
+-    if any(not V.graph.sizevars.statically_known_geq(s, 2) for s in sizes_i):
++    if not torch.xpu.is_available() and any(not V.graph.sizevars.statically_known_geq(s, 2) for s in sizes_i):
+         return False
+
+     # Find the single contiguous (“inner”) dim
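For context, a minimal sketch of what the utils.py hunk relaxes, with a plain list of ints standing in for Inductor's symbolic sizes (the real check runs inside `can_use_tma` via `V.graph.sizevars.statically_known_geq`; `sizes_allow_tma` is a hypothetical helper used only for illustration).

import torch

def sizes_allow_tma(sizes):
    # Upstream rejects TMA when any logical size is < 2; the patch skips
    # that rejection on XPU so the tensor-descriptor path remains usable.
    if not torch.xpu.is_available() and any(s < 2 for s in sizes):
        return False
    return True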
