diff --git a/torch/_inductor/kernel/flex/flex_decoding.py b/torch/_inductor/kernel/flex/flex_decoding.py
index 679caa9f09..6192275691 100644
--- a/torch/_inductor/kernel/flex/flex_decoding.py
+++ b/torch/_inductor/kernel/flex/flex_decoding.py
@@ -326,6 +326,9 @@ def create_flex_decoding_kernel(*args, **kwargs):
         # Set default to False
         cur_kernel_options.setdefault("USE_TMA", False)

+        if torch.xpu.is_available():
+            cur_kernel_options["USE_TMA"] = True
+
         # Add ROCm-specific parameters if they exist in the config
         for attrib in ["kpack", "matrix_instr_nonkdim", "waves_per_eu"]:
             if hasattr(conf, attrib):
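The hunk above turns `USE_TMA` on unconditionally whenever an XPU device is present, so any compiled flex-attention call that Inductor lowers through `create_flex_decoding_kernel` on such a build takes the tensor-descriptor load path added to the template below. A decode-shaped call (a single query token against a long KV cache) is the simplest way to reach this kernel. The sketch below is illustrative and not part of this commit; the shapes, dtype, and availability of an XPU build are assumptions:

import torch
from torch.nn.attention.flex_attention import flex_attention

# Decode shape: Q_LEN == 1 against a long KV cache, which Inductor
# typically lowers to the flex_decoding kernel patched above.
q = torch.randn(1, 8, 1, 64, device="xpu", dtype=torch.float16)
k = torch.randn(1, 8, 4096, 64, device="xpu", dtype=torch.float16)
v = torch.randn(1, 8, 4096, 64, device="xpu", dtype=torch.float16)

compiled_flex = torch.compile(flex_attention)
out = compiled_flex(q, k, v)  # default score_mod / block_mask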
diff --git a/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja b/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja
index f4e894d9b7..3fb3b2c5bd 100644
--- a/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja
+++ b/torch/_inductor/kernel/flex/templates/flex_decode.py.jinja
@@ -128,11 +128,28 @@
         # last valid block according to sparse mask
         block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))

+        desc_k = None
+        desc_v = None
+        {%- if USE_TMA %}
+        desc_k = tl.make_tensor_descriptor(
+            base=K,
+            shape=[KV_LEN, QK_HEAD_DIM],
+            strides=[stride_kn, 1],
+            block_shape=[BLOCK_N, QK_HEAD_DIM_ROUNDED],
+        )
+
+        desc_v = tl.make_tensor_descriptor(
+            base=V,
+            shape=[KV_LEN, V_HEAD_DIM],
+            strides=[stride_vn, 1],
+            block_shape=[BLOCK_N, V_HEAD_DIM_ROUNDED],
+        )
+        {%- endif %}
         offs_n = tl.arange(0, BLOCK_N) + off_n

         acc, l_i, m_i = forward_inner(
             {{gen_argdefs()}},
-            q, K, V, None, None, Q_LEN, KV_LEN,
+            q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
             # accumulatd values
             acc, l_i, m_i,
             #offsets
@@ -163,11 +180,29 @@
         # last valid block according to sparse mask
         block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))

+        desc_k = None
+        desc_v = None
+        {%- if USE_TMA %}
+        desc_k = tl.make_tensor_descriptor(
+            base=K,
+            shape=[KV_LEN, QK_HEAD_DIM],
+            strides=[stride_kn, 1],
+            block_shape=[BLOCK_N, QK_HEAD_DIM_ROUNDED],
+        )
+
+        desc_v = tl.make_tensor_descriptor(
+            base=V,
+            shape=[KV_LEN, V_HEAD_DIM],
+            strides=[stride_vn, 1],
+            block_shape=[BLOCK_N, V_HEAD_DIM_ROUNDED],
+        )
+        {%- endif %}
+
         offs_n = tl.arange(0, BLOCK_N) + off_n

         acc, l_i, m_i = forward_inner(
             {{gen_argdefs()}},
-            q, K, V, None, None, Q_LEN, KV_LEN,
+            q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
             # accumulatd values
             acc, l_i, m_i,
             #offsets
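Both template hunks build the same pair of device-side descriptors, `desc_k` and `desc_v`, and pass them to `forward_inner` in place of the previous `None, None` placeholders, so the inner loop can issue descriptor-based (TMA-style) block loads of K and V rather than computing pointer grids by hand. Outside the Jinja machinery, the Triton API follows the pattern in the sketch below. This is a minimal sketch, not PyTorch code: the `tma_copy*` names are invented for illustration, it assumes a Triton build with device-side `tl.make_tensor_descriptor`, and on NVIDIA backends a host-side scratch allocator must be registered via `triton.set_allocator` before launch:

import torch
import triton
import triton.language as tl


@triton.jit
def tma_copy_kernel(in_ptr, out_ptr, M, N, stride_m,
                    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Same pattern as desc_k / desc_v in the patch: describe the 2D tensor
    # once (base, shape, strides, block shape), then address it by offsets.
    desc_in = tl.make_tensor_descriptor(
        base=in_ptr,
        shape=[M, N],
        strides=[stride_m, 1],  # innermost stride must be 1
        block_shape=[BLOCK_M, BLOCK_N],
    )
    desc_out = tl.make_tensor_descriptor(
        base=out_ptr,
        shape=[M, N],
        strides=[stride_m, 1],
        block_shape=[BLOCK_M, BLOCK_N],
    )
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Descriptor loads take element offsets; boundary handling is done by
    # the descriptor itself, so no explicit tl.load mask is needed.
    tile = desc_in.load([pid_m * BLOCK_M, pid_n * BLOCK_N])
    desc_out.store([pid_m * BLOCK_M, pid_n * BLOCK_N], tile)


def tma_copy(x: torch.Tensor) -> torch.Tensor:
    # Host-side allocator for descriptor scratch space (required on CUDA).
    triton.set_allocator(
        lambda size, align, stream: torch.empty(size, dtype=torch.int8, device=x.device)
    )
    out = torch.empty_like(x)
    M, N = x.shape
    BLOCK_M, BLOCK_N = 64, 64
    grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, BLOCK_N))
    tma_copy_kernel[grid](x, out, M, N, x.stride(0), BLOCK_M, BLOCK_N)
    return out

Note the design choice visible in the hunks: `desc_k` and `desc_v` default to `None` outside the `{%- if USE_TMA %}` branch, so the `forward_inner` call site can pass them unconditionally and the inner loop can fall back to ordinary pointer loads when they are absent; only the argument line changes rather than the whole loop body.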