Commit 69e14ad

Unwrap tensor descriptor kernel argument into its members, and update launch code. (#4820)
Fixes issue #4289.

Signed-off-by: Tiotto, Ettore <[email protected]>

1 parent 77894ef commit 69e14ad
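
In short, the launcher stops passing a host-side tensor descriptor as one opaque object and instead passes its members (base pointer, shape, strides) as plain scalar arguments. A minimal sketch of the idea, using a hypothetical stand-in class (names here are illustrative, not the driver's API):

```python
# Sketch of the unwrapping this commit implements in the XPU launcher
# (third_party/intel/backend/driver.py); Desc2D is a hypothetical stand-in
# for triton.tools.tensor_descriptor.TensorDescriptor.
from dataclasses import dataclass
from typing import List

@dataclass
class Desc2D:
    base: int                  # device base pointer
    shape: List[int]           # [M, N]
    strides: List[int]         # [stride_m, stride_n]

def unwrap(arg):
    """Expand a descriptor into the scalars the generated C launcher parses."""
    if isinstance(arg, Desc2D):
        # Shape and strides are passed twice; see the driver comment below.
        return [arg.base, *arg.shape, *arg.strides, *arg.shape, *arg.strides]
    return [arg]

kernel_args = [Desc2D(base=0x7F00, shape=[64, 64], strides=[64, 1]), 42]
flat = [x for arg in kernel_args for x in unwrap(arg)]
assert len(flat) == (1 + 4 * 2) + 1  # base + 2*(shape+strides) per 2-D descriptor
```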

File tree: 2 files changed (+84, -20 lines)


python/test/unit/language/test_tensor_descriptor.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -1507,8 +1507,6 @@ def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK
         pytest.skip("Broken on rocm")
     if is_xpu():
         if (kind, dtype_str) in [("add", "bfloat16")]:
-            if descriptor == "host":
-                pytest.skip("FIXME: issue #4289")
             pytest.skip("FIXME: issue #3914")
 
     @triton.jit(debug=True)
@@ -1593,8 +1591,6 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 def test_host_tensor_descriptor_load(dtype_str, num_ctas, M_BLOCK, N_BLOCK, device):
     if num_ctas == 2 and (not is_cuda() or torch.cuda.get_device_capability(0)[0] not in (9, 10)):
         pytest.xfail("CTAs is unsupported for these cards")
-    if is_xpu():
-        pytest.skip("FIXME: issue #4289")
 
     @triton.jit(debug=True)
     def kernel(out_ptr, desc, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
@@ -1658,8 +1654,6 @@ def test_host_tensor_descriptor_matmul(num_stages, num_ctas, BLOCK_M, BLOCK_N, B
 
     if is_hip() and (BLOCK_M, BLOCK_N, BLOCK_K, num_stages) == (256, 128, 32, 4):
        pytest.skip("Insufficient shared memory on HIP devices")
-    if is_xpu():
-        pytest.skip("FIXME: issue #4289")
 
     if is_interpreter():
         M, N, K = BLOCK_M, BLOCK_N, BLOCK_K
```
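
These skips gated the host-side tensor descriptor path on XPU; with the launcher fix below they are no longer needed. For context, the re-enabled tests build a descriptor on the host and pass it directly to the kernel, roughly like this (a condensed sketch modeled on the surrounding tests, not a verbatim excerpt):

```python
import torch
import triton
import triton.language as tl
from triton.tools.tensor_descriptor import TensorDescriptor

@triton.jit
def copy_kernel(out_ptr, desc, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
    # Load one tile through the host-created descriptor and store it out.
    block = desc.load([0, 0])
    idx = tl.arange(0, M_BLOCK)[:, None] * N_BLOCK + tl.arange(0, N_BLOCK)[None, :]
    tl.store(out_ptr + idx, block)

x = torch.randn(64, 64, device="xpu")
out = torch.empty(16, 16, device="xpu")
desc = TensorDescriptor.from_tensor(x, block_shape=[16, 16])  # host-side descriptor
copy_kernel[(1, )](out, desc, M_BLOCK=16, N_BLOCK=16)
torch.testing.assert_close(out, x[:16, :16])
```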

third_party/intel/backend/driver.py

Lines changed: 84 additions & 14 deletions
```diff
@@ -1,5 +1,6 @@
 import importlib.metadata
 import os
+import re
 import hashlib
 import shutil
 import ctypes
@@ -13,6 +14,7 @@
 from triton.runtime.cache import get_cache_manager
 from triton.backends.compiler import GPUTarget
 from triton.backends.driver import DriverBase
+from triton.tools.tensor_descriptor import TensorDescriptor
 
 # A hard-coded cache version that can be updated when we know that the cached file is invalid and
 # there are no other ways to detect that the runtime environment has changed. For example, a shared
```
```diff
@@ -370,10 +372,40 @@ def ty_to_cpp(ty):
 
 def make_launcher(constants, signature):
 
-    def _serialize_signature(sig):
+    def _expand_signature(signature):
+        output = []
+        # Expand tensor descriptor arguments into base pointer, shape, and
+        # strides
+        for sig in signature:
+            if isinstance(sig, str) and sig.startswith("tensordesc"):
+                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
+                dtype = match.group(1)
+                shape = match.group(2)
+                ndim = shape.count(",") + 1
+
+                output.append("*" + dtype)
+                # Currently the host side tensor descriptors get passed in as a
+                # tensor desc, shape, and strides. We have no way to use these
+                # shape and strides when processing tensor descriptors which is
+                # why we provide our own decomposition above. Sadly this means
+                # we have to pass the shape and strides twice.
+                for _ in range(2 * ndim):
+                    output.append("i64")
+                for _ in range(ndim):
+                    output.append("i32")
+                for _ in range(ndim):
+                    output.append("i64")
+            else:
+                output.append(sig)
+
+        return output
+
+    def _flatten_signature(sig, output):
         if isinstance(sig, tuple):
-            return ','.join(map(_serialize_signature, sig))
-        return sig
+            for x in sig:
+                _flatten_signature(x, output)
+        else:
+            output.append(sig)
 
     def _extracted_type(ty):
         if isinstance(ty, tuple):
```
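
To make the expansion concrete: each `tensordesc<dtype[dims]>` entry becomes 1 + 4*ndim scalar entries. A standalone trace of the same logic (a sketch mirroring `_expand_signature`, not the driver code itself):

```python
import re

def expand_one(sig):
    # Mirrors the tensordesc branch of _expand_signature above (sketch).
    match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
    dtype, dims = match.group(1), match.group(2)
    ndim = dims.count(",") + 1
    # Base pointer, then shape and strides as i64 (for the descriptor itself),
    # then the frontend's decomposition: shape as i32, strides as i64.
    return ["*" + dtype] + ["i64"] * (2 * ndim) + ["i32"] * ndim + ["i64"] * ndim

print(expand_one("tensordesc<fp16[64,32]>"))
# ['*fp16', 'i64', 'i64', 'i64', 'i64', 'i32', 'i32', 'i64', 'i64']
```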
```diff
@@ -408,11 +440,16 @@ def format_of(ty):
             "uint64_t": "K",
         }[ty_to_cpp(ty)]
 
+    expand_signature = _expand_signature(signature.values())
+    signature = {i: s for i, s in enumerate(expand_signature)}
+
     args_format = ''.join([format_of(ty) for ty in signature.values()])
     format = _BASE_ARGS_FORMAT + args_format
-    signature = ','.join(map(_serialize_signature, signature.values()))
-    signature = list(filter(bool, signature.split(',')))
-    signature = {i: s for i, s in enumerate(signature)}
+
+    flat_signature = []
+    for sig in signature.values():
+        _flatten_signature(sig, flat_signature)
+    signature = {i: s for i, s in enumerate(flat_signature)}
     args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
     # Record the end of regular arguments;
     # subsequent arguments are architecture-specific descriptors.
```
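
`_flatten_signature` then linearizes tuple-typed entries so that every leaf gets its own `_arg{i}` slot and one character in the parse format. For example (a sketch using the helper exactly as defined above):

```python
def _flatten_signature(sig, output):
    # Recurse into tuples, emit leaves (same logic as the helper above).
    if isinstance(sig, tuple):
        for x in sig:
            _flatten_signature(x, output)
    else:
        output.append(sig)

flat = []
for sig in ["*fp32", ("i32", ("i64", "*fp16")), "i1"]:
    _flatten_signature(sig, flat)
assert flat == ["*fp32", "i32", "i64", "*fp16", "i1"]
```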
```diff
@@ -632,9 +669,10 @@ def format_of(ty):
       PyObject* py_kernel;
 
       {newline.join([f"{_extracted_type(ty)} _arg{i};" for i, ty in signature.items()])}
-      if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &py_obj_stream, &py_kernel,
-                           &kernel_metadata, &launch_metadata,
-                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
+      if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ,
+                           &py_obj_stream, &py_kernel,
+                           &kernel_metadata, &launch_metadata,
+                           &launch_enter_hook, &launch_exit_hook{args_list})) {{
        return NULL;
      }}
 
```
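
Each flattened entry contributes one character to the `PyArg_ParseTuple` format string, so the unwrapped descriptor members are parsed as ordinary scalars. A sketch of that bookkeeping; only the `uint64_t` to `"K"` mapping is visible in this diff, and the `"i"`/`"L"` entries are assumptions about `format_of`:

```python
# Hypothetical format mapping; only "uint64_t" -> "K" appears in the diff above.
FORMAT_OF = {"*fp16": "K", "i32": "i", "i64": "L"}

expanded = ["*fp16"] + ["i64"] * 4 + ["i32"] * 2 + ["i64"] * 2  # one 2-D descriptor
args_format = "".join(FORMAT_OF[ty] for ty in expanded)
print(args_format)  # KLLLLiiLL -> one format char per unwrapped member
```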
```diff
@@ -703,6 +741,32 @@ def format_of(ty):
     return src
 
 
+def wrap_handle_tensor_descriptor(launcher):
+    """
+    Replace all tensor descriptors with the base ptr, shape, and strides
+    """
+
+    def inner(args):
+        meta_args = args[:len(_BASE_ARGS_FORMAT)]
+        raw_kernel_args = args[len(_BASE_ARGS_FORMAT):]
+        final_args = []
+        for arg in raw_kernel_args:
+            if isinstance(arg, TensorDescriptor):
+                # Currently the host side tensor descriptors get decomposed in
+                # the frontend to tensor desc, shape, and strides. We have no
+                # way to use these shape and strides when processing tensor
+                # descriptors which is why we provide our own decomposition
+                # above. Sadly this means we have to pass the shape and strides
+                # twice.
+                final_args.extend([arg.base, *arg.shape, *arg.strides, *arg.shape, *arg.strides])
+            else:
+                final_args.append(arg)
+
+        return launcher(meta_args + tuple(final_args))
+
+    return inner
+
+
 def serialize_args(args, constants, signature):
     import torch
     import numbers
```
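
At launch time the wrapper applies the same decomposition to the runtime values, so the flattened argument tuple lines up one-to-one with the expanded signature. A standalone mirror of `inner` (a sketch with a fake descriptor class; the real wrapper checks `isinstance(arg, TensorDescriptor)` and slices off `len(_BASE_ARGS_FORMAT)` meta arguments):

```python
class FakeDesc:
    # Stand-in for TensorDescriptor with the three members the wrapper reads.
    def __init__(self, base, shape, strides):
        self.base, self.shape, self.strides = base, shape, strides

def unwrap_args(args, n_meta):
    meta_args, kernel_args = args[:n_meta], args[n_meta:]
    final = []
    for arg in kernel_args:
        if isinstance(arg, FakeDesc):
            # base, shape, strides, then shape and strides again.
            final.extend([arg.base, *arg.shape, *arg.strides, *arg.shape, *arg.strides])
        else:
            final.append(arg)
    return meta_args + tuple(final)

args = (0, 1, 2) + (FakeDesc(0xDEAD, [64, 32], [32, 1]), 7)
assert unwrap_args(args, 3) == (0, 1, 2, 0xDEAD, 64, 32, 32, 1, 64, 32, 32, 1, 7)
```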
```diff
@@ -767,17 +831,23 @@ class XPULauncher(object):
     def __init__(self, src, metadata):
         constants = src.constants if hasattr(src, "constants") else dict()
         arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
-        self.constants = {arg_idx(idx): value for idx, value in constants.items()}
-        self.signature = {idx: value for idx, value in src.signature.items()}
-        src = make_launcher(self.constants, self.signature)
+        constants = {arg_idx(idx): value for idx, value in constants.items()}
+        signature = {idx: value for idx, value in src.signature.items()}
+        src = make_launcher(constants, signature)
         self.mod = compile_module_from_src(src=src, name="__triton_launcher")
+        has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
+
+        self.launch = wrap_handle_tensor_descriptor(self.mod.launch) if has_tensor_desc_arg else self.mod.launch
+
         # Serialize KernelArguments for SPIR-V Runner
         self.serialize_kernel_args = knobs.intel.dump_spirv_kernel_args
+        self.constants = constants
+        self.signature = signature
 
-    def __call__(self, *args, **kwargs):
+    def __call__(self, *args):
         if self.serialize_kernel_args:
             serialize_args(args, self.constants, self.signature)
-        self.mod.launch(args)
+        self.launch(args)
 
 
 class XPUDriver(DriverBase):
```
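
Note that the wrapper is selected once in `__init__`, so kernels without descriptor arguments keep calling `self.mod.launch` directly with no per-call overhead. The two halves of the change maintain a simple invariant: `_expand_signature` emits 1 + 4*ndim signature entries per descriptor, and `inner` supplies exactly as many runtime values. A quick consistency check (hypothetical helper, not part of the driver):

```python
def check_descriptor_invariant(ndim):
    # Signature side: "*dtype" plus 2*ndim i64, ndim i32, ndim i64.
    n_sig = 1 + 2 * ndim + ndim + ndim
    # Launch side: base, *shape, *strides, *shape, *strides.
    n_args = 1 + 4 * ndim
    assert n_sig == n_args, (n_sig, n_args)

for ndim in (1, 2, 3):
    check_descriptor_invariant(ndim)
```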
