@@ -383,7 +383,9 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
383383
384384
385385@pytest .mark .interpreter
386- def test_tensor_descriptor_padding ():
386+ def test_tensor_descriptor_padding (device ):
387+ if not is_cuda ():
388+ pytest .xfail ("padding is unsupported" )
387389
388390 @triton .jit
389391 def device_tma_load (in_ptr , out_ptr , IM , IN , YM , YN , M_BLOCK : tl .constexpr , N_BLOCK : tl .constexpr ,
@@ -414,7 +416,7 @@ def host_tma_load(in_desc, out_ptr, YM, YN, M_BLOCK: tl.constexpr, N_BLOCK: tl.c
414416
415417 # TMA descriptors require a global memory allocation
416418 def alloc_fn (size : int , alignment : float , stream : float ):
417- return torch .ones (size , device = "cuda" , dtype = torch .float32 )
419+ return torch .ones (size , device = device , dtype = torch .float32 )
418420
419421 triton .set_allocator (alloc_fn )
420422
@@ -423,16 +425,16 @@ def alloc_fn(size: int, alignment: float, stream: float):
423425 M_BLOCK = 32
424426 N_BLOCK = 32
425427 padding = "nan"
426- input = torch .arange (IM * IN , device = "cuda" , dtype = torch .float32 )
428+ input = torch .arange (IM * IN , device = device , dtype = torch .float32 )
427429 input = input .reshape (IM , IN )
428- out_device_tma = torch .zeros ((OM , ON ), device = "cuda" , dtype = torch .float32 )
429- out_host_tma = torch .zeros ((OM , ON ), device = "cuda" , dtype = torch .float32 )
430+ out_device_tma = torch .zeros ((OM , ON ), device = device , dtype = torch .float32 )
431+ out_host_tma = torch .zeros ((OM , ON ), device = device , dtype = torch .float32 )
430432 dummy_block = [M_BLOCK , N_BLOCK ]
431433 in_desc = TensorDescriptor (input , input .shape , input .stride (), dummy_block , padding = padding )
432434 grid = (triton .cdiv (OM , M_BLOCK ), triton .cdiv (ON , N_BLOCK ))
433435 device_tma_load [grid ](input , out_device_tma , IM , IN , OM , ON , M_BLOCK , N_BLOCK , padding )
434436 host_tma_load [grid ](in_desc , out_host_tma , OM , ON , M_BLOCK , N_BLOCK )
435- expected = torch .zeros ((OM , ON ), device = "cuda" , dtype = torch .float32 )
437+ expected = torch .zeros ((OM , ON ), device = device , dtype = torch .float32 )
436438 expected [0 :IN , 0 :IM ] = input
437439 expected [:, IN :ON ] = float ('nan' )
438440 expected [IM :OM , :] = float ('nan' )