@@ -203,7 +203,7 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 def test_tensor_descriptor_store3d(dtype_str, K_BLOCK):
 
     if dtype_str == 'bfloat16':
-        return pytest.skip("FIXME: bfloat16 test fails verification")
+        return pytest.skip("FIXME: issue #4137")
 
     @triton.jit
     def kernel(out_ptr, a_ptr, M, N, K, stride_m, stride_n, stride_k, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr,
@@ -248,6 +248,138 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
     torch.testing.assert_close(expect, actual)
 
 
+@pytest.mark.parametrize("dtype_str", tma_dtypes)
+@pytest.mark.parametrize("num_ctas", [1])
+@pytest.mark.parametrize("ndim", [1, 2, 3, 4, 5])
+@pytest.mark.parametrize("INNER_BLOCK", [16, 32, 64, 128])
+def test_tensor_descriptor_load_nd(dtype_str, num_ctas, ndim, INNER_BLOCK):
+
+    if ndim not in [1] or dtype_str not in ["uint16", "uint32"]:
+        return pytest.skip("FIXME: issue #4139")
+
+    @triton.jit
+    def kernel(out_ptr, a_ptr, shape, strides, BLOCK_SHAPE):
+        desc = tl.make_tensor_descriptor(
+            a_ptr,
+            shape=shape,
+            strides=strides,
+            block_shape=BLOCK_SHAPE,
+        )
+        ndim: tl.constexpr = len(BLOCK_SHAPE)
+
+        offs = (0, ) * ndim
+        block = desc.load(offs)
+
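+        # Build a row-major linear index over BLOCK_SHAPE: for each axis k (last to first),
+        # broadcast an arange along that axis with expand_dims and scale it by the running
+        # stride of the trailing axes, so out_ptr + idx addresses a dense row-major buffer.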
+        idx = tl.full(BLOCK_SHAPE, 0, tl.int32)
+        stride = 1
+        for k in tl.static_range(ndim - 1, -1, -1):
+            arange = tl.arange(0, BLOCK_SHAPE[k])
+            for _ in tl.static_range(k):
+                arange = tl.expand_dims(arange, 0)
+            for _ in tl.static_range(k + 1, ndim):
+                arange = tl.expand_dims(arange, -1)
+
+            idx += arange * stride
+            stride *= BLOCK_SHAPE[k]
+
+        tl.store(out_ptr + idx, block)
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        return torch.empty(size, dtype=torch.int8, device="xpu")
+
+    triton.set_allocator(alloc_fn)
+
+    alloc_shape = (1, 1, 3, 7, INNER_BLOCK)[-ndim:]
+    inp = to_triton(numpy_random(alloc_shape, dtype_str), device="xpu", dst_type=dtype_str)
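+    # Trim the last dimension so the logical tensor is smaller than the block; the
+    # out-of-bounds tail of the loaded block is expected to read back as zero.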
+    inp.data = inp.data[..., :INNER_BLOCK - 3]
+
+    if INNER_BLOCK * inp.element_size() < 32:
+        return pytest.xfail("Invalid last dim size")
+
+    BLOCK_SHAPE = (2, 2, 4, 8, INNER_BLOCK)[-ndim:]
+    out = inp.new_empty(BLOCK_SHAPE)
+
+    constexpr_block_shape = tuple(tl.constexpr(v) for v in BLOCK_SHAPE)
+    kernel[(1, )](out, inp, inp.shape, inp.stride(), constexpr_block_shape, num_ctas=num_ctas)
+
+    # Check in-bounds
+    actual = unwrap_tensor(out)
+    expect = unwrap_tensor(inp)
+    idx = [slice(None, s) for s in inp.shape]
+    torch.testing.assert_close(expect, actual[idx])
+
+    # Check out-of-bounds
+    actual[idx].zero_()
+    expect = expect.new_zeros(BLOCK_SHAPE)
+    torch.testing.assert_close(expect, actual)
+
+
+@pytest.mark.parametrize("dtype_str", tma_dtypes)
+@pytest.mark.parametrize("num_ctas", [1])
+@pytest.mark.parametrize("ndim", [1, 2, 3, 4, 5])
+@pytest.mark.parametrize("INNER_BLOCK", [16, 32, 64, 128])
+def test_tensor_descriptor_store_nd(dtype_str, num_ctas, ndim, INNER_BLOCK):
+
+    if ndim not in [1]:
+        return pytest.skip("FIXME: issue #4140")
+
+    @triton.jit
+    def kernel(out_ptr, a_ptr, shape, strides, BLOCK_SHAPE):
+        desc = tl.make_tensor_descriptor(
+            out_ptr,
+            shape=shape,
+            strides=strides,
+            block_shape=BLOCK_SHAPE,
+        )
+        ndim: tl.constexpr = len(BLOCK_SHAPE)
+
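+        # Same row-major index construction as the load test: gather a dense block from
+        # a_ptr and write it through the descriptor at offset zero.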
+        idx = tl.full(BLOCK_SHAPE, 0, tl.int32)
+        stride = 1
+        for k in tl.static_range(ndim - 1, -1, -1):
+            arange = tl.arange(0, BLOCK_SHAPE[k])
+            for _ in tl.static_range(k):
+                arange = tl.expand_dims(arange, 0)
+            for _ in tl.static_range(k + 1, ndim):
+                arange = tl.expand_dims(arange, -1)
+
+            idx += arange * stride
+            stride *= BLOCK_SHAPE[k]
+
+        block = tl.load(a_ptr + idx)
+
+        offs = (0, ) * ndim
+        desc.store(offs, block)
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        return torch.empty(size, dtype=torch.int8, device="xpu")
+
+    triton.set_allocator(alloc_fn)
+
+    BLOCK_SHAPE = (2, 2, 4, 8, INNER_BLOCK)[-ndim:]
+    inp = to_triton(numpy_random(BLOCK_SHAPE, dtype_str), device="xpu", dst_type=dtype_str)
+
+    if INNER_BLOCK * inp.element_size() < 32:
+        return pytest.xfail("Invalid last dim size")
+
+    out = inp.new_empty(BLOCK_SHAPE)
+    out.data.fill_(-1)
+
+    desc_shape = (1, 1, 3, 7, INNER_BLOCK)[-ndim:]
+    constexpr_block_shape = tuple(tl.constexpr(v) for v in BLOCK_SHAPE)
+    kernel[(1, )](out, inp, desc_shape, out.stride(), constexpr_block_shape, num_ctas=num_ctas)
+
+    # Check in-bounds
+    actual = unwrap_tensor(out)
+    expect = unwrap_tensor(inp)
+    idx = [slice(None, s) for s in desc_shape]
+    torch.testing.assert_close(expect[idx], actual[idx])
+
+    # Check out-of-bounds
+    actual[idx].fill_(-1)
+    expect = expect.new_full(BLOCK_SHAPE, -1)
+    torch.testing.assert_close(expect, actual)
+
+
 @triton.jit(noinline=False)
 def tensor_descriptor_in_function_helper(out_ptr, in_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
     in_desc = tl.make_tensor_descriptor(
@@ -465,6 +597,186 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
     torch.testing.assert_close(ref_out, C, rtol=1e-3, atol=1e-3)
 
 
+@triton.jit
+def kernel_make_tensor_descriptor_loop_carried(a_ptr, M, N, MBLOCK: tl.constexpr, NBLOCK: tl.constexpr):
+    # Test that descriptors work as loop-carried values, including when they are
+    # conditionally rebuilt inside the loop body.
+    pid = tl.program_id(0)
+    moffset = MBLOCK * pid
+
+    a_desc = tl.make_tensor_descriptor(
+        a_ptr,
+        shape=[M, N],
+        strides=[N, 1],
+        block_shape=[MBLOCK, NBLOCK],
+    )
+
+    for i in range(0, N, NBLOCK):
+        assert isinstance(a_desc, tl.tensor_descriptor)
+        if i % (3 * NBLOCK) == 0:
+            a_desc = tl.make_tensor_descriptor(
+                a_ptr,
+                shape=[M, N],
+                strides=[N, 1],
+                block_shape=[MBLOCK, NBLOCK],
+            )
+            assert isinstance(a_desc, tl.tensor_descriptor)
+        assert isinstance(a_desc, tl.tensor_descriptor)
+        a = a_desc.load([moffset, i])
+        a_desc.store([moffset, i], a + 10)
+
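+    # Repeat the pattern with a while loop; together the two loops add 15 to every element.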
+    n = 0
+    while n < N:
+        assert isinstance(a_desc, tl.tensor_descriptor)
+        if n % (3 * NBLOCK) == 0:
+            assert isinstance(a_desc, tl.tensor_descriptor)
+            a_desc = tl.make_tensor_descriptor(
+                a_ptr,
+                shape=[M, N],
+                strides=[N, 1],
+                block_shape=[MBLOCK, NBLOCK],
+            )
+        assert isinstance(a_desc, tl.tensor_descriptor)
+        a = a_desc.load([moffset, n])
+        a_desc.store([moffset, n], a + 5)
+
+        n += NBLOCK
+
+
+@pytest.mark.interpreter
+def test_make_tensor_descriptor_loop_carried():
+    return pytest.skip("FIXME: issue #4132")
+
+    device = "xpu"
+    M, N = 64, 512
+    torch.manual_seed(42)
+    A = torch.randn((M, N), dtype=torch.float32, device=device)
+    MBLOCK, NBLOCK = 8, 128
+    grid = (triton.cdiv(M, MBLOCK), )
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert size == 128 * grid[0]
+        assert align == 128
+        assert stream == 0
+        return torch.empty(size, dtype=torch.int8, device="xpu")
+
+    triton.set_allocator(alloc_fn)
+
+    ref_out = A + 15
+    kernel_make_tensor_descriptor_loop_carried[grid](
+        A,
+        M,
+        N,
+        MBLOCK,
+        NBLOCK,
+    )
+    torch.testing.assert_close(ref_out, A)
+
+
+@triton.jit
+def batched_gemm_2d_tma_kernel(a_ptr, b_ptr, c_ptr,  #
+                               B, M, N, K,  #
+                               dtype: tl.constexpr,  #
+                               BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,  #
+                               NUM_SMS: tl.constexpr):
+    start_pid = tl.program_id(axis=0)
+    num_tiles_m = tl.cdiv(M, BLOCK_M)
+    num_tiles_n = tl.cdiv(N, BLOCK_N)
+    k_tiles = tl.cdiv(K, BLOCK_K)
+    num_tiles_per_batch = num_tiles_m * num_tiles_n
+    num_tiles = B * num_tiles_per_batch
+
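+    # Persistent-kernel style: the B * num_tiles_m * num_tiles_n output tiles are
+    # distributed round-robin over NUM_SMS programs.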
+    tiles_per_SM = num_tiles // NUM_SMS
+    if start_pid < num_tiles % NUM_SMS:
+        tiles_per_SM += 1
+
+    tile_id = start_pid - NUM_SMS
+    ki = -1
+
+    tile_m = 0
+    tile_n = 0
+    tile_b = 0
+
+    offs_m = 0
+    offs_n = 0
+    offs_b = 0
+
+    a_desc = tl.make_tensor_descriptor(a_ptr + offs_b * (M * K), [M, K], [K, 1], [BLOCK_M, BLOCK_K])
+    b_desc = tl.make_tensor_descriptor(b_ptr + offs_b * (N * K), [N, K], [K, 1], [BLOCK_N, BLOCK_K])
+    c_desc = tl.make_tensor_descriptor(c_ptr + offs_b * (M * N), [M, N], [N, 1], [BLOCK_M, BLOCK_N])
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
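+    # Single flattened loop over all K-slices of all assigned tiles: ki wraps back to 0
+    # every k_tiles iterations, which is the signal to advance to the next output tile.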
+    for _ in range(k_tiles * tiles_per_SM):
+        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
+        if ki == 0:
+            tile_id += NUM_SMS
+            tile_b = tile_id // num_tiles_per_batch
+            tile_m = (tile_id // num_tiles_n) % num_tiles_m
+            tile_n = tile_id % num_tiles_n
+
+            offs_b = tile_b
+            offs_m = tile_m * BLOCK_M
+            offs_n = tile_n * BLOCK_N
+
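+            # The descriptor base pointers encode the batch, so rebuild all three
+            # descriptors whenever the loop moves on to a new output tile.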
+            a_desc = tl.make_tensor_descriptor(a_ptr + offs_b * (M * K), [M, K], [K, 1], [BLOCK_M, BLOCK_K])
+            b_desc = tl.make_tensor_descriptor(b_ptr + offs_b * (N * K), [N, K], [K, 1], [BLOCK_N, BLOCK_K])
+            c_desc = tl.make_tensor_descriptor(c_ptr + offs_b * (M * N), [M, N], [N, 1], [BLOCK_M, BLOCK_N])
+
+        offs_k = ki * BLOCK_K
+
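+        # b is laid out as [N, K], so a [BLOCK_N, BLOCK_K] tile is loaded and transposed for the dot.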
+        a = a_desc.load([offs_m, offs_k])
+        b = b_desc.load([offs_n, offs_k])
+        accumulator = tl.dot(a, b.T, accumulator)
+
+        if ki == k_tiles - 1:
+            c = accumulator.to(dtype)
+
+            c_desc.store([offs_m, offs_n], c)
+            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+
+@pytest.mark.interpreter
+def test_tensor_descriptor_batched_gemm_2d_tma():
+    return pytest.skip("FIXME: issue #4132")
+
+    device = "xpu"
+    BLOCK_M, BLOCK_N, BLOCK_K = 128, 256, 64
+    if is_interpreter():
+        B, M, N, K = 2, BLOCK_M, BLOCK_N, BLOCK_K
+    else:
+        B, M, N, K = 2, 1024, 1024, 128
+    NUM_SMS = 96
+    num_stages = 3
+
+    grid = (min(NUM_SMS, B * triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N)), )
+
+    a = torch.randn((B, M, K), device=device, dtype=torch.float16)
+    b = torch.randn((B, N, K), device=device, dtype=torch.float16)
+    c = torch.empty((B, M, N), device=device, dtype=torch.float16)
+
+    expect = torch.bmm(a, b.mT)
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        # TODO: should only need num_stages * 3 descriptors per SM
+        assert size == 128 * 3 * (num_stages + 1) * grid[0]
+        assert align == 128
+        assert stream == 0
+        return torch.empty(size, dtype=torch.int8, device="xpu")
+
+    triton.set_allocator(alloc_fn)
+
+    batched_gemm_2d_tma_kernel[grid](
+        a, b, c,  #
+        B, M, N, K,  #
+        tl.float16,  #
+        BLOCK_M, BLOCK_N, BLOCK_K,  #
+        NUM_SMS,  #
+        num_stages=num_stages, num_warps=8)
+    torch.xpu.synchronize()
+
+    torch.testing.assert_close(c, expect, rtol=1e-3, atol=1e-3)
+
+
 @triton.jit
 def batched_gemm_3d_tma_kernel(a_ptr, b_ptr, c_ptr,  #
                                B, M, N, K,  #