[Tests] Replace block_ptr with tensor_descriptor (#6846)

peterbell10 · web-flow · commit 49a72f5e2e7c · 2025-05-16T21:15:34.000+01:00
This changes tests that aren't specifically testing block_ptr to use
tensor_descriptor instead.
diff --git a/python/test/unit/conftest.py b/python/test/unit/conftest.py
@@ -88,3 +88,16 @@ def fresh_knobs_except_libraries(monkeypatch):
         yield fresh_function()
     finally:
         reset_function()
+
+
+@pytest.fixture
+def with_allocator():  # fixture: installs default_alloc_fn as Triton's allocator around one test
+    import triton
+    from triton.runtime._allocation import NullAllocator
+    from triton._internal_testing import default_alloc_fn
+
+    triton.set_allocator(default_alloc_fn)
+    try:
+        yield  # the requesting test runs while the allocator is installed
+    finally:
+        triton.set_allocator(NullAllocator())  # always resets to NullAllocator (a prior allocator is not restored)
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -3528,32 +3528,29 @@ def kernel(In, Out, in_shape1: tl.constexpr, in_shape2: tl.constexpr, ou_shape1:
 
 @pytest.mark.interpreter
 @pytest.mark.parametrize("dtype_str", ["int32", "int8"])
-@pytest.mark.parametrize("shape", [(2, 2, 8, 64), (4, 4, 4, 4)])
+@pytest.mark.parametrize("shape", [(2, 2, 8, 64), (4, 4, 4, 16)])
 @pytest.mark.parametrize("perm", list(itertools.permutations([0, 1, 2, 3])))
-def test_trans_4d(dtype_str, shape, perm, device):
+def test_trans_4d(dtype_str, shape, perm, device, with_allocator):
 
     @triton.jit
     def kernel(In, Out,  #
                in_shape1: tl.constexpr, in_shape2: tl.constexpr, in_shape3: tl.constexpr, in_shape4: tl.constexpr,
                ou_shape1: tl.constexpr, ou_shape2: tl.constexpr, ou_shape3: tl.constexpr, ou_shape4: tl.constexpr,
                trans1: tl.constexpr, trans2: tl.constexpr, trans3: tl.constexpr, trans4: tl.constexpr):
-        in_ptr = tl.make_block_ptr(
+        in_desc = tl.make_tensor_descriptor(
             base=In,
-            shape=(in_shape1, in_shape2, in_shape3, in_shape4),
-            strides=(in_shape4 * in_shape3 * in_shape2, in_shape4 * in_shape3, in_shape4, 1),
-            offsets=(0, 0, 0, 0),
-            block_shape=(in_shape1, in_shape2, in_shape3, in_shape4),
-            order=(3, 2, 1, 0),
+            shape=[in_shape1, in_shape2, in_shape3, in_shape4],
+            strides=[in_shape4 * in_shape3 * in_shape2, in_shape4 * in_shape3, in_shape4, 1],
+            block_shape=[in_shape1, in_shape2, in_shape3, in_shape4],
         )
-        out_ptr = tl.make_block_ptr(
+        out_desc = tl.make_tensor_descriptor(
             base=Out,
-            shape=(ou_shape1, ou_shape2, ou_shape3, ou_shape4),
-            strides=(ou_shape4 * ou_shape3 * ou_shape2, ou_shape4 * ou_shape3, ou_shape4, 1),
-            offsets=(0, 0, 0, 0),
-            block_shape=(ou_shape1, ou_shape2, ou_shape3, ou_shape4),
-            order=(3, 2, 1, 0),
+            shape=[ou_shape1 * ou_shape2 * ou_shape3 * ou_shape4],
+            strides=[1],
+            block_shape=[ou_shape1 * ou_shape2 * ou_shape3 * ou_shape4],
         )
-        tl.store(out_ptr, tl.load(in_ptr).permute((trans1, trans2, trans3, trans4)))
+        val = in_desc.load([0, 0, 0, 0]).permute((trans1, trans2, trans3, trans4))
+        out_desc.store([0], val.reshape(out_desc.block_shape))
 
     input = torch.arange(math.prod(shape), dtype=getattr(torch, dtype_str), device=device).reshape(shape)
     expected = torch.permute(input, perm)
@@ -5145,7 +5142,7 @@ def kernel(ptr):
     assert "Descriptor block shape must have at least 16 bytes" in str(e.value.__cause__)
 
 
-def test_trans_reshape(device):
+def test_trans_reshape(device, with_allocator):
 
     @triton.jit
     def kernel(in_base_ptr, out_base_ptr, IN_SHAPE0: tl.constexpr, IN_SHAPE1: tl.constexpr):
diff --git a/python/test/unit/test_perf_warning.py b/python/test/unit/test_perf_warning.py
@@ -32,40 +32,31 @@ def matmul_kernel(
         N,
         K,
         stride_am,
-        stride_ak,
-        stride_bk,
         stride_bn,
         stride_cm,
-        stride_cn,
     ):
-        a_block_ptr = tl.make_block_ptr(
+        a_desc = tl.make_tensor_descriptor(
             base=a_ptr,
-            shape=(M, K),
-            strides=(stride_am, stride_ak),
-            offsets=(0, 0),
-            block_shape=(32, 128),
-            order=(1, 0),
+            shape=[M, K],
+            strides=[stride_am, 1],
+            block_shape=[32, 128],
         )
-        b_block_ptr = tl.make_block_ptr(
+        b_desc = tl.make_tensor_descriptor(
             base=b_ptr,
-            shape=(K, N),
-            strides=(stride_bk, stride_bn),
-            offsets=(0, 0),
-            block_shape=(128, 32),
-            order=(0, 1),
+            shape=[K, N],
+            strides=[stride_bn, 1],
+            block_shape=[32, 128],
         )
-        c_block_ptr = tl.make_block_ptr(
+        c_desc = tl.make_tensor_descriptor(
             base=c_ptr,
-            shape=(M, N),
-            strides=(stride_cm, stride_cn),
-            offsets=(0, 0),
-            block_shape=(32, 32),
-            order=(1, 0),
+            shape=[M, N],
+            strides=[stride_cm, 1],
+            block_shape=[32, 32],
         )
-        a = tl.load(a_block_ptr)
-        b = tl.load(b_block_ptr)
+        a = a_desc.load([0, 0])
+        b = b_desc.load([0, 0]).T
         c = tl.dot(a, b)
-        tl.store(c_block_ptr, c)
+        c_desc.store([0, 0], c)
 
     signature = {
         "a_ptr": "*fp32",
@@ -75,11 +66,8 @@ def matmul_kernel(
         "N": "i32",
         "K": "i32",
         "stride_am": "i32",
-        "stride_ak": "i32",
-        "stride_bk": "i32",
         "stride_bn": "i32",
         "stride_cm": "i32",
-        "stride_cn": "i32",
     }
     with enable_diagnostics_context('remarks'):
         triton.compile(triton.compiler.ASTSource(
diff --git a/python/triton/_internal_testing.py b/python/triton/_internal_testing.py
@@ -179,6 +179,10 @@ def tma_skip_msg(byval_only=False):
 requires_tma = pytest.mark.skipif(not supports_tma(), reason=tma_skip_msg())
 
 
+def default_alloc_fn(size: int, align: int, _):  # allocator callback; `align` and the third argument are unused
+    return torch.empty(size, dtype=torch.int8, device="cuda")  # uninitialized int8 tensor with `size` elements
+
+
 def unwrap_tensor(t: Union[torch.Tensor, triton.runtime.jit.TensorWrapper]) -> torch.Tensor:
     if isinstance(t, triton.runtime.jit.TensorWrapper):
         return t.base