
Commit 0257c4c

red1bluelost and Micah Weston authored
[Tests] Using device fixture instead of cuda in tensor descriptor tests. (#8512)
A few tests in the tensor descriptor suite use "cuda" as the device rather than taking the 'device' fixture in their test arguments. This PR changes those tests to use the 'device' fixture instead, so that third-party users without a CUDA runtime can run them.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because it is editing the test file only.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)

Co-authored-by: Micah Weston <[email protected]>
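For readers unfamiliar with the fixture being referenced: below is a minimal sketch of how a `device` fixture is commonly wired up in a pytest `conftest.py`. This is an assumption about the general pattern, not a copy of Triton's actual conftest; the real option name, default, and plumbing may differ.

```python
# conftest.py -- minimal sketch of a pytest "device" fixture (assumed
# pattern; Triton's own conftest.py may wire this up differently).
import pytest


def pytest_addoption(parser):
    # Let the runner pick the backend, e.g. --device=cuda or --device=xpu.
    parser.addoption("--device", action="store", default="cuda",
                     help="device backend to run the tests on")


@pytest.fixture
def device(request):
    # Any test that declares a `device` parameter receives this string,
    # instead of hard-coding "cuda" in its tensor allocations.
    return request.config.getoption("--device")
```

With a fixture like this in place, a test written as `def test_tma_scatter(..., device):` allocates its tensors with `device=device` and can run on a non-CUDA backend simply by passing a different `--device` value on the pytest command line.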
Commit 0257c4c (parent: ecd33fe)

File tree: 1 file changed, +15 −15 lines


python/test/unit/language/test_tensor_descriptor.py

Lines changed: 15 additions & 15 deletions
```diff
@@ -384,7 +384,7 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 
 
 @pytest.mark.interpreter
-def test_tensor_descriptor_padding():
+def test_tensor_descriptor_padding(device):
 
     @triton.jit
     def device_tma_load(in_ptr, out_ptr, IM, IN, YM, YN, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr,
@@ -415,7 +415,7 @@ def host_tma_load(in_desc, out_ptr, YM, YN, M_BLOCK: tl.constexpr, N_BLOCK: tl.c
 
     # TMA descriptors require a global memory allocation
     def alloc_fn(size: int, alignment: float, stream: float):
-        return torch.ones(size, device="cuda", dtype=torch.float32)
+        return torch.ones(size, device=device, dtype=torch.float32)
 
     triton.set_allocator(alloc_fn)
 
@@ -424,16 +424,16 @@ def alloc_fn(size: int, alignment: float, stream: float):
     M_BLOCK = 32
     N_BLOCK = 32
     padding = "nan"
-    input = torch.arange(IM * IN, device="cuda", dtype=torch.float32)
+    input = torch.arange(IM * IN, device=device, dtype=torch.float32)
     input = input.reshape(IM, IN)
-    out_device_tma = torch.zeros((OM, ON), device="cuda", dtype=torch.float32)
-    out_host_tma = torch.zeros((OM, ON), device="cuda", dtype=torch.float32)
+    out_device_tma = torch.zeros((OM, ON), device=device, dtype=torch.float32)
+    out_host_tma = torch.zeros((OM, ON), device=device, dtype=torch.float32)
     dummy_block = [M_BLOCK, N_BLOCK]
     in_desc = TensorDescriptor(input, input.shape, input.stride(), dummy_block, padding=padding)
     grid = (triton.cdiv(OM, M_BLOCK), triton.cdiv(ON, N_BLOCK))
     device_tma_load[grid](input, out_device_tma, IM, IN, OM, ON, M_BLOCK, N_BLOCK, padding)
     host_tma_load[grid](in_desc, out_host_tma, OM, ON, M_BLOCK, N_BLOCK)
-    expected = torch.zeros((OM, ON), device="cuda", dtype=torch.float32)
+    expected = torch.zeros((OM, ON), device=device, dtype=torch.float32)
     expected[0:IN, 0:IM] = input
     expected[:, IN:ON] = float('nan')
     expected[IM:OM, :] = float('nan')
@@ -1474,18 +1474,18 @@ def tma_scatter_rows_kernel(out_ptr, in_ptr, idx_ptr, y, X: tl.constexpr, Y: tl.
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8])
 @pytest.mark.parametrize("y", [0, 32, 48])
 @pytest.mark.skipif(is_hopper(), reason="TMA Scatter is not supported on hopper")
-def test_tma_scatter(X, Y, BLOCK_X, BLOCK_Y, dtype, y):
+def test_tma_scatter(X, Y, BLOCK_X, BLOCK_Y, dtype, y, device):
     if BLOCK_X > X or y + BLOCK_Y > Y:
         pytest.skip()
 
     torch.manual_seed(42)
-    input = torch.arange(BLOCK_X * BLOCK_Y, dtype=dtype, device='cuda').reshape(BLOCK_X, BLOCK_Y)
-    output = torch.zeros((X, Y), dtype=dtype, device='cuda')
+    input = torch.arange(BLOCK_X * BLOCK_Y, dtype=dtype, device=device).reshape(BLOCK_X, BLOCK_Y)
+    output = torch.zeros((X, Y), dtype=dtype, device=device)
 
-    idx = torch.randperm(BLOCK_X, dtype=torch.int32, device='cuda')
+    idx = torch.randperm(BLOCK_X, dtype=torch.int32, device=device)
 
     def alloc_fn(size: int, align: int, steam):
-        return torch.empty(size, dtype=torch.int8, device='cuda')
+        return torch.empty(size, dtype=torch.int8, device=device)
 
     triton.set_allocator(alloc_fn)
 
@@ -1546,7 +1546,7 @@ def max_op(a, b):
 @pytest.mark.parametrize("num_ctas", [1, 2])
 @pytest.mark.parametrize("descriptor", ["host", "device"])
 @pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32), (8, 128), (512, 32), (1, 1024)])
-def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK, N_BLOCK):
+def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK, N_BLOCK, device):
     is_native = is_cuda() and torch.cuda.get_device_capability()[0] >= 9
     if not is_native:
         if num_ctas != 1:
@@ -1596,8 +1596,8 @@ def kernel(out_desc, out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.co
 
     M, N = M_BLOCK * 2, N_BLOCK * 2
     rs = np.random.RandomState(seed=17)
-    inp = to_triton(numpy_random((M, N), dtype_str, rs), device="cuda", dst_type=dtype_str)
-    out = to_triton(numpy_random((M, N), dtype_str, rs), device="cuda", dst_type=dtype_str)
+    inp = to_triton(numpy_random((M, N), dtype_str, rs), device=device, dst_type=dtype_str)
+    out = to_triton(numpy_random((M, N), dtype_str, rs), device=device, dst_type=dtype_str)
 
     grid_m = M // M_BLOCK
     grid_n = N // N_BLOCK
@@ -1610,7 +1610,7 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
         assert size == 128 * (grid_m * grid_n) * num_ctas
         assert align == 128
         assert stream == 0
-        return torch.empty(size, dtype=torch.int8, device="cuda")
+        return torch.empty(size, dtype=torch.int8, device=device)
 
     triton.set_allocator(alloc_fn)
     out_desc = None
```
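To see the pattern end to end, here is a minimal standalone test in the same style as the diff above. The test itself is hypothetical, not part of the Triton suite; it assumes the `device` fixture sketched earlier.

```python
# A hypothetical test illustrating the fixture pattern the commit applies.
import pytest
import torch


@pytest.mark.parametrize("n", [16, 64])
def test_zeros_roundtrip(n, device):
    # The `device` fixture supplies the backend string, so the test body
    # never hard-codes "cuda" and works on any backend PyTorch supports.
    x = torch.zeros(n, device=device)
    assert x.sum().item() == 0.0
```

This inversion is exactly what the commit applies to the tensor descriptor tests: every `device="cuda"` literal becomes `device=device`, leaving the backend choice to the test runner.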
