Update pytest skipif conditions for TLX unit tests (#694)

dshi7 · meta-codesync[bot] · commit 9e9a7b7eefd0 · 2025-11-24T15:59:06.000-08:00
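Summary: Update the pytest skipif guards in the TLX unit tests (add missing hardware guards, unify the test_local_index guard on is_hopper_or_newer(), and remove stale commented-out test definitions); see [D87796244](https://www.internalfb.com/diff/D87796244) Pull Request resolved: #694 Reviewed By: htyu Differential Revision: D87812601 Pulled By: dshi7 fbshipit-source-id: 33027116d6e3e4f8b73369edeaad3c6eacd087d7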
diff --git a/python/test/unit/language/test_tlx.py b/python/test/unit/language/test_tlx.py
@@ -5,7 +5,7 @@
 import re
 import triton
 import triton.language as tl
-from triton._internal_testing import is_hopper_or_newer, is_blackwell, is_hopper, is_hip, is_cuda
+from triton._internal_testing import is_hopper_or_newer, is_blackwell, is_hopper, is_hip
 import triton.language.extra.tlx as tlx
 from typing import Optional
 import traceback
@@ -585,6 +585,7 @@ def test_cta_0_kernel(
     torch.testing.assert_close(output, expected_output)
 
 
+@pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
 def test_clock64(device):
 
     @triton.jit
@@ -1245,6 +1246,7 @@ def tcgen5_dot_kernel2cta_tma_ws(a_ptr, stride_am, stride_ak, b_ptr, stride_bk,
 
 @pytest.mark.parametrize("A_DATA_TYPE", ["e5m2", "e4m3"])
 @pytest.mark.parametrize("B_DATA_TYPE", ["e5m2", "e4m3"])
+@pytest.mark.skipif(not is_blackwell(), reason="Need Blackwell")
 def test_async_dot_scaled(A_DATA_TYPE, B_DATA_TYPE, device):
     """
     Test D = (A * A_scale)  * (B * B_scale)
@@ -1633,9 +1635,8 @@ def run_tlx_square(func, BLOCK_SIZE, device, expected_arrival_count=1):
 
 
 # Unit test for arrive/wait
-@pytest.mark.skipif(not (is_hip() or is_hopper_or_newer()), reason="Need Hopper or newer")
+@pytest.mark.skipif(not (is_hip() or is_hopper_or_newer()), reason="Need Hopper or newer or AMD")
 @pytest.mark.parametrize("BLOCK_SIZE", [(1024)])
-# def test_mbarriers(BLOCK_SIZE, device):
 def test_wait_arrive_non_ws(BLOCK_SIZE, device):
     expected_arrival_count = 4 if is_hip() else 1
     kernel = run_tlx_square(tlx_square_non_ws, BLOCK_SIZE, device, expected_arrival_count=expected_arrival_count)
@@ -1652,7 +1653,6 @@ def test_wait_arrive_non_ws(BLOCK_SIZE, device):
 
 @pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
 @pytest.mark.parametrize("BLOCK_SIZE", [(1024)])
-# def test_mbarriers(BLOCK_SIZE, device):
 def test_wait_arrive_ws(BLOCK_SIZE, device):
     kernel = run_tlx_square(tlx_square_ws, BLOCK_SIZE, device)
 
@@ -1699,7 +1699,6 @@ def bar_live_kernel():
 
 @pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
 @pytest.mark.parametrize("BLOCK_SIZE", [(1024)])
-# def test_mbarriers(BLOCK_SIZE, device):
 def test_named_wait_arrive(BLOCK_SIZE, device):
 
     @triton.jit
@@ -2155,10 +2154,7 @@ def ws_error_kernel():
     assert "ZeroDivisionError('division by zero')" in exc_msg, '\n\nExpected ZeroDivisionError but got: \n\n' + exc_msg + '\n\n'
 
 
-@pytest.mark.skipif(
-    not is_cuda() or torch.cuda.get_device_capability()[0] < 9,
-    reason="Requires compute capability >= 9 for NV",
-)
+@pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
 @pytest.mark.parametrize("BLOCK_SIZE", [(64)])
 def test_local_index(BLOCK_SIZE, device):
 
@@ -2199,6 +2195,7 @@ def local_index(
     torch.testing.assert_close(y, output)
 
 
+@pytest.mark.skipif(not is_hopper_or_newer(), reason="Need Hopper or newer")
 def test_async_token_error(device):
 
     @triton.jit