Commit 13f8d31

Run CI on mi325x (#441)
1 parent 0b7cc5f commit 13f8d31

10 files changed: +58 lines added, 0 removed

.github/matrix.json

Lines changed: 9 additions & 0 deletions
@@ -53,6 +53,15 @@
       "runtime-version": "cu129",
       "container-options": "--gpus all",
       "alias": "b200"
+    },
+    {
+      "runner": "linux.rocm.gpu.gfx942.2",
+      "python-version": "3.12",
+      "ref-mode": "none",
+      "image": "rocm/dev-ubuntu-24.04:6.2.4",
+      "runtime-version": "rocm6.4",
+      "container-options": "--device=/dev/kfd --device=/dev/dri",
+      "alias": "mi325x"
     }
   ]
 }
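
The new runner entry passes the ROCm device nodes (/dev/kfd and /dev/dri) into the container, the ROCm counterpart of --gpus all on the CUDA runners. Below is a minimal sketch of how the entry could be inspected locally; only the field names come from the diff, and the assumption that the file holds a single top-level list of runner configs is mine.

import json

# Illustrative only: load the CI matrix and print the new ROCm runner entry.
with open(".github/matrix.json") as f:
    matrix = json.load(f)

# Assumed structure: one top-level key mapping to the list of runner configs.
(entries,) = matrix.values()
for entry in entries:
    if entry.get("alias") == "mi325x":
        # The ROCm container gets the KFD/DRI device nodes rather than --gpus all.
        print(entry["runner"], entry["container-options"])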

helion/_testing.py

Lines changed: 5 additions & 0 deletions
@@ -45,6 +45,11 @@ def skipIfNormalMode(reason: str) -> Callable[[Callable], Callable]:
     return unittest.skipIf(os.environ.get("HELION_INTERPRET") != "1", reason)


+def skipIfRocm(reason: str) -> Callable[[Callable], Callable]:
+    """Skip test if running with rocm"""
+    return unittest.skipIf(torch.version.hip is not None, reason)  # pyright: ignore[reportAttributeAccessIssue]
+
+
 @contextlib.contextmanager
 def track_run_ref_calls() -> Generator[list[int], None, None]:
     """Context manager that tracks BoundKernel.run_ref calls.

test/test_autotuner.py

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfRocm
 from helion.autotuner import DifferentialEvolutionSearch
 from helion.autotuner.config_generation import ConfigGeneration
 from helion.autotuner.random_search import RandomSearch
@@ -36,6 +37,7 @@ def setUp(self):
     @patch.object(_compat, "_supports_tensor_descriptor", lambda: True)
     @patch.object(_compat, "_min_dot_size", lambda *args: (16, 16, 16))
     @patch.object(loops, "_supports_warp_specialize", lambda: True)
+    @skipIfRocm("failure on rocm")
     def test_config_fragment0(self):
         args = (
             torch.randn([512, 512], device=DEVICE),

test/test_dot.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import skipIfRefEager
+from helion._testing import skipIfRocm
 import helion.language as hl


@@ -82,6 +83,7 @@ def make_test_function(input_dtype, acc_dtype, static_shapes_option):
     """Create a test function for a specific combination of parameters."""
     combo = (input_dtype, input_dtype, acc_dtype)

+    @skipIfRocm("Core dumps with rocm -- https://github.com/pytorch/helion/issues/445")
     def test_impl(self):
         # Skip FP8 tests if GPU doesn't support it
         if (
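
Here the decorator wraps a plain function returned by make_test_function rather than a method defined directly on the test class; unittest's skip decorators only set attributes on the function object, so the skip still applies once the function is attached to the class. A minimal sketch of that pattern follows (the helper and test names are hypothetical, not the actual code in test_dot.py).

import unittest


def make_parametrized_test(value):
    # Decorate the generated test before it is bound to a TestCase class.
    @unittest.skipIf(value < 0, "hypothetical skip condition")
    def test_impl(self):
        self.assertGreaterEqual(value, 0)

    return test_impl


class TestGenerated(unittest.TestCase):
    pass


for i, v in enumerate([3, -1]):
    setattr(TestGenerated, f"test_generated_{i}", make_parametrized_test(v))

if __name__ == "__main__":
    unittest.main()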

test/test_examples.py

Lines changed: 6 additions & 0 deletions
@@ -12,6 +12,7 @@
 from helion._testing import check_example
 from helion._testing import import_path
 from helion._testing import skipIfRefEager
+from helion._testing import skipIfRocm

 torch.backends.cuda.matmul.fp32_precision = "tf32"
 torch.backends.cudnn.conv.fp32_precision = "tf32"
@@ -44,6 +45,7 @@ def test_matmul(self):
             )
         )

+    @skipIfRocm("failure on rocm")
     def test_matmul_layernorm_static_shapes(self):
         args = (
             torch.randn([128, 256], device=DEVICE, dtype=torch.float32),
@@ -66,6 +68,7 @@ def test_matmul_layernorm_static_shapes(self):
             )
         )

+    @skipIfRocm("failure on rocm")
     def test_matmul_layernorm_dynamic_shapes(self):
         args = (
             torch.randn([128, 256], device=DEVICE, dtype=torch.float32),
@@ -110,6 +113,7 @@ def test_bmm(self):
         not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 9,
         "FP8 requires GPU with compute capability >= 9.0 (e.g., H100)",
     )
+    @skipIfRocm("failure on rocm")
     def test_fp8_gemm(self):
         # Create FP32 tensors and convert to FP8
         x = torch.randn([256, 256], device=DEVICE, dtype=torch.float32)
@@ -334,6 +338,7 @@ def test_embedding_block_ptr(self):
             )
         )

+    @skipIfRocm("failure on rocm")
     def test_attention_pointer(self):
         args = (
             torch.randn(1, 32, 512, 64, dtype=torch.float32, device=DEVICE),
@@ -568,6 +573,7 @@ def test_attention_persistent_interleaved_l2_grouping(self):
         not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 9,
         "FP8 requires GPU with compute capability >= 9.0 (e.g., H100)",
     )
+    @skipIfRocm("failure on rocm")
     def test_fp8_attention(self):
         batch = 2
         heads = 4

test/test_indexing.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 from helion._testing import code_and_output
 from helion._testing import skipIfNormalMode
 from helion._testing import skipIfRefEager
+from helion._testing import skipIfRocm
 import helion.language as hl


@@ -626,6 +627,7 @@ def kernel(buf: torch.Tensor, zeros: torch.Tensor) -> torch.Tensor:
         expected = torch.zeros([N], device=DEVICE)
         torch.testing.assert_close(result, expected)

+    @skipIfRocm("failure on rocm")
     def test_1d_indexed_value_from_slice(self):
         """buf2[i] = buf[:] - Assign slice to indexed value"""

test/test_inline_asm_elementwise.py

Lines changed: 7 additions & 0 deletions
@@ -10,13 +10,15 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfRocm
 import helion.language as hl


 class TestInlineAsmElementwise(RefEagerTestDisabled, TestCase):
     @pytest.mark.skipif(
         DEVICE.type != "cuda", reason="inline_asm_elementwise is only supported on CUDA"
     )
+    @skipIfRocm("only works on cuda")
     def test_inline_asm_simple(self):
         """Test basic inline_asm_elementwise with simple assembly"""

@@ -45,6 +47,7 @@ def kernel_simple_asm(x: torch.Tensor) -> torch.Tensor:
     @pytest.mark.skipif(
         DEVICE.type != "cuda", reason="inline_asm_elementwise is only supported on CUDA"
     )
+    @skipIfRocm("only works on cuda")
     def test_inline_asm_shift_operation(self):
         """Test inline_asm_elementwise with shift operation (similar to Triton test)"""

@@ -82,6 +85,7 @@ def kernel_shift_asm(x: torch.Tensor, y: torch.Tensor, n: int) -> torch.Tensor:
     @pytest.mark.skipif(
         DEVICE.type != "cuda", reason="inline_asm_elementwise is only supported on CUDA"
     )
+    @skipIfRocm("only works on cuda")
     def test_inline_asm_multiple_outputs(self):
         """Test inline_asm_elementwise with multiple outputs"""

@@ -130,6 +134,7 @@ def kernel_multiple_outputs(
     @pytest.mark.skipif(
         DEVICE.type != "cuda", reason="inline_asm_elementwise is only supported on CUDA"
     )
+    @skipIfRocm("only works on cuda")
     def test_inline_asm_packed(self):
         """Test inline_asm_elementwise with pack > 1"""

@@ -186,6 +191,7 @@ def kernel_invalid_asm(x: torch.Tensor) -> torch.Tensor:
     @pytest.mark.skipif(
         DEVICE.type != "cuda", reason="inline_asm_elementwise is only supported on CUDA"
     )
+    @skipIfRocm("only works on cuda")
     def test_inline_asm_empty_args(self):
         """Test inline_asm_elementwise with empty args (should work like Triton)"""

@@ -214,6 +220,7 @@ def kernel_empty_args(x: torch.Tensor) -> torch.Tensor:
         expected = torch.full([16], 42, dtype=torch.int32, device=DEVICE)
         torch.testing.assert_close(result, expected)

+    @skipIfRocm("only works on cuda")
     def test_inline_asm_basic_compilation(self):
         """Test that inline_asm_elementwise compiles without errors (no CUDA requirement)"""

test/test_print.py

Lines changed: 9 additions & 0 deletions
@@ -13,6 +13,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfRocm
 import helion.language as hl


@@ -106,6 +107,7 @@ def run_test_with_and_without_triton_interpret_envvar(self, test_func):
         else:
             os.environ["TRITON_INTERPRET"] = original_env

+    @skipIfRocm("failure on rocm")
     def test_basic_print(self):
         """Test basic print with prefix and tensor values"""

@@ -142,6 +144,7 @@ def print_kernel(x: torch.Tensor) -> torch.Tensor:

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_multiple_tensors(self):
         """Test print with multiple tensor arguments"""

@@ -248,6 +251,7 @@ def print_shape_kernel(x: torch.Tensor) -> torch.Tensor:

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_prefix_only(self):
         def run_test(interpret_mode):
             @helion.kernel
@@ -280,6 +284,7 @@ def print_message_kernel(x: torch.Tensor) -> torch.Tensor:

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_in_nested_loops(self):
         def run_test(interpret_mode):
             @helion.kernel
@@ -372,6 +377,7 @@ def print_outside_kernel(x: torch.Tensor) -> torch.Tensor:

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_with_conditional(self):
         """Test print with conditional statements"""

@@ -431,6 +437,7 @@ def print_conditional_kernel(x: torch.Tensor) -> torch.Tensor:

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_computed_values(self):
         """Test print with computed/derived values"""

@@ -523,6 +530,7 @@ def print_reduction_kernel(x: torch.Tensor) -> torch.Tensor:

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_multiple_data_types(self):
         """Test print with different tensor data types"""

@@ -580,6 +588,7 @@ def print_dtypes_kernel(

         self.run_test_with_and_without_triton_interpret_envvar(run_test)

+    @skipIfRocm("failure on rocm")
     def test_print_with_starred_args(self):
         """Test print with starred/unpacked arguments"""
test/test_register_tunable.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from helion._testing import RefEagerTestBase
1212
from helion._testing import TestCase
1313
from helion._testing import code_and_output
14+
from helion._testing import skipIfRocm
1415
from helion.autotuner import EnumFragment
1516
from helion.autotuner import IntegerFragment
1617
from helion.autotuner import PowerOfTwoFragment
@@ -106,6 +107,7 @@ def fn(x: torch.Tensor):
106107
self.assertExpectedJournal(code)
107108
torch.testing.assert_close(result, x.sum())
108109

110+
@skipIfRocm("failure on rocm")
109111
def test_matmul_split_k(self):
110112
"""Test matmul_split_k kernel with register_tunable"""
111113

test/test_signal_wait.py

Lines changed: 14 additions & 0 deletions
@@ -9,10 +9,12 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfRocm
 import helion.language as hl


 class TestWait(RefEagerTestDisabled, TestCase):
+    @skipIfRocm("only works on cuda")
     def test_wait_basic(self):
         @helion.kernel
         def gmem_wait_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -32,6 +34,7 @@ def gmem_wait_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         self.maxDiff = None
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_wait_2d_tile(self):
         @helion.kernel
         def wait_for_2d_tile_kernel(
@@ -55,6 +58,7 @@ def wait_for_2d_tile_kernel(
         torch.testing.assert_close(result, x)
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_wait_multi_bar(self):
         @helion.kernel
         def gmem_wait_multi_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -78,6 +82,7 @@ def gmem_wait_multi_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         self.maxDiff = None
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_wait_multi_bar_cas(self):
         @helion.kernel
         def gmem_wait_multi_bar_kernel_cas(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -99,6 +104,7 @@ def gmem_wait_multi_bar_kernel_cas(signal_pad: torch.Tensor) -> torch.Tensor:
         self.maxDiff = None
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_signal_basic(self):
         @helion.kernel
         def gmem_signal_scalar_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -114,6 +120,7 @@ def gmem_signal_scalar_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_signal_cas(self):
         @helion.kernel
         def gmem_signal_cas_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -129,6 +136,7 @@ def gmem_signal_cas_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_signal_multiple(self):
         @helion.kernel
         def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -148,6 +156,7 @@ def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_signal_multiple_cas(self):
         @helion.kernel
         def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -167,6 +176,7 @@ def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_send_recieve_cta(self):
         @helion.kernel
         def gmem_signal_n_wait_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -186,6 +196,7 @@ def gmem_signal_n_wait_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         self.assertIn("helion.runtime.triton_send_signal", code)
         self.assertIn("helion.runtime.triton_wait_signal", code)

+    @skipIfRocm("only works on cuda")
     def test_global_sync(self):
         @helion.kernel
         def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -207,6 +218,7 @@ def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_global_sync_cas(self):
         @helion.kernel
         def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -232,6 +244,7 @@ def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertIn("atomic_cas", code)

+    @skipIfRocm("only works on cuda")
     def test_wait_stack_signalpad(self):
         @helion.kernel
         def gmem_wait_pointers_kernel(
@@ -259,6 +272,7 @@ def gmem_wait_pointers_kernel(
         )
         self.assertExpectedJournal(code)

+    @skipIfRocm("only works on cuda")
     def test_signal_stack_signalpad(self):
         @helion.kernel
         def gmem_signal_pointers_kernel(
