
Commit fa15dc1

[TEST][intel] Port triton_kernels
Signed-off-by: Whitney Tsang <[email protected]>
1 parent: 9084270

File tree: 3 files changed (+5, -8 lines)


python/triton_kernels/tests/test_distributed.py

Lines changed: 2 additions & 3 deletions

@@ -53,7 +53,7 @@ def _distributed_worker(rank, fn, world_size, kwargs):
 def distributed_launcher(request):
     n_gpus = getattr(request, "param", None)
     if not torch.cuda.is_available():
-        pytest.skip("CUDA required for distributed GPU test")
+        pytest.xfail("CUDA required for distributed GPU test")
     if torch.cuda.device_count() < n_gpus:
         pytest.skip(f"requires up to {n_gpus} CUDA devices, found {torch.cuda.device_count()}")

@@ -82,8 +82,7 @@ def launch(fn, **kwargs):

 @pytest.mark.parametrize("n_expts_shard, n_expts_tot", [(8, 512), (16, 64)])
 @pytest.mark.parametrize("affinity_mode", ["uniform", "random"])
-def test_make_expt_assignment(n_expts_shard, n_expts_tot, affinity_mode):
-    device = "cuda"
+def test_make_expt_assignment(n_expts_shard, n_expts_tot, affinity_mode, device):
     expt_dict = _make_expt_dict_for_mode(n_expts_shard, n_expts_tot, affinity_mode)
     expt_assignment = make_expt_assignment(n_expts_shard, n_expts_tot, expt_dict, device)
     # mask correctness & uniqueness: each expert set exactly once, and on the right shard
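
Note: the test now takes device as a parameter instead of hard-coding device = "cuda", which presumably comes from a pytest fixture so the suite can target other backends (the [intel] tag suggests XPU). A minimal sketch of how such a fixture could be defined in conftest.py; the option name and default here are hypothetical and may not match the repository's actual wiring:

import pytest

def pytest_addoption(parser):
    # Hypothetical CLI option; would let a run specify, e.g., --device xpu.
    parser.addoption("--device", action="store", default="cuda",
                     help="device on which to run the tests")

@pytest.fixture
def device(request):
    # Injected into any test that declares a `device` parameter.
    return request.config.getoption("--device")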

python/triton_kernels/tests/test_reduce.py

Lines changed: 2 additions & 3 deletions

@@ -56,13 +56,12 @@ def plus_a(x, a):
     "broadcast_n",  # broadcast over N: [B,M,1]
 ])
 @pytest.mark.parametrize("dim", [0, 1, 2])
-def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
+def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn, device):
     is_hip = triton.runtime.driver.active.get_current_target().backend == "hip"
-    is_pre_h100 = torch.cuda.get_device_capability() < (9, 0)
+    is_pre_h100 = torch.cuda.is_available() and torch.cuda.get_device_capability() < (9, 0)
     if (is_hip or is_pre_h100) and "float8" in dtype_str:
         pytest.skip("float8 not supported on CUDA < 9.0")
     torch.manual_seed(0)
-    device = "cuda"
     x = torch.randn((B, M, N), device=device, dtype=torch.float32)
     x_mscale, x_flex = None, None
     y_flex_tri, y_flex_ref = None, None
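
The is_available() guard matters because torch.cuda.get_device_capability() initializes the CUDA runtime and raises on hosts without a CUDA device; short-circuiting lets the capability check evaluate safely on non-CUDA backends. A small illustrative helper (not from the commit) capturing the pattern:

import torch

def cuda_capability_below(major, minor):
    # Never touch the CUDA runtime unless a CUDA device actually exists.
    if not torch.cuda.is_available():
        return False
    return torch.cuda.get_device_capability() < (major, minor)

# Equivalent to the guarded is_pre_h100 check added above.
is_pre_h100 = cuda_capability_below(9, 0)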

python/triton_kernels/tests/test_tensor.py

Lines changed: 1 addition & 2 deletions

@@ -28,8 +28,7 @@ def test_make_ragged_tensor_metadata(n_slices, device):


 @pytest.mark.parametrize("n_slices", [9, 32, 911, 1025])
-def test_remap_ragged_tensor_metadata(n_slices):
-    device = "cuda"
+def test_remap_ragged_tensor_metadata(n_slices, device):
     max_slice_size = 200
     n_total_rows = max_slice_size * n_slices
     slice_sizes = torch.randint(0, max_slice_size, (n_slices, ), dtype=torch.int32, device=device)
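
With all three files parameterized on device, the same tests can be pointed at a different backend without source edits. Assuming the hypothetical --device option sketched earlier, a run against Intel XPU might look like:

pytest python/triton_kernels/tests/test_tensor.py --device xpu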
