Skip to content

Commit 9d9ecdb

Browse files
weishi-deng authored and pytorchmergebot committed
[xpu][feature] Enable triton online softmax kernels on XPU. (pytorch#163251)
This PR enables the Triton online softmax kernels for XPU devices by adding a device check in prepare_softmax_extra_check. Pull Request resolved: pytorch#163251. Approved by: https://github.com/etaf, https://github.com/EikanWang, https://github.com/mlazos
1 parent d24276f commit 9d9ecdb

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

test/inductor/test_online_softmax.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
IS_LINUX,
1515
parametrize,
1616
)
17-
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CUDA_AND_TRITON
17+
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, HAS_TRITON
1818

1919

2020
DO_PERF_TEST = os.environ.get("DO_PERF_TEST") == "1"
@@ -138,8 +138,14 @@ def test_prepare_softmax(self, dim, nrow):
138138
self.assertTrue(same(ref, act, tol=1e-2))
139139

140140
if nrow == 2048 and dim == 0:
141+
num_kernels = 2
142+
# Note: split reduction is not triggered for this shape on some xpu devices.
143+
# check "num_splits" for more details
144+
if GPU_TYPE == "xpu":
145+
num_kernels = 1
146+
141147
# split reduction is triggered. We have multiple kernels
142-
self.assertTrue(code.count("def triton") >= 2)
148+
self.assertTrue(code.count("def triton") >= num_kernels)
143149
else:
144150
if nrow == 2 and dim == 0:
145151
# persistent reduction triggered
@@ -310,5 +316,5 @@ def f(x, y):
310316
instantiate_parametrized_tests(TestOnlineSoftmax)
311317

312318
if __name__ == "__main__":
313-
if IS_LINUX and HAS_CUDA_AND_TRITON:
319+
if IS_LINUX and HAS_GPU and HAS_TRITON:
314320
run_tests()

0 commit comments

Comments (0)