[UT] Fix test_print UTs (#2867)

quintinwang5 · web-flow · commit dbe7a5b8cde2 · 2024-12-02T09:30:36.000-05:00
`print` now works on DLE 2025.0.0, but the `test_print` UTs still fail. There are two reasons: 1. We use a subprocess to call `print` while using the `sycl::queue` from `torch`, so the `queue` cannot be synced before the subprocess exits. 2. `torch.xpu.synchronize()` does not work because it only syncs the `reserved streams` (see [the comment](https://github.com/pytorch/pytorch/blob/19d01a1ef0c0d65768eb0a5c97a25328eec57fbd/c10/xpu/XPUStream.cpp#L249)). According to my tests, our print kernels were not waited on, so I added a `launch_exit_hook` to wait on that queue.
diff --git a/python/test/unit/language/print_helper.py b/python/test/unit/language/print_helper.py
@@ -99,6 +99,16 @@ def test_print(func: str, data_type: str, device: str):
 
     x = torch.arange(0, N, dtype=torch.int32, device=device).to(getattr(torch, data_type))
     y = torch.zeros((N, ), dtype=x.dtype, device=device)
+
+    if device == "xpu":
+
+        def exit_hook(lazy_dict: triton.compiler.LazyDict):
+            # Need this for xpu device to capture print results before child process exit
+            # torch.xpu.synchronize() does not work because it just sync on reserved stream
+            triton.runtime.driver.active.utils.wait()
+
+        triton.compiler.CompiledKernel.launch_exit_hook = exit_hook
+
     if func == "device_print":
         kernel_device_print[(1, )](x, y, num_warps=num_warps, BLOCK=N)
     elif func == "device_print_scalar":
@@ -130,20 +140,15 @@ def test_print(func: str, data_type: str, device: str):
         kernel_print_pointer[(1, )](x, y, num_warps=num_warps, BLOCK=N)
     else:
         assert f"Unknown kernel: {func}"
-
-    if device == "xpu":
-        # FIXME: remove trigger to get output from kernel
-        repr(x)
-        repr(y)
-
     if func != "print_no_arg" and func != "no_arg_print" and func != "device_print_large" and \
        func != "print_multiple_args" and func != "device_print_multiple_args" and \
        func != "device_print_pointer" and func != "device_print_scalar":
         assert_close(y, x)
 
     # Wait until driver complete all the jobs for the device_print, especially test_subprocess
     # require this which captures stdout when child exits.
-    getattr(torch, device).synchronize()
+    if device != "xpu":
+        getattr(torch, device).synchronize()
 
 
 if __name__ == "__main__":
diff --git a/scripts/skiplist/conda/subprocess.txt b/scripts/skiplist/conda/subprocess.txt
@@ -1,6 +0,0 @@
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float32]
-test/unit/language/test_subprocess.py::test_print[device_print-float32]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float16]
-test/unit/language/test_subprocess.py::test_print[device_print-float16]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float64]
-test/unit/language/test_subprocess.py::test_print[device_print-float64]
diff --git a/scripts/skiplist/default/subprocess.txt b/scripts/skiplist/default/subprocess.txt
@@ -1,9 +0,0 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/800
-test/unit/language/test_subprocess.py::test_print[device_print-float16]
-test/unit/language/test_subprocess.py::test_print[device_print-float32]
-test/unit/language/test_subprocess.py::test_print[device_print-float64]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float16]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float64]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float32]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/1704
-test/unit/language/test_subprocess.py::test_print[device_print_uint-uint32]
diff --git a/scripts/skiplist/lts/subprocess.txt b/scripts/skiplist/lts/subprocess.txt
@@ -1,9 +0,0 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/800
-test/unit/language/test_subprocess.py::test_print[device_print-float16]
-test/unit/language/test_subprocess.py::test_print[device_print-float32]
-test/unit/language/test_subprocess.py::test_print[device_print-float64]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float16]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float64]
-test/unit/language/test_subprocess.py::test_print[device_print_scalar-float32]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/1704
-test/unit/language/test_subprocess.py::test_print[device_print_uint-uint32]
diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
@@ -346,6 +346,19 @@ static PyObject *initDevices(PyObject *self, PyObject *args) {
   return Py_BuildValue("(i)", deviceCount);
 }
 
+// Call wait() on the sycl::queue whose address is passed from Python.
+//
+// Python signature: wait_on_sycl_queue(queue_ptr) -> None
+//   queue_ptr: integer holding the address of a sycl::queue.
+// Returns None on success; returns NULL with a Python exception set when the
+// argument cannot be parsed, cannot be converted to a pointer, or is 0.
+static PyObject *waitOnSYCLQueue(PyObject *self, PyObject *args) {
+  PyObject *cap;
+  if (!PyArg_ParseTuple(args, "O", &cap))
+    return NULL;
+
+  void *queue = PyLong_AsVoidPtr(cap);
+  if (!queue) {
+    // PyLong_AsVoidPtr yields NULL both on conversion failure (exception
+    // already set) and for a genuine 0 address (no exception set). Returning
+    // NULL without a pending exception would surface as SystemError, so
+    // report the null pointer explicitly in that case.
+    if (!PyErr_Occurred())
+      PyErr_SetString(PyExc_ValueError, "sycl::queue pointer must not be NULL");
+    return NULL;
+  }
+
+  sycl::queue *sycl_queue = static_cast<sycl::queue *>(queue);
+  sycl_queue->wait();
+
+  // Py_RETURN_NONE hands the caller a properly counted reference; a bare
+  // `return Py_None;` returns a borrowed reference as if it were owned.
+  Py_RETURN_NONE;
+}
+
 static PyMethodDef ModuleMethods[] = {
     {"load_binary", loadBinary, METH_VARARGS,
      "Load provided SPV into ZE driver"},
@@ -355,6 +368,8 @@ static PyMethodDef ModuleMethods[] = {
      "Initialize the ZE GPU context"},
     {"init_devices", initDevices, METH_VARARGS,
      "Initialize the ZE GPU devices and return device count"},
+    {"wait_on_sycl_queue", waitOnSYCLQueue, METH_VARARGS,
+     "call wait on a specific sycl::queue"},
     {NULL, NULL, 0, NULL} // sentinel
 };
 
diff --git a/third_party/intel/backend/driver.py b/third_party/intel/backend/driver.py
@@ -159,6 +159,7 @@ def __init__(self):
         self.context = mod.init_context(self.get_sycl_queue())
         self.device_count = mod.init_devices(self.get_sycl_queue())
         self.current_device = 0 if self.device_count[0] > 0 else -1
+        self.wait_on_sycl_queue = mod.wait_on_sycl_queue
 
     def get_current_device(self):
         return self.current_device
@@ -167,6 +168,9 @@ def get_sycl_queue(self):
         import torch
         return torch.xpu.current_stream().sycl_queue
 
+    def wait(self):
+        self.wait_on_sycl_queue(self.get_sycl_queue())
+
 
 # ------------------------
 # Launcher