[BLAS] SYCL-Graph integration for native-command

Ewan Crawford · Ewan Crawford · commit 6e3c97c7ec24 · 2025-05-30T10:42:56.000+01:00
In order to support applications calling the library
with a SYCL queue that is recording to a SYCL-Graph, check if
the `ext_codeplay_enqueue_native_command` command-group is being
recorded to a graph object. If so, use the native stream recording
APIs to add the BLAS calls as nodes in the graph.

In particular this fixes the llama.cpp unit test
`MUL_MAT(type_a=f16,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],v=0)`
on CUDA with SYCL-Graph enabled. Previously this would throw an error:

```sh
$ GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0 -o MUL_MAT -p type_a=f16,type_b=f32,m=16,n=1,k=256,bs=\\[1,1\\],nr=\\[2

UR CUDA ERROR:
        Value:           700
        Name:            CUDA_ERROR_ILLEGAL_ADDRESS
        Description:     an illegal memory access was encountered
        Function:        operator()
        Source Location: $HOME/dpcpp/unified-runtime/source/adapters/cuda/queue.cpp:154

Native API failed. Native API returns: 2147483646 (UR_RESULT_ERROR_UNKNOWN)
Exception caught at file:$HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp, line:3598, func:operator()
SYCL error: CHECK_TRY_ERROR((stream)->wait()): Meet error in this line code!
  in function ggml_backend_sycl_synchronize at $HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp:3598
$HOME/llama.cpp/ggml/src/ggml-sycl/../ggml-sycl/common.hpp:118: SYCL error
Could not attach to process.  If your uid matches the uid of the target
process, check the setting of /proc/sys/kernel/yama/ptrace_scope, or try
again as the root user.  For more details, see /etc/sysctl.d/10-ptrace.conf
ptrace: Operation not permitted.
No stack.
The program is not being run.
```
diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp
@@ -32,36 +32,80 @@ namespace cublas {
  */
 thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{};
 
-CublasScopedContextHandler::CublasScopedContextHandler(sycl::interop_handle& ih) : ih(ih) {}
+CublasScopedContextHandler::CublasScopedContextHandler(sycl::interop_handle& ih) : ih(ih) {
+    // Initialize the `streamId` member to the CUstream associated with the
+    // queue that `ih` has been submitted to.
+    streamId = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
 
-cublasHandle_t CublasScopedContextHandler::get_handle() {
+    // Initialize the `cublasHandle_t` member `nativeHandle`
     CUdevice device = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
-    CUstream streamId = get_stream();
-    cublasStatus_t err;
-
     auto it = handle_helper.cublas_handle_mapper_.find(device);
     if (it != handle_helper.cublas_handle_mapper_.end()) {
-        cublasHandle_t nativeHandle = it->second;
+        // Use existing handle if one already exists for the device, but update
+        // the native stream.
+        nativeHandle = it->second;
         cudaStream_t currentStreamId;
+        cublasStatus_t err;
         CUBLAS_ERROR_FUNC(cublasGetStream, err, nativeHandle, &currentStreamId);
         if (currentStreamId != streamId) {
             CUBLAS_ERROR_FUNC(cublasSetStream, err, nativeHandle, streamId);
         }
-        return nativeHandle;
     }
-
-    cublasHandle_t nativeHandle;
-    CUBLAS_ERROR_FUNC(cublasCreate, err, &nativeHandle);
-    CUBLAS_ERROR_FUNC(cublasSetStream, err, nativeHandle, streamId);
-
-    auto insert_iter =
+    else {
+        // Create a new handle if one doesn't already exist for the device
+        cublasStatus_t err;
+        CUBLAS_ERROR_FUNC(cublasCreate, err, &nativeHandle);
+        CUBLAS_ERROR_FUNC(cublasSetStream, err, nativeHandle, streamId);
         handle_helper.cublas_handle_mapper_.insert(std::make_pair(device, nativeHandle));
+    }
+}
 
-    return nativeHandle;
+void CublasScopedContextHandler::begin_recording_if_graph() {
+// interop_handle graph methods only available from extension version 2
+#if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
+    if (!ih.ext_codeplay_has_graph()) {
+        return;
+    }
+
+    CUresult err;
+#if CUDA_VERSION >= 12030
+    // From CUDA 12.3 onwards we can use cuStreamBeginCaptureToGraph to capture
+    // the stream directly in the native graph, rather than needing to
+    // instantiate the stream capture as a new graph.
+    auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
+    CUDA_ERROR_FUNC(cuStreamBeginCaptureToGraph, err, streamId, graph, nullptr, nullptr, 0,
+                    CU_STREAM_CAPTURE_MODE_GLOBAL);
+#else
+    CUDA_ERROR_FUNC(cuStreamBeginCapture, err, streamId, CU_STREAM_CAPTURE_MODE_GLOBAL);
+#endif // CUDA_VERSION
+#endif // SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
 }
 
-CUstream CublasScopedContextHandler::get_stream() {
-    return ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+void CublasScopedContextHandler::end_recording_if_graph() {
+// interop_handle graph methods only available from extension version 2
+#if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
+    if (!ih.ext_codeplay_has_graph()) {
+        return;
+    }
+
+    auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
+    CUresult err;
+#if CUDA_VERSION >= 12030
+    CUDA_ERROR_FUNC(cuStreamEndCapture, err, streamId, &graph);
+#else
+    // cuStreamEndCapture returns a new graph, if we overwrite
+    // "graph" it won't be picked up by the SYCL runtime, as
+    // "ext_codeplay_get_native_graph" returns a passed-by-value pointer.
+    CUgraph recorded_graph;
+    CUDA_ERROR_FUNC(cuStreamEndCapture, err, streamId, &recorded_graph);
+
+    // Add graph to native graph as a child node
+    // Need to return a node object for the node to be created,
+    // can't be nullptr.
+    CUgraphNode node;
+    CUDA_ERROR_FUNC(cuGraphAddChildGraphNode, err, &node, graph, nullptr, 0, recorded_graph);
+#endif // CUDA_VERSION
+#endif // SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
 }
 } // namespace cublas
 } // namespace blas
diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp
@@ -63,18 +63,49 @@ the handle must be destroyed when the context goes out of scope. This will bind
 class CublasScopedContextHandler {
     sycl::interop_handle& ih;
     static thread_local cublas_handle handle_helper;
-    CUstream get_stream();
+    cublasHandle_t nativeHandle;
+    // Cache the native CUstream when the `CublasScopedContextHandler` object
+    // is constructed. This avoids calling `ih.get_native_queue()` multiple
+    // times, which isn't guaranteed to return the same CUstream handle each
+    // time; a mismatch causes problems when trying to start/end CUDA
+    // stream recording to a graph.
+    CUstream streamId;
 
 public:
+    /**
+    * @brief Constructor
+    * @detail Creates the cublasHandle_t by implicitly imposing the advice
+    * given by NVIDIA for creating a cublas_handle (e.g. one cuStream per device
+    * per thread).
+    */
     CublasScopedContextHandler(sycl::interop_handle& ih);
 
     /**
-   * @brief get_handle: creates the handle by implicitly impose the advice
-   * given by nvidia for creating a cublas_handle. (e.g. one cuStream per device
-   * per thread).
-   * @return cublasHandle_t a handle to construct cublas routines
-   */
-    cublasHandle_t get_handle();
+    * @brief Start recording cuBLAS calls to a graph.
+    * @detail Checks if the command-group associated with \p ih is being added
+    * to a graph, and if so, begins stream recording of the native CUDA stream
+    * backing the queue \p ih was submitted to, into the native cuda-graph object.
+    */
+    void begin_recording_if_graph();
+
+    /**
+    * @brief End recording cuBLAS calls to a graph.
+    * @detail Checks if the command-group associated with \p ih is being added
+    * to a graph, and if so, ends stream recording of the native CUDA stream
+    * backing the queue \p ih was submitted to, doing any extra work needed
+    * to ensure that the stream-recorded calls get added as nodes to
+    * the native graph object associated with \p ih.
+    * @note Takes no arguments: recording is ended on the native stream
+    * backing the queue, which was cached when this object was constructed.
+    */
+    void end_recording_if_graph();
+
+    /// @brief Query the cuBLAS handle created on construction
+    /// @return cublasHandle_t a handle to construct cublas routines
+    cublasHandle_t get_handle() const {
+        return nativeHandle;
+    }
+
     // This is a work-around function for reinterpret_casting the memory. This
     // will be fixed when SYCL-2020 has been implemented for Pi backend.
     template <typename T, typename U>
diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp
@@ -61,7 +61,9 @@ static inline void host_task_internal(H& cgh, F f) {
     cgh.host_task([f](sycl::interop_handle ih) {
 #endif
         auto sc = CublasScopedContextHandler(ih);
+        sc.begin_recording_if_graph();
         f(sc);
+        sc.end_recording_if_graph();
     });
 }
 #endif
diff --git a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp
@@ -48,7 +48,7 @@ extern std::vector<sycl::device*> devices;
 namespace {
 
 template <typename Ta, typename Tb, typename Tc, typename Ts>
-int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
+int test(device* dev, oneapi::math::layout layout, int64_t group_count, bool graph_record = false) {
     // Catch asynchronous exceptions.
     auto exception_handler = [](exception_list exceptions) {
         for (std::exception_ptr const& e : exceptions) {
@@ -247,6 +247,13 @@ int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
 
     try {
 #ifdef CALL_RT_API
+        namespace sycl_exp = sycl::ext::oneapi::experimental;
+        using modifiable_graph = sycl_exp::command_graph<sycl_exp::graph_state::modifiable>;
+        std::unique_ptr<modifiable_graph> graph;
+        if (graph_record) {
+            graph = std::make_unique<modifiable_graph>(main_queue);
+            graph->begin_recording(main_queue);
+        }
         switch (layout) {
             case oneapi::math::layout::col_major:
                 done = oneapi::math::blas::column_major::gemm_batch(
@@ -262,7 +269,15 @@ int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
                 break;
             default: break;
         }
-        done.wait_and_throw();
+
+        if (graph_record) {
+            graph->end_recording(main_queue);
+            auto exec_graph = graph->finalize();
+            main_queue.ext_oneapi_graph(exec_graph).wait_and_throw();
+        }
+        else {
+            done.wait_and_throw();
+        }
 #else
         switch (layout) {
             case oneapi::math::layout::col_major:
@@ -365,58 +380,65 @@ int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
 }
 
 class GemmBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::math::layout>> {};
+        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::math::layout, bool>> {
+    virtual void SetUp() override {
+        // Skip test if graph recording variant and device doesn't support sycl_ext_oneapi_graph
+        if (std::get<2>(GetParam())) {
+            CHECK_GRAPH_ON_DEVICE(std::get<0>(GetParam()));
+        }
+    }
+};
 
 TEST_P(GemmBatchUsmTests, RealHalfPrecision) {
     EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
+        std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, HalfHalfFloatPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, float, float>(std::get<0>(GetParam()),
-                                                                  std::get<1>(GetParam()), 5)));
+    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, float, float>(
+        std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, Int8Int8SinglePrecision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, float, float>(std::get<0>(GetParam()),
-                                                                    std::get<1>(GetParam()), 5)));
+    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, float, float>(
+        std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, Int8Int8Int32Precision) {
     EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, std::int32_t, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
+        std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float, float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
+    EXPECT_TRUEORSKIP((test<float, float, float, float>(
+        std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, RealDoublePrecision) {
     CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
 
-    EXPECT_TRUEORSKIP((
-        test<double, double, double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
+    EXPECT_TRUEORSKIP((test<double, double, double, double>(
+        std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, ComplexSinglePrecision) {
     EXPECT_TRUEORSKIP(
         (test<std::complex<float>, std::complex<float>, std::complex<float>, std::complex<float>>(
-            std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
+            std::get<0>(GetParam()), std::get<1>(GetParam()), 5, std::get<2>(GetParam()))));
 }
 
 TEST_P(GemmBatchUsmTests, ComplexDoublePrecision) {
     CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
 
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, std::complex<double>, std::complex<double>,
-              std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>, std::complex<double>,
+                            std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                                                  5, std::get<2>(GetParam()))));
 }
 
 INSTANTIATE_TEST_SUITE_P(GemmBatchUsmTestSuite, GemmBatchUsmTests,
                          ::testing::Combine(testing::ValuesIn(devices),
                                             testing::Values(oneapi::math::layout::col_major,
-                                                            oneapi::math::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
+                                                            oneapi::math::layout::row_major),
+                                            testing::Values(true, false)),
+                         ::LayoutGraphDeviceNamePrint());
 } // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/gemm_usm.cpp b/tests/unit_tests/blas/level3/gemm_usm.cpp
diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp