From f4e1782fea7d51d6db9e8ef7908c4e01c12a1f86 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Fri, 17 Oct 2025 14:15:27 -0700
Subject: [PATCH] sycl: use async memory allocation to fix graph recording failures

GGML_SYCL_DISABLE_GRAPHS=0 causes crashes because:

- Host waits are currently unsupported in graph recording mode.
- SYCL malloc / free calls are unsupported in graph recording mode.

The following changes are made to fix SYCL graph functionality:

- When graphs are enabled, use the SYCL async memory extension for temp
  buffers, which is supported with SYCL graphs.
- For compiler versions that do not support this extension, skip graphs
  with the affected op.
- Switch from USM shared to device memory, as the async extension
  currently supports only device allocations.
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 129 +++++++++++++++++++++++++------
 1 file changed, 106 insertions(+), 23 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index a7e077ec8ebe0..45cd0aa35e1d1 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -30,6 +30,9 @@
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
+#endif
 #include <syclcompat/math.hpp>
 
 #include "ggml-sycl.h"
@@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
+int g_ggml_sycl_disable_async_mem_alloc = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -237,7 +241,21 @@ static void ggml_check_sycl() try {
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
-
+    // Currently, we only use async malloc / free when graphs are enabled, as it is required for the calls to
+    // be properly recorded. As this SYCL extension matures, it may be beneficial to make it the default path
+    // and to use it in other places.
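+    // Note: the async path stays enabled only if every visible device reports
+    // sycl::aspect::ext_oneapi_async_memory_alloc; a single device without the
+    // aspect disables it for all devices (see the loop below).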
+    g_ggml_sycl_disable_async_mem_alloc =
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+        g_ggml_sycl_disable_graph;
+    for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count() && !g_ggml_sycl_disable_async_mem_alloc;
+         ++i) {
+        if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
+            g_ggml_sycl_disable_async_mem_alloc = 1;
+        }
+    }
+#else
+        1;
+#endif
 
     if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                             dpct::dev_mgr::instance().device_count()) != 0) {
         initialized = true;
@@ -3031,19 +3049,54 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+// Helper functions to unify device memory allocation for both async and sync paths
+static inline void * sycl_malloc_opt_async(dpct::queue_ptr stream,
+                                           sycl::usm::alloc alloc_type,
+                                           size_t size,
+                                           bool use_async) {
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        return syclex::async_malloc(*stream, alloc_type, size);
+    }
+#else
+    // If the async allocation extension is not available, use_async must never be true.
+    GGML_ASSERT(!use_async);
+#endif
+    return sycl::malloc(size, *stream, alloc_type);
+}
+
+static inline void sycl_free_opt_async(dpct::queue_ptr stream, void * ptr, bool use_async) {
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        syclex::async_free(*stream, ptr);
+        return;
+    }
+#else
+    // If the async allocation extension is not available, use_async must never be true.
+    GGML_ASSERT(!use_async);
+#endif
+    sycl::free(ptr, *stream);
+}
+
 static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size,
                             size_t offset, dpct::queue_ptr stream) {
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
+    const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
+    uint8_t * tmp_buf =
+        static_cast<uint8_t *>(sycl_malloc_opt_async(stream, sycl::usm::alloc::device, size, use_async));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!use_async) {
+        copy_event.wait();
+    }
+
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
     auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
-    stream->parallel_for(
+    auto reorder_event = stream->parallel_for(
         size / sizeof(block_q4_0),
         [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             const block_q4_0* x = (const block_q4_0*)tmp_buf;
@@ -3054,9 +3107,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
                 *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
             }
             *(d_ptr + ib) = x[ib].d;
-        }).wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+        });
+    if (!use_async) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_free_opt_async(stream, tmp_buf, use_async);
 }
 
 static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3065,14 +3120,21 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
     GGML_ASSERT(size % sizeof(block_q4_K) == 0);
     GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
 
     const int nblocks = size / sizeof(block_q4_K);
 
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
+    uint8_t * tmp_buf =
+        static_cast<uint8_t *>(sycl_malloc_opt_async(stream, sycl::usm::alloc::device, size, use_async));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!use_async) {
+        copy_event.wait();
+    }
 
     auto * qs_ptr = data_device;
     auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
     auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
 
-    stream->parallel_for(nblocks, [=](auto i) {
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
         const block_q4_K * x = (const block_q4_K *) tmp_buf;
         const int ib = i;
 
@@ -3085,9 +3147,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
         }
 
         dm_ptr[ib] = x[ib].dm;
-    }).wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+    });
+    if (!use_async) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_free_opt_async(stream, tmp_buf, use_async);
 }
 
 static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3096,16 +3160,22 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
     GGML_ASSERT(size % sizeof(block_q6_K) == 0);
     GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
 
     const int nblocks = size / sizeof(block_q6_K);
 
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
+    uint8_t * tmp_buf =
+        static_cast<uint8_t *>(sycl_malloc_opt_async(stream, sycl::usm::alloc::device, size, use_async));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!use_async) {
+        copy_event.wait();
+    }
 
     auto * ql_ptr = data_device;
     auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
     auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
     sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
 
-    stream
-        ->parallel_for(nblocks,
+    auto reorder_event = stream->parallel_for(nblocks,
                        [=](auto i) {
                            const block_q6_K * x = (const block_q6_K *) tmp_buf;
                            const int ib = i;
@@ -3128,10 +3198,11 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
                            }
 
                            dm_ptr[ib] = x[ib].d;
-                       })
-        .wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+                       });
+    if (!use_async) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_free_opt_async(stream, tmp_buf, use_async);
 }
 
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
@@ -4041,6 +4112,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
                 GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
                               ggml_op_name(node_op));
                 return false;
+            case GGML_OP_MUL_MAT:
+                // We cannot use graphs with ggml_sycl_mul_mat() when the SYCL async memory allocation
+                // extension is not available, as the weight-reordering path then relies on SYCL
+                // malloc / free and host wait calls, none of which are supported while recording
+                // to a graph.
+                if (g_ggml_sycl_disable_async_mem_alloc) {
+                    GGML_LOG_INFO(
+                        "%s: disabling SYCL graphs due to unsupported node type %s when using a compiler "
+                        "without the oneAPI async memory allocation extension\n",
+                        __func__, ggml_op_name(node_op));
+                    return false;
+                }
         }
     }
     return true;
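Note (illustrative, not part of the patch): the compile-time plus runtime
detection added in ggml_check_sycl() can be exercised standalone. Only
SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC and
sycl::aspect::ext_oneapi_async_memory_alloc come from the extension; the
program itself is a hypothetical sketch.

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main() {
        bool use_async = false;
    #if defined(SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC)
        // Compiler support: sycl.hpp defines the feature-test macro.
        use_async = true;
        // Device support: mirror the patch and require the aspect on every
        // visible device before enabling the async path.
        for (const auto & dev : sycl::device::get_devices()) {
            use_async = use_async && dev.has(sycl::aspect::ext_oneapi_async_memory_alloc);
        }
    #endif
        std::printf("async device allocations: %s\n", use_async ? "on" : "off");
        return 0;
    }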
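A second sketch of why the async entry points matter: allocations made with
syclex::async_malloc() / syclex::async_free() are enqueued on the stream, so
they can be captured while a queue is being recorded to a SYCL graph, whereas
sycl::malloc() / sycl::free() (and host waits) cannot. Assumes a compiler
providing the sycl_ext_oneapi_graph and sycl_ext_oneapi_async_memory_alloc
extensions; the buffer size and kernel are placeholders.

    #include <sycl/sycl.hpp>
    #include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>

    namespace syclex = sycl::ext::oneapi::experimental;

    int main() {
        sycl::queue q{sycl::property::queue::in_order{}};
        syclex::command_graph graph(q.get_context(), q.get_device());

        graph.begin_recording(q);
        // Recorded as graph nodes: alloc -> kernel -> free.
        void * tmp = syclex::async_malloc(q, sycl::usm::alloc::device, 256 * sizeof(float));
        q.parallel_for(sycl::range<1>{256}, [=](sycl::id<1> i) {
            static_cast<float *>(tmp)[i] = 0.0f;  // placeholder work on the temp buffer
        });
        syclex::async_free(q, tmp);
        graph.end_recording(q);

        auto exec = graph.finalize();
        q.ext_oneapi_graph(exec);  // replay the captured sequence
        q.wait();
        return 0;
    }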