129 changes: 106 additions & 23 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -30,6 +30,9 @@
#include <regex>

#include <sycl/sycl.hpp>
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
#endif
#include <sycl/half_type.hpp>

#include "ggml-sycl.h"
@@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_disable_async_mem_alloc = 0;

static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@@ -237,7 +241,21 @@ static void ggml_check_sycl() try {
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/

// Currently, async malloc / free is only used when graphs are enabled, as that is required for the calls to
// be recorded properly. As this SYCL extension matures, it may become beneficial to make it the default path
// and to use it in other places as well.
g_ggml_sycl_disable_async_mem_alloc =
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
g_ggml_sycl_disable_graph;
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count() && !g_ggml_sycl_disable_async_mem_alloc;
++i) {
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
g_ggml_sycl_disable_async_mem_alloc = 1;
}
}
#else
1;
#endif
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
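
// --- Sketch (not part of the patch): the capability probe performed above via dpct::dev_mgr,
// rewritten against plain SYCL for clarity. Async allocation stays enabled only if every visible
// device reports the extension aspect.
#include <sycl/sycl.hpp>

static bool all_devices_support_async_alloc() {
    for (const auto & dev : sycl::device::get_devices()) {
        if (!dev.has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
            return false;  // a single unsupported device disables the async path
        }
    }
    return true;
}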
@@ -3031,19 +3049,54 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
}
}

// Helper functions to unify device memory allocation for both async and sync paths
static inline void * sycl_malloc_opt_async(dpct::queue_ptr stream,
sycl::usm::alloc alloc_type,
size_t size,
bool use_async) {
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
if (use_async) {
return syclex::async_malloc(*stream, alloc_type, size);
}
#else
// If the async allocation extension is not available, we should never have passed use_async=true
GGML_ASSERT(!use_async);
#endif
return sycl::malloc(size, *stream, alloc_type);
}

static inline void sycl_free_opt_async(dpct::queue_ptr stream, void * ptr, bool use_async) {
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
if (use_async) {
syclex::async_free(*stream, ptr);
return;
}
#else
// If the async allocation extension is not available, we should never have passed use_async=true
GGML_ASSERT(!use_async);
#endif
sycl::free(ptr, *stream);
}
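
// --- Sketch (not part of the patch): how a call site is expected to use the helpers above.
// `nbytes` is an illustrative placeholder; the pattern mirrors the reorder functions below,
// where events are only waited on from the host in the synchronous path.
static void example_alloc_roundtrip(dpct::queue_ptr stream, size_t nbytes) {
    const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
    void * buf = sycl_malloc_opt_async(stream, sycl::usm::alloc::device, nbytes, use_async);
    sycl::event ev = stream->memset(buf, 0, nbytes);
    if (!use_async) {
        ev.wait();  // host waits are not allowed while recording to a SYCL graph
    }
    sycl_free_opt_async(stream, buf, use_async);
}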

static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
.wait()));
const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
uint8_t * tmp_buf =
static_cast<uint8_t *>(sycl_malloc_opt_async(stream, sycl::usm::alloc::device, size, use_async));

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!use_async) {
copy_event.wait();
}

GGML_ASSERT((size % sizeof(block_q4_0) == 0));
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
int offset_blks = offset / sizeof(block_q4_0);
auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

stream->parallel_for(
auto reorder_event = stream->parallel_for(
size / sizeof(block_q4_0),
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0* x = (const block_q4_0*)tmp_buf;
@@ -3054,9 +3107,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
}
*(d_ptr + ib) = x[ib].d;
}).wait_and_throw();

sycl::free(tmp_buf, *stream);
});
if (!use_async) {
reorder_event.wait_and_throw();
}
sycl_free_opt_async(stream, tmp_buf, use_async);
}
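
// --- Sketch (not part of the patch): a host-side reference of the AoS -> SoA transformation the
// kernel above performs, with block_q4_0 fields as defined in ggml-common.h ({ d; qs[QK4_0 / 2]; }).
// All quantized nibbles become one contiguous region, followed by all half-precision scales.
static void reorder_q4_0_reference(const block_q4_0 * src, uint8_t * dst_qs,
                                   sycl::half * dst_d, int nblocks) {
    for (int ib = 0; ib < nblocks; ++ib) {
        for (int j = 0; j < QK4_0 / 2; ++j) {
            dst_qs[ib * QK4_0 / 2 + j] = src[ib].qs[j];  // nibbles, contiguous
        }
        dst_d[ib] = src[ib].d;  // scales appended after all qs bytes
    }
}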

static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3065,14 +3120,21 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {

const int nblocks = size / sizeof(block_q4_K);

auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
uint8_t * tmp_buf =
static_cast<uint8_t *>(sycl_malloc_opt_async(stream, sycl::usm::alloc::device, size, use_async));

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!use_async) {
copy_event.wait();
}

auto * qs_ptr = data_device;
auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);

stream->parallel_for(nblocks, [=](auto i) {
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q4_K * x = (const block_q4_K *) tmp_buf;
const int ib = i;

@@ -3085,9 +3147,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
}

dm_ptr[ib] = x[ib].dm;
}).wait_and_throw();

sycl::free(tmp_buf, *stream);
});
if (!use_async) {
reorder_event.wait_and_throw();
}
sycl_free_opt_async(stream, tmp_buf, use_async);
}

static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3096,16 +3160,22 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {

const int nblocks = size / sizeof(block_q6_K);

auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
const bool use_async = !g_ggml_sycl_disable_async_mem_alloc;
uint8_t * tmp_buf =
static_cast<uint8_t *>(sycl_malloc_opt_async(stream, sycl::usm::alloc::device, size, use_async));

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!use_async) {
copy_event.wait();
}

auto * ql_ptr = data_device;
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);

stream
->parallel_for(nblocks,
auto reorder_event = stream->parallel_for(nblocks,
[=](auto i) {
const block_q6_K * x = (const block_q6_K *) tmp_buf;
const int ib = i;
@@ -3128,10 +3198,11 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
}

dm_ptr[ib] = x[ib].d;
})
.wait_and_throw();

sycl::free(tmp_buf, *stream);
});
if (!use_async) {
reorder_event.wait_and_throw();
}
sycl_free_opt_async(stream, tmp_buf, use_async);
}

static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
@@ -4041,6 +4112,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
ggml_op_name(node_op));
return false;
case GGML_OP_MUL_MAT:
// We cannot use graphs with ggml_sycl_mul_mat() when the SYCL async memory allocation extension is not
// available, because the reordering path performs SYCL malloc / free and host wait calls, none of which
// are supported while recording to a graph.
if (g_ggml_sycl_disable_async_mem_alloc) {
GGML_LOG_INFO(
"%s: disabling SYCL graphs due to unsupported node type %s when using a compiler without the "
"oneAPI async memory allocation extension\n",
__func__, ggml_op_name(node_op));
return false;
}
}
}
return true;
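
// --- Sketch (not part of the patch; API per the experimental sycl_ext_oneapi_graph and
// async_alloc extensions already used by this backend): why the MUL_MAT check above is needed.
// Only async allocations are recordable; a plain sycl::malloc or a host-side wait inside the
// recorded region would make graph capture fail.
static void record_alloc_example(sycl::queue & q, size_t nbytes) {
    namespace syclex = sycl::ext::oneapi::experimental;
    syclex::command_graph<syclex::graph_state::modifiable> graph{q.get_context(), q.get_device()};
    graph.begin_recording(q);
    void * tmp = syclex::async_malloc(q, sycl::usm::alloc::device, nbytes);  // recordable
    // q.wait() or sycl::malloc_device() here would be rejected during recording
    syclex::async_free(q, tmp);
    graph.end_recording();
    auto exec = graph.finalize();
    q.ext_oneapi_graph(exec);  // replay the captured work
}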