From 857067a9c77315c2f12712813cecd714c0440abc Mon Sep 17 00:00:00 2001 From: Akarshan Date: Mon, 7 Jul 2025 14:01:02 +0530 Subject: [PATCH 1/6] SYCL: Initial set_rows kernel implementation --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 6 +- ggml/src/ggml-sycl/set_rows.cpp | 143 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/set_rows.hpp | 8 ++ 4 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-sycl/set_rows.cpp create mode 100644 ggml/src/ggml-sycl/set_rows.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index f78a36ddf8f66..f839a42bc90c9 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -30,6 +30,7 @@ #include "outprod.hpp" #include "quants.hpp" #include "rope.hpp" +#include "set_rows.hpp" #include "softmax.hpp" #include "tsembd.hpp" #include "wkv.hpp" diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 21c81e99a19aa..7244521c0dcc3 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -41,6 +41,7 @@ #include "ggml-sycl/element_wise.hpp" #include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" +#include "ggml-sycl/set_rows.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/getrows.hpp" #include "ggml.h" @@ -3603,6 +3604,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_GET_ROWS: ggml_sycl_get_rows(ctx, dst); break; + case GGML_OP_SET_ROWS: + ggml_sycl_op_set_rows(ctx, dst); + break; case GGML_OP_DUP: ggml_sycl_dup(ctx, dst); break; @@ -4297,7 +4301,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g { // TODO: add support // ref: https://github.com/ggml-org/llama.cpp/pull/14274 - return false; + return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64)); } break; case GGML_OP_CPY: { diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp new file mode 100644 index 0000000000000..e6d667d93ab3d --- /dev/null +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -0,0 +1,143 @@ +#include "set_rows.hpp" + +typedef void (*set_rows_kernel_t)(const char * src, char * dst); + +static void set_rows_1_f32_f32(const char * src, char * dst) { + const float * src_f = (const float *) src; + float * dst_f = (float *) dst; + *dst_f = *src_f; +} + +static void set_rows_1_f32_f16(const char * src, char * dst) { + const float * src_f = (const float *) src; + sycl::half * dst_h = (sycl::half *) dst; + *dst_h = sycl::vec(*src_f).convert()[0]; +} + +template +static void k_set_rows( + const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst, + const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12, + const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + const size_t src_type_size, const size_t dst_type_size, + const sycl::nd_item<3> & item_ct1) { + + const int i03 = item_ct1.get_group(0); + const int i02 = item_ct1.get_group(1); + const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index + + if (i01 >= ne01) { + return; + } + + const int i12 = i03 % ne12; + const int i11 = i02 % ne11; + const int i10 = i01; + + const int64_t dst_row = *(const int64_t *)((const char *)src1 + i10*nb10 + i11*nb11 + i12*nb12); + + const char * src0_row = src0 + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3; + // Optimize for same-type operations: use collective memory copy + if (src_type_size == dst_type_size) { + // All threads in the work-group cooperatively copy the row + const size_t row_bytes = ne00 * src_type_size; + // Each thread copies a chunk of the row + for (size_t byte_idx = item_ct1.get_local_id(0); byte_idx < row_bytes; byte_idx += item_ct1.get_local_range(0)) { + dst_row_ptr[byte_idx] = src0_row[byte_idx]; + } + } else { + // Type conversion required, use element-wise approach + for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) { + const char * src_elem = src0_row + col * src_type_size; + char * dst_elem = dst_row_ptr + col * dst_type_size; + set_rows_1(src_elem, dst_elem); + } + } +} + +template +static void set_rows_sycl( + const char * src0_d, const int64_t * src1_d, char * dst_d, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + const size_t src_type_size, const size_t dst_type_size, + queue_ptr stream) { + + const int max_threads_per_row = 128; // KEEPING 128 for now + const int threads_per_row = std::min((int)ne00, max_threads_per_row); + + const int max_threads_per_block = 128; + const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row); + + const sycl::range<3> block_size(1, rows_per_block, threads_per_row); + const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block); + + if (ne01 > 0 && ne00 > 0) { + sycl_parallel_for( + stream, + sycl::nd_range<3>(grid_size * block_size, block_size), + [=](sycl::nd_item<3> item_ct1) { + k_set_rows( + src0_d, src1_d, dst_d, + ne00, ne01, ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + src_type_size, dst_type_size, + item_ct1 + ); + } + ); + } +} + + +void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t * src1_dd = static_cast(src1->data); + + dpct::queue_ptr stream = ctx.stream(); + switch (dst->type) { + case GGML_TYPE_F32: + set_rows_sycl( + (const char *)dst->src[0]->data, src1_dd, (char *)dst->data, + ne00, ne01, ne02, ne03, + ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + sizeof(float), sizeof(float), + stream + ); + break; + case GGML_TYPE_F16: + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + set_rows_sycl( + (const char *)dst->src[0]->data, src1_dd, (char *)dst->data, + ne00, ne01, ne02, ne03, + ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + sizeof(float), sizeof(sycl::half), + stream + ); + break; + default: + GGML_ABORT("Unsupported tensor type!"); + break; + } +} diff --git a/ggml/src/ggml-sycl/set_rows.hpp b/ggml/src/ggml-sycl/set_rows.hpp new file mode 100644 index 0000000000000..27fcc8f90175b --- /dev/null +++ b/ggml/src/ggml-sycl/set_rows.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_SET_ROWS_HPP +#define GGML_SYCL_SET_ROWS_HPP + +#include "common.hpp" + +void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_SET_ROWS_HPP From f8ff53669ff82bd2dbec7db93c0dcc73d5088d97 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Mon, 7 Jul 2025 17:53:02 +0530 Subject: [PATCH 2/6] Revert max_threads to 256 --- ggml/src/ggml-sycl/set_rows.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index e6d667d93ab3d..7e4ca5c3d6673 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -68,10 +68,10 @@ static void set_rows_sycl( const size_t src_type_size, const size_t dst_type_size, queue_ptr stream) { - const int max_threads_per_row = 128; // KEEPING 128 for now + const int max_threads_per_row = 256; // KEEPING 256 for now const int threads_per_row = std::min((int)ne00, max_threads_per_row); - const int max_threads_per_block = 128; + const int max_threads_per_block = 256; const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row); const sycl::range<3> block_size(1, rows_per_block, threads_per_row); From 70365462aa6bc1c4ae2a417759b59e96ffbea08f Mon Sep 17 00:00:00 2001 From: Akarshan Date: Tue, 8 Jul 2025 17:21:07 +0530 Subject: [PATCH 3/6] Refactor set_rows and address review comments --- ggml/src/ggml-sycl/set_rows.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 7e4ca5c3d6673..6d09f883c1716 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -36,25 +36,16 @@ static void k_set_rows( const int i11 = i02 % ne11; const int i10 = i01; - const int64_t dst_row = *(const int64_t *)((const char *)src1 + i10*nb10 + i11*nb11 + i12*nb12); + const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12})); - const char * src0_row = src0 + i01*nb01 + i02*nb02 + i03*nb03; + + const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03}); char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3; - // Optimize for same-type operations: use collective memory copy - if (src_type_size == dst_type_size) { - // All threads in the work-group cooperatively copy the row - const size_t row_bytes = ne00 * src_type_size; - // Each thread copies a chunk of the row - for (size_t byte_idx = item_ct1.get_local_id(0); byte_idx < row_bytes; byte_idx += item_ct1.get_local_range(0)) { - dst_row_ptr[byte_idx] = src0_row[byte_idx]; - } - } else { - // Type conversion required, use element-wise approach - for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) { - const char * src_elem = src0_row + col * src_type_size; - char * dst_elem = dst_row_ptr + col * dst_type_size; - set_rows_1(src_elem, dst_elem); - } + + for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) { + const char * src_elem = src0_row + col * src_type_size; + char * dst_elem = dst_row_ptr + col * dst_type_size; + set_rows_1(src_elem, dst_elem); } } @@ -68,10 +59,10 @@ static void set_rows_sycl( const size_t src_type_size, const size_t dst_type_size, queue_ptr stream) { - const int max_threads_per_row = 256; // KEEPING 256 for now + constexpr int max_threads_per_row = 64; // KEEPING 64 for now const int threads_per_row = std::min((int)ne00, max_threads_per_row); - const int max_threads_per_block = 256; + constexpr int max_threads_per_block = 64; const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row); const sycl::range<3> block_size(1, rows_per_block, threads_per_row); From 74a5fc8f116e999f6094c22595a750407ff2123a Mon Sep 17 00:00:00 2001 From: Akarshan Date: Tue, 8 Jul 2025 18:20:07 +0530 Subject: [PATCH 4/6] Deduplicate conversion function --- ggml/src/ggml-sycl/set_rows.cpp | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 6d09f883c1716..d14503d7aaac4 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -1,20 +1,13 @@ #include "set_rows.hpp" -typedef void (*set_rows_kernel_t)(const char * src, char * dst); - -static void set_rows_1_f32_f32(const char * src, char * dst) { - const float * src_f = (const float *) src; - float * dst_f = (float *) dst; - *dst_f = *src_f; -} - -static void set_rows_1_f32_f16(const char * src, char * dst) { - const float * src_f = (const float *) src; - sycl::half * dst_h = (sycl::half *) dst; - *dst_h = sycl::vec(*src_f).convert()[0]; +template +static inline void convert(const char* src, char* dst) { + auto src_val = *reinterpret_cast(src); + auto dst_val = sycl::vec(src_val).template convert()[0]; + *reinterpret_cast(dst) = dst_val; } -template +template static void k_set_rows( const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12, @@ -38,18 +31,17 @@ static void k_set_rows( const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12})); - const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03}); char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3; for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) { const char * src_elem = src0_row + col * src_type_size; char * dst_elem = dst_row_ptr + col * dst_type_size; - set_rows_1(src_elem, dst_elem); + convert(src_elem, dst_elem); } } -template +template static void set_rows_sycl( const char * src0_d, const int64_t * src1_d, char * dst_d, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, @@ -73,7 +65,7 @@ static void set_rows_sycl( stream, sycl::nd_range<3>(grid_size * block_size, block_size), [=](sycl::nd_item<3> item_ct1) { - k_set_rows( + k_set_rows( src0_d, src1_d, dst_d, ne00, ne01, ne11, ne12, nb01, nb02, nb03, @@ -103,7 +95,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { dpct::queue_ptr stream = ctx.stream(); switch (dst->type) { case GGML_TYPE_F32: - set_rows_sycl( + set_rows_sycl( (const char *)dst->src[0]->data, src1_dd, (char *)dst->data, ne00, ne01, ne02, ne03, ne11, ne12, @@ -116,7 +108,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { break; case GGML_TYPE_F16: dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - set_rows_sycl( + set_rows_sycl( (const char *)dst->src[0]->data, src1_dd, (char *)dst->data, ne00, ne01, ne02, ne03, ne11, ne12, From bab2b3bcab647046fa14b97fda0fe01852f7b41b Mon Sep 17 00:00:00 2001 From: Akarshan Date: Wed, 9 Jul 2025 10:23:37 +0530 Subject: [PATCH 5/6] Remove guard before kernel launch and refactor --- ggml/src/ggml-sycl/set_rows.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index d14503d7aaac4..ffc5b20e31361 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -60,7 +60,6 @@ static void set_rows_sycl( const sycl::range<3> block_size(1, rows_per_block, threads_per_row); const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block); - if (ne01 > 0 && ne00 > 0) { sycl_parallel_for( stream, sycl::nd_range<3>(grid_size * block_size, block_size), @@ -76,7 +75,6 @@ static void set_rows_sycl( ); } ); - } } @@ -96,7 +94,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { switch (dst->type) { case GGML_TYPE_F32: set_rows_sycl( - (const char *)dst->src[0]->data, src1_dd, (char *)dst->data, + (const char *)src0->data, src1_dd, (char *)dst->data, ne00, ne01, ne02, ne03, ne11, ne12, nb01, nb02, nb03, @@ -109,7 +107,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { case GGML_TYPE_F16: dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); set_rows_sycl( - (const char *)dst->src[0]->data, src1_dd, (char *)dst->data, + (const char *)src0->data, src1_dd, (char *)dst->data, ne00, ne01, ne02, ne03, ne11, ne12, nb01, nb02, nb03, From 1ed8c7c499bb5d9588f33a4bd46b959a03bf42e5 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Wed, 9 Jul 2025 13:17:52 +0530 Subject: [PATCH 6/6] Fix and add back SFINAE --- ggml/src/ggml-sycl/set_rows.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index ffc5b20e31361..4a76a63d3545d 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -1,10 +1,17 @@ #include "set_rows.hpp" +namespace utils { +template +static constexpr bool is_arithmetic_v() { + return std::is_arithmetic_v || std::is_same_v || std::is_same_v; +} +} template -static inline void convert(const char* src, char* dst) { +static inline std::enable_if_t() && utils::is_arithmetic_v(), void> +convert (const char* src, char* dst) { auto src_val = *reinterpret_cast(src); - auto dst_val = sycl::vec(src_val).template convert()[0]; - *reinterpret_cast(dst) = dst_val; + auto dst_val = sycl::vec(src_val).template convert()[0]; + *reinterpret_cast(dst) = dst_val;; } template