From 857067a9c77315c2f12712813cecd714c0440abc Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Mon, 7 Jul 2025 14:01:02 +0530
Subject: [PATCH 1/6] SYCL: Initial set_rows kernel implementation

---
 ggml/src/ggml-sycl/backend.hpp   |   1 +
 ggml/src/ggml-sycl/ggml-sycl.cpp |   6 +-
 ggml/src/ggml-sycl/set_rows.cpp  | 143 +++++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/set_rows.hpp  |   8 ++
 4 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 ggml/src/ggml-sycl/set_rows.cpp
 create mode 100644 ggml/src/ggml-sycl/set_rows.hpp

diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index f78a36ddf8f66..f839a42bc90c9 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -30,6 +30,7 @@
 #include "outprod.hpp"
 #include "quants.hpp"
 #include "rope.hpp"
+#include "set_rows.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #include "wkv.hpp"
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 21c81e99a19aa..7244521c0dcc3 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -41,6 +41,7 @@
 #include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
+#include "ggml-sycl/set_rows.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
 #include "ggml-sycl/getrows.hpp"
 #include "ggml.h"
@@ -3603,6 +3604,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_GET_ROWS:
             ggml_sycl_get_rows(ctx, dst);
             break;
+        case GGML_OP_SET_ROWS:
+            ggml_sycl_op_set_rows(ctx, dst);
+            break;
         case GGML_OP_DUP:
             ggml_sycl_dup(ctx, dst);
             break;
@@ -4297,7 +4301,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             {
                 // TODO: add support
                 // ref: https://github.com/ggml-org/llama.cpp/pull/14274
-                return false;
+                return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
             } break;
         case GGML_OP_CPY:
             {
diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
new file mode 100644
index 0000000000000..e6d667d93ab3d
--- /dev/null
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -0,0 +1,143 @@
+#include "set_rows.hpp"
+
+typedef void (*set_rows_kernel_t)(const char * src, char * dst);
+
+static void set_rows_1_f32_f32(const char * src, char * dst) {
+    const float * src_f = (const float *) src;
+    float * dst_f = (float *) dst;
+    *dst_f = *src_f;
+}
+
+static void set_rows_1_f32_f16(const char * src, char * dst) {
+    const float * src_f = (const float *) src;
+    sycl::half * dst_h = (sycl::half *) dst;
+    *dst_h = sycl::vec<float, 1>(*src_f).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+}
+
+template<set_rows_kernel_t set_rows_1>
+static void k_set_rows(
+        const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        const size_t src_type_size, const size_t dst_type_size,
+        const sycl::nd_item<3> & item_ct1) {
+
+    const int i03 = item_ct1.get_group(0);
+    const int i02 = item_ct1.get_group(1);
+    const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);  // Row index
+
+    if (i01 >= ne01) {
+        return;
+    }
+
+    const int i12 = i03 % ne12;
+    const int i11 = i02 % ne11;
+    const int i10 = i01;
+
+    const int64_t dst_row = *(const int64_t *)((const char *)src1 + i10*nb10 + i11*nb11 + i12*nb12);
+
+    const char * src0_row = src0 + i01*nb01 + i02*nb02 + i03*nb03;
+    char * dst_row_ptr    = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
+    // Optimize for same-type operations: use collective memory copy
+    if (src_type_size == dst_type_size) {
+        // All threads in the work-group cooperatively copy the row
+        const size_t row_bytes = ne00 * src_type_size;
+        // Each thread copies a chunk of the row
+        for (size_t byte_idx = item_ct1.get_local_id(0); byte_idx < row_bytes; byte_idx += item_ct1.get_local_range(0)) {
+            dst_row_ptr[byte_idx] = src0_row[byte_idx];
+        }
+    } else {
+        // Type conversion required, use element-wise approach
+        for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
+            const char * src_elem = src0_row + col * src_type_size;
+            char * dst_elem       = dst_row_ptr + col * dst_type_size;
+            set_rows_1(src_elem, dst_elem);
+        }
+    }
+}
+
+template<set_rows_kernel_t set_rows_1>
+static void set_rows_sycl(
+        const char * src0_d, const int64_t * src1_d, char * dst_d,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        const size_t src_type_size, const size_t dst_type_size,
+        queue_ptr stream) {
+
+    const int max_threads_per_row = 128; // KEEPING 128 for now
+    const int threads_per_row     = std::min((int)ne00, max_threads_per_row);
+
+    const int max_threads_per_block = 128;
+    const int rows_per_block        = std::max(1, max_threads_per_block / threads_per_row);
+
+    const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
+    const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
+
+    if (ne01 > 0 && ne00 > 0) {
+        sycl_parallel_for(
+            stream,
+            sycl::nd_range<3>(grid_size * block_size, block_size),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_set_rows<set_rows_1>(
+                    src0_d, src1_d, dst_d,
+                    ne00, ne01, ne11, ne12,
+                    nb01, nb02, nb03,
+                    nb10, nb11, nb12,
+                    nb1, nb2, nb3,
+                    src_type_size, dst_type_size,
+                    item_ct1
+                );
+            }
+        );
+    }
+}
+
+
+void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t * src1_dd = static_cast<const int64_t *>(src1->data);
+
+    dpct::queue_ptr stream = ctx.stream();
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            set_rows_sycl<set_rows_1_f32_f32>(
+                (const char *)dst->src[0]->data, src1_dd, (char *)dst->data,
+                ne00, ne01, ne02, ne03,
+                ne11, ne12,
+                nb01, nb02, nb03,
+                nb10, nb11, nb12,
+                nb1, nb2, nb3,
+                sizeof(float), sizeof(float),
+                stream
+            );
+            break;
+        case GGML_TYPE_F16:
+            dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+            set_rows_sycl<set_rows_1_f32_f16>(
+                (const char *)dst->src[0]->data, src1_dd, (char *)dst->data,
+                ne00, ne01, ne02, ne03,
+                ne11, ne12,
+                nb01, nb02, nb03,
+                nb10, nb11, nb12,
+                nb1, nb2, nb3,
+                sizeof(float), sizeof(sycl::half),
+                stream
+        );
+            break;
+        default:
+            GGML_ABORT("Unsupported tensor type!");
+            break;
+    }
+}
diff --git a/ggml/src/ggml-sycl/set_rows.hpp b/ggml/src/ggml-sycl/set_rows.hpp
new file mode 100644
index 0000000000000..27fcc8f90175b
--- /dev/null
+++ b/ggml/src/ggml-sycl/set_rows.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_SET_ROWS_HPP
+#define GGML_SYCL_SET_ROWS_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_SET_ROWS_HPP

From f8ff53669ff82bd2dbec7db93c0dcc73d5088d97 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Mon, 7 Jul 2025 17:53:02 +0530
Subject: [PATCH 2/6] Revert max_threads to 256

---
 ggml/src/ggml-sycl/set_rows.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index e6d667d93ab3d..7e4ca5c3d6673 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -68,10 +68,10 @@ static void set_rows_sycl(
         const size_t src_type_size, const size_t dst_type_size,
         queue_ptr stream) {
 
-    const int max_threads_per_row = 128; // KEEPING 128 for now
+    const int max_threads_per_row = 256; // KEEPING 256 for now
     const int threads_per_row     = std::min((int)ne00, max_threads_per_row);
 
-    const int max_threads_per_block = 128;
+    const int max_threads_per_block = 256;
     const int rows_per_block        = std::max(1, max_threads_per_block / threads_per_row);
 
     const sycl::range<3> block_size(1, rows_per_block, threads_per_row);

From 70365462aa6bc1c4ae2a417759b59e96ffbea08f Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Tue, 8 Jul 2025 17:21:07 +0530
Subject: [PATCH 3/6] Refactor set_rows and address review comments

---
 ggml/src/ggml-sycl/set_rows.cpp | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index 7e4ca5c3d6673..6d09f883c1716 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -36,25 +36,16 @@ static void k_set_rows(
     const int i11 = i02 % ne11;
     const int i10 = i01;
 
-    const int64_t dst_row = *(const int64_t *)((const char *)src1 + i10*nb10 + i11*nb11 + i12*nb12);
+    const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
 
-    const char * src0_row = src0 + i01*nb01 + i02*nb02 + i03*nb03;
+
+    const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
     char * dst_row_ptr    = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
-    // Optimize for same-type operations: use collective memory copy
-    if (src_type_size == dst_type_size) {
-        // All threads in the work-group cooperatively copy the row
-        const size_t row_bytes = ne00 * src_type_size;
-        // Each thread copies a chunk of the row
-        for (size_t byte_idx = item_ct1.get_local_id(0); byte_idx < row_bytes; byte_idx += item_ct1.get_local_range(0)) {
-            dst_row_ptr[byte_idx] = src0_row[byte_idx];
-        }
-    } else {
-        // Type conversion required, use element-wise approach
-        for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
-            const char * src_elem = src0_row + col * src_type_size;
-            char * dst_elem       = dst_row_ptr + col * dst_type_size;
-            set_rows_1(src_elem, dst_elem);
-        }
+
+    for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
+        const char * src_elem = src0_row + col * src_type_size;
+        char * dst_elem       = dst_row_ptr + col * dst_type_size;
+        set_rows_1(src_elem, dst_elem);
     }
 }
 
@@ -68,10 +59,10 @@ static void set_rows_sycl(
         const size_t src_type_size, const size_t dst_type_size,
         queue_ptr stream) {
 
-    const int max_threads_per_row = 256; // KEEPING 256 for now
+    constexpr int max_threads_per_row = 64; // KEEPING 64 for now
     const int threads_per_row     = std::min((int)ne00, max_threads_per_row);
 
-    const int max_threads_per_block = 256;
+    constexpr int max_threads_per_block = 64;
     const int rows_per_block        = std::max(1, max_threads_per_block / threads_per_row);
 
     const sycl::range<3> block_size(1, rows_per_block, threads_per_row);

From 74a5fc8f116e999f6094c22595a750407ff2123a Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Tue, 8 Jul 2025 18:20:07 +0530
Subject: [PATCH 4/6] Deduplicate conversion function

---
 ggml/src/ggml-sycl/set_rows.cpp | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index 6d09f883c1716..d14503d7aaac4 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -1,20 +1,13 @@
 #include "set_rows.hpp"
 
-typedef void (*set_rows_kernel_t)(const char * src, char * dst);
-
-static void set_rows_1_f32_f32(const char * src, char * dst) {
-    const float * src_f = (const float *) src;
-    float * dst_f = (float *) dst;
-    *dst_f = *src_f;
-}
-
-static void set_rows_1_f32_f16(const char * src, char * dst) {
-    const float * src_f = (const float *) src;
-    sycl::half * dst_h = (sycl::half *) dst;
-    *dst_h = sycl::vec<float, 1>(*src_f).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+template<typename TIn, typename TOut>
+static inline void convert(const char* src, char* dst) {
+    auto src_val = *reinterpret_cast<const TIn*>(src);
+    auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut>()[0];
+    *reinterpret_cast<TOut*>(dst) = dst_val;
 }
 
-template<set_rows_kernel_t set_rows_1>
+template<typename TIn, typename TOut>
 static void k_set_rows(
         const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
         const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
@@ -38,18 +31,17 @@ static void k_set_rows(
 
     const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
 
-
     const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
     char * dst_row_ptr    = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
 
     for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
         const char * src_elem = src0_row + col * src_type_size;
         char * dst_elem       = dst_row_ptr + col * dst_type_size;
-        set_rows_1(src_elem, dst_elem);
+        convert<TIn, TOut>(src_elem, dst_elem);
     }
 }
 
-template<set_rows_kernel_t set_rows_1>
+template<typename TIn, typename TOut>
 static void set_rows_sycl(
         const char * src0_d, const int64_t * src1_d, char * dst_d,
         const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
@@ -73,7 +65,7 @@ static void set_rows_sycl(
             stream,
             sycl::nd_range<3>(grid_size * block_size, block_size),
             [=](sycl::nd_item<3> item_ct1) {
-                k_set_rows<set_rows_1>(
+                k_set_rows<TIn, TOut>(
                     src0_d, src1_d, dst_d,
                     ne00, ne01, ne11, ne12,
                     nb01, nb02, nb03,
@@ -103,7 +95,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     dpct::queue_ptr stream = ctx.stream();
     switch (dst->type) {
         case GGML_TYPE_F32:
-            set_rows_sycl<set_rows_1_f32_f32>(
+            set_rows_sycl<float, float>(
                 (const char *)dst->src[0]->data, src1_dd, (char *)dst->data,
                 ne00, ne01, ne02, ne03,
                 ne11, ne12,
@@ -116,7 +108,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
             break;
         case GGML_TYPE_F16:
             dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
-            set_rows_sycl<set_rows_1_f32_f16>(
+            set_rows_sycl<float, sycl::half>(
                 (const char *)dst->src[0]->data, src1_dd, (char *)dst->data,
                 ne00, ne01, ne02, ne03,
                 ne11, ne12,

From bab2b3bcab647046fa14b97fda0fe01852f7b41b Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Wed, 9 Jul 2025 10:23:37 +0530
Subject: [PATCH 5/6] Remove guard before kernel launch and refactor

---
 ggml/src/ggml-sycl/set_rows.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index d14503d7aaac4..ffc5b20e31361 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -60,7 +60,6 @@ static void set_rows_sycl(
     const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
     const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
 
-    if (ne01 > 0 && ne00 > 0) {
         sycl_parallel_for(
             stream,
             sycl::nd_range<3>(grid_size * block_size, block_size),
@@ -76,7 +75,6 @@ static void set_rows_sycl(
                 );
             }
         );
-    }
 }
 
 
@@ -96,7 +94,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     switch (dst->type) {
         case GGML_TYPE_F32:
             set_rows_sycl<float, float>(
-                (const char *)dst->src[0]->data, src1_dd, (char *)dst->data,
+                (const char *)src0->data, src1_dd, (char *)dst->data,
                 ne00, ne01, ne02, ne03,
                 ne11, ne12,
                 nb01, nb02, nb03,
@@ -109,7 +107,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
         case GGML_TYPE_F16:
             dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
             set_rows_sycl<float, sycl::half>(
-                (const char *)dst->src[0]->data, src1_dd, (char *)dst->data,
+                (const char *)src0->data, src1_dd, (char *)dst->data,
                 ne00, ne01, ne02, ne03,
                 ne11, ne12,
                 nb01, nb02, nb03,

From 1ed8c7c499bb5d9588f33a4bd46b959a03bf42e5 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Wed, 9 Jul 2025 13:17:52 +0530
Subject: [PATCH 6/6] Fix and add back SFINAE

---
 ggml/src/ggml-sycl/set_rows.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp
index ffc5b20e31361..4a76a63d3545d 100644
--- a/ggml/src/ggml-sycl/set_rows.cpp
+++ b/ggml/src/ggml-sycl/set_rows.cpp
@@ -1,10 +1,17 @@
 #include "set_rows.hpp"
 
+namespace utils {
+template<typename T>
+static constexpr bool is_arithmetic_v() {
+    return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
+}
+}
 template<typename TIn, typename TOut>
-static inline void convert(const char* src, char* dst) {
+static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
+convert (const char* src, char* dst) {
     auto src_val = *reinterpret_cast<const TIn*>(src);
-    auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut>()[0];
-    *reinterpret_cast<TOut*>(dst) = dst_val;
+    auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
+   *reinterpret_cast<TOut*>(dst) = dst_val;;
 }
 
 template<typename TIn, typename TOut>