move SET op to standalone file, GPU-only implementation

Gitty Burstein · Gitty Burstein · commit f5a0dcb566eb · 2025-09-18T22:23:06.000+03:00
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
@@ -2,7 +2,7 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml.h"
 #include "element_wise.hpp"
-#include <cstring>
+
 #define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
     for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))
 
@@ -926,132 +926,6 @@ static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor
             ggml_sycl_detail::pad_sycl(src, dst_ptr, ne00, ne01, ne02, ne0, ne1, ne2, stream);
         });
 }
-static inline void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    GGML_ASSERT(dst->src[1] != nullptr);
-    const ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == dst->type);
-    GGML_ASSERT(src1->type == dst->type);
-#if defined(GGML_SYCL_F16)
-    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_I32);
-#else
-    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32);
-#endif
-    const size_t ts = ggml_type_size(dst->type);
-    dpct::queue_ptr q = ctx.stream();
-    {
-        const bool same_type = (src0->type == dst->type);
-        const bool src_cont  = ggml_is_contiguous(src0);
-        const bool dst_cont  = ggml_is_contiguous(dst);
-
-        const void *p_src0 = src0->data;
-        void       *p_dst  = dst->data;
-
-        auto pt_src0 = sycl::get_pointer_type((const char*)p_src0, q->get_context());
-        auto pt_dst  = sycl::get_pointer_type((char*)p_dst,       q->get_context());
-
-        if (same_type && src_cont && dst_cont && ggml_nelements(src0) == ggml_nelements(dst)) {
-            const size_t bytes = ggml_nbytes(dst);
-            if (pt_src0 != sycl::usm::alloc::unknown && pt_dst != sycl::usm::alloc::unknown) {
-                SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(p_dst, p_src0, bytes)));
-            } else {
-                std::memcpy(p_dst, p_src0, bytes);
-            }
-        } else {
-            const int64_t ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3];
-            const size_t  db0 = dst->nb[0], db1 = dst->nb[1], db2 = dst->nb[2], db3 = dst->nb[3];
-            const size_t  sb0 = src0->nb[0], sb1 = src0->nb[1], sb2 = src0->nb[2], sb3 = src0->nb[3];
-
-            const size_t N  = (size_t) ggml_nelements(dst);
-            const size_t WG = 256;
-            const size_t NG = ((N + WG - 1) / WG) * WG;
-
-            const size_t ge0 = (size_t) ne0;
-            const size_t ge1 = ge0 * (size_t) ne1;
-            const size_t ge2 = ge1 * (size_t) ne2;
-
-            q->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)),
-                [=](sycl::nd_item<1> it) {
-                    size_t idx = it.get_global_linear_id();
-                    if (idx >= N) return;
-
-                    size_t i3 = idx / ge2;  size_t r2 = idx % ge2;
-                    size_t i2 = r2 / ge1;   size_t r1 = r2 % ge1;
-                    size_t i1 = r1 / ge0;   size_t i0 = r1 % ge0;
-
-                    const char * s = (const char*)p_src0 + (i0*sb0 + i1*sb1 + i2*sb2 + i3*sb3);
-                    char       * d = (char*)p_dst   + (i0*db0 + i1*db1 + i2*db2 + i3*db3);
-
-                    for (size_t b = 0; b < ts; ++b) d[b] = s[b];
-                }
-            );
-        }
-    }
-
-    {
-        const int32_t *p = (const int32_t *) dst->op_params;
-        const size_t nb1    = (size_t) p[0];
-        const size_t nb2    = (size_t) p[1];
-        const size_t nb3    = (size_t) p[2];
-        const size_t offset = (size_t) p[3];
-
-        const void *p_src1 = src1->data;
-        void       *p_dst  = dst->data;
-
-        const size_t sb0 = src1->nb[0], sb1 = src1->nb[1], sb2 = src1->nb[2], sb3 = src1->nb[3];
-        const size_t db0 = dst->nb[0];
-        const int64_t ne0 = src1->ne[0], ne1 = src1->ne[1], ne2 = src1->ne[2], ne3 = src1->ne[3];
-        if (ggml_is_contiguous(src1) && db0 == ts) {
-            const size_t row_bytes = (size_t) ne0 * ts;
-            const char *s_base = (const char*) p_src1;
-            char       *d_base = (char*) p_dst + offset;
-
-            for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                for (int64_t i2 = 0; i2 < ne2; ++i2) {
-                    for (int64_t i1 = 0; i1 < ne1; ++i1) {
-                        const char *s_row = s_base + i1*sb1 + i2*sb2 + i3*sb3;
-                        char       *d_row = d_base + i1*nb1 + i2*nb2 + i3*nb3;
-
-                        auto pt_s = sycl::get_pointer_type(s_row, q->get_context());
-                        auto pt_d = sycl::get_pointer_type(d_row, q->get_context());
-                        if (pt_s != sycl::usm::alloc::unknown && pt_d != sycl::usm::alloc::unknown) {
-                            SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(d_row, s_row, row_bytes)));
-                        } else {
-                            std::memcpy(d_row, s_row, row_bytes);
-                        }
-                    }
-                }
-            }
-        } else {
-
-            const size_t N  = (size_t) (ne0 * ne1 * ne2 * ne3);
-            const size_t WG = 256;
-            const size_t NG = ((N + WG - 1) / WG) * WG;
-            const size_t ge0 = (size_t) ne0;
-            const size_t ge1 = ge0 * (size_t) ne1;
-            const size_t ge2 = ge1 * (size_t) ne2;
-
-            q->parallel_for(
-                sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)),
-                [=](sycl::nd_item<1> it) {
-                    size_t idx = it.get_global_linear_id();
-                    if (idx >= N) return;
-
-                    size_t i3 = idx / ge2;  size_t r2 = idx % ge2;
-                    size_t i2 = r2 / ge1;   size_t r1 = r2 % ge1;
-                    size_t i1 = r1 / ge0;   size_t i0 = r1 % ge0;
-
-                    const char * s = (const char*) p_src1 + (i0*sb0 + i1*sb1 + i2*sb2 + i3*sb3);
-                    char       * d = (char*) p_dst + offset + (i0*db0 + i1*nb1 + i2*nb2 + i3*nb3);
-
-                    for (size_t b = 0; b < ts; ++b) d[b] = s[b];
-                }
-            );
-        }
-    }
-}
 
 static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     float min_val;
@@ -1250,11 +1124,6 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_pad(ctx, dst);
 }
 
-void ggml_sycl_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
-    ggml_sycl_op_set(ctx, dst);
-}
-
 void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_clamp(ctx, dst);
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
@@ -83,6 +83,4 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
-void ggml_sycl_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
 #endif // GGML_SYCL_ELEMENTWISE_HPP
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -42,6 +42,7 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 #include "ggml-sycl/set_rows.hpp"
+#include "ggml-sycl/set.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
 #include "ggml-sycl/getrows.hpp"
 #include "ggml-sycl/quantize.hpp"
@@ -3565,7 +3566,7 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
             ggml_sycl_get_rows(ctx, dst);
             break;
         case GGML_OP_SET:
-            ggml_sycl_set(ctx, dst);
+            ggml_sycl_op_set(ctx, dst);
             break;
         case GGML_OP_SET_ROWS:
             ggml_sycl_op_set_rows(ctx, dst);
@@ -4170,34 +4171,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
 
 static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     switch (op->op) {
-        case GGML_OP_SET: {
-#if defined(GGML_SYCL_F16)
-    const bool types_ok =
-        (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_I32) &&
-        (op->src[0]->type == op->type) &&
-        (op->src[1] && op->src[1]->type == op->type);
-#else
-    const bool types_ok =
-        (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) &&
-        (op->src[0]->type == op->type) &&
-        (op->src[1] && op->src[1]->type == op->type);
-#endif
-
-    const bool contiguous_ok =
-        ggml_is_contiguous(op->src[0]) &&
-        (!op->src[1] || ggml_is_contiguous(op->src[1]));
-
-    return types_ok && contiguous_ok;
-}
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                return false;
-            }
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_NEG:
@@ -4288,6 +4261,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                         return false;
                 }
             }
+            case GGML_OP_SET:
+                 #if defined(GGML_SYCL_F16)
+                 return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_I32) &&
+                        (op->src[0] && op->src[1]) &&
+                        (op->src[0]->type == op->type) &&
+                        (op->src[1]->type == op->type);
+                 #else
+                 return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) &&
+                        (op->src[0] && op->src[1]) &&
+                        (op->src[0]->type == op->type) &&
+                        (op->src[1]->type == op->type);
+                  #endif
         case GGML_OP_SET_ROWS:
             {
                 return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp
@@ -31,6 +31,7 @@
 #define SYCL_SQRT_BLOCK_SIZE 256
 #define SYCL_SIN_BLOCK_SIZE 256
 #define SYCL_SQR_BLOCK_SIZE 256
+#define SYCL_SET_BLOCK_SIZE 256
 #define SYCL_CPY_BLOCK_SIZE 32
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
diff --git a/ggml/src/ggml-sycl/set.cpp b/ggml/src/ggml-sycl/set.cpp
@@ -0,0 +1,155 @@
+// ggml/src/ggml-sycl/set.cpp
+//
+// SYCL backend for GGML SET operator.
+//
+// Semantics:
+//   1) dst <- src0
+//   2) copy a sub-block from src1 into dst at byte `offset`,
+//      using destination byte-strides (nb1, nb2, nb3) for dims 1..3.
+//
+// Notes:
+//   - (nb1, nb2, nb3, offset) are BYTES (CPU-compatible).
+//   - Uses two fast paths (bulk memcpy; row-wise memcpy) and a generic 4D kernel.
+//   - Work-group size is configured in presets (SYCL_SET_BLOCK_SIZE).
+//
+//   Implementation style aligned with other SYCL operators:
+//   - No host std::memcpy fallback; no USM detection.
+//   - Copies use queue->memcpy; generic case uses a parallel_for kernel.
+
+#include "presets.hpp"   // SYCL_* tuning (incl. SYCL_SET_BLOCK_SIZE)
+#include "common.hpp"
+#include "ggml.h"
+#include "set.hpp"
+
+#include <cstdint>
+#include <cstring>
+
+// ---------------- helpers (file-local) ----------------
+
+// Byte-accurate 4D copy with independent src/dst byte strides.
+// One work-item copies exactly one element (ts bytes).
+static inline void launch_copy_4d_bytes(
+    dpct::queue_ptr q,
+    const void *p_src, void *p_dst,
+    const int64_t ne[4],
+    const size_t  sb[4],
+    const size_t  db[4],
+    const size_t  ts
+) {
+    const size_t N  = (size_t)(ne[0] * ne[1] * ne[2] * ne[3]);
+    if (N == 0) return;
+
+    const size_t WG = (size_t)SYCL_SET_BLOCK_SIZE;
+    const size_t NG = ((N + WG - 1) / WG) * WG;
+
+    const size_t ge0 = (size_t) ne[0];
+    const size_t ge1 = ge0 * (size_t) ne[1];
+    const size_t ge2 = ge1 * (size_t) ne[2];
+
+    q->parallel_for(
+        sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)),
+        [=](sycl::nd_item<1> it) {
+            size_t idx = it.get_global_linear_id();
+            if (idx >= N) return;
+
+            // 4D indexing
+            size_t i3 = idx / ge2;  size_t r2 = idx % ge2;
+            size_t i2 = r2 / ge1;   size_t r1 = r2 % ge1;
+            size_t i1 = r1 / ge0;   size_t i0 = r1 % ge0;
+
+            const char *s = (const char *)p_src + (i0*sb[0] + i1*sb[1] + i2*sb[2] + i3*sb[3]);
+            char       *d = (char       *)p_dst + (i0*db[0] + i1*db[1] + i2*db[2] + i3*db[3]);
+
+            #pragma unroll
+            for (size_t b = 0; b < ts; ++b) {
+                d[b] = s[b];
+            }
+        }
+    );
+}
+
+// --------------------------- operator ---------------------------
+
+void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst != nullptr);
+    const ggml_tensor * src0 = dst->src[0];
+    GGML_ASSERT(dst->src[1] != nullptr);
+    const ggml_tensor * src1 = dst->src[1];
+    GGML_ASSERT(src0 && src1);
+
+    // Type constraints (CPU-compatible)
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src1->type == dst->type);
+#if defined(GGML_SYCL_F16)
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_I32);
+#else
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32);
+#endif
+
+    dpct::queue_ptr q  = ctx.stream();
+    const size_t    ts = ggml_type_size(dst->type);
+
+    // Stage 1: dst <- src0
+    {
+        const bool same_type = (src0->type == dst->type);
+        const bool src_cont  = ggml_is_contiguous(src0);
+        const bool dst_cont  = ggml_is_contiguous(dst);
+
+        const void *p_src0 = src0->data;
+        void       *p_dst  = dst->data;
+
+        if (same_type && src_cont && dst_cont &&
+            ggml_nelements(src0) == ggml_nelements(dst)) {
+            SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(p_dst, p_src0, ggml_nbytes(dst))));
+        } else {
+            // generic 4D copy
+            const int64_t ne[4] = { dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3] };
+            const size_t  sb[4] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
+            const size_t  db[4] = { dst ->nb[0], dst ->nb[1], dst ->nb[2], dst ->nb[3] };
+            launch_copy_4d_bytes(q, p_src0, p_dst, ne, sb, db, ts);
+        }
+    }
+
+    // Stage 2: paste src1 sub-block into dst
+    {
+        // op_params: [ nb1, nb2, nb3, offset ] (BYTES)
+        const int32_t *p = (const int32_t *) dst->op_params;
+        const size_t nb1    = (size_t) p[0];
+        const size_t nb2    = (size_t) p[1];
+        const size_t nb3    = (size_t) p[2];
+        const size_t offset = (size_t) p[3];
+
+        const void *p_src1 = src1->data;
+        void       *p_base = (char *) dst->data + offset;
+
+        const bool src1_cont = ggml_is_contiguous(src1);
+        const bool dst_tight = (dst->nb[0] == ts); // tightly-packed rows
+
+        if (src1_cont && dst_tight) {
+            // Row-wise device memcpy of src1 into dst at the given offset
+            const char *s_base     = (const char *) p_src1;
+            char       *d_base     = (char       *) p_base;
+            const size_t row_bytes = (size_t) src1->ne[0] * ts;
+
+            const size_t sb1 = src1->nb[1];
+            const size_t sb2 = src1->nb[2];
+            const size_t sb3 = src1->nb[3];
+
+            for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
+                for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
+                    for (int64_t i1 = 0; i1 < src1->ne[1]; ++i1) {
+                        const char *s_row = s_base + i1*sb1 + i2*sb2 + i3*sb3;
+                        char       *d_row = d_base + i1*nb1 + i2*nb2 + i3*nb3;
+                        SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(d_row, s_row, row_bytes)));
+                    }
+                }
+            }
+        } else {
+            // Generic 4D copy from src1 into (offsetted) dst base
+            const int64_t ne[4] = { src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3] };
+            const size_t  sb[4] = { src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3] };
+            const size_t  db[4] = { dst->nb[0],  nb1,         nb2,         nb3         };
+            launch_copy_4d_bytes(q, p_src1, p_base, ne, sb, db, ts);
+        }
+    }
+}
diff --git a/ggml/src/ggml-sycl/set.hpp b/ggml/src/ggml-sycl/set.hpp