86 changes: 72 additions & 14 deletions ggml/src/ggml-sycl/common.hpp
@@ -197,6 +197,7 @@ struct sycl_device_info {
int cc; // compute capability
// int nsm; // number of streaming multiprocessors
// size_t smpb; // max. shared memory per block
size_t smpbo; // max. shared memory per block (with opt-in)
bool vmm; // virtual memory support
size_t total_vram;
// sycl_hw_info hw_info; // device id and arch, currently not used
@@ -416,13 +417,6 @@ static __dpct_inline__ float warp_reduce_sum(float x,
const sycl::nd_item<3>& item_ct1) {
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
/*
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
kernel that calls this function may be less than "32". The function
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
CPU device. Modify the size of the work-group to ensure that the value
of the right-most dimension is a multiple of "32".
*/
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
}
return x;
@@ -440,17 +434,67 @@ warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ int warp_reduce_sum(int x) {
return sycl::reduce_over_group(
sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>());
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int offset = width / 2; offset > 0; offset >>= 1) {
x += dpct::permute_sub_group_by_xor(
sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width);
}
return x;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) {
#pragma unroll
for (int offset = width / 2; offset > 0; offset >>= 1) {
a.x() += dpct::permute_sub_group_by_xor(
sycl::ext::oneapi::this_work_item::get_sub_group(), a.x(), offset,
width);
a.y() += dpct::permute_sub_group_by_xor(
sycl::ext::oneapi::this_work_item::get_sub_group(), a.y(), offset,
width);
}
return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) {
#pragma unroll
for (int offset = width / 2; offset > 0; offset >>= 1) {
a = a + dpct::permute_sub_group_by_xor(
sycl::ext::oneapi::this_work_item::get_sub_group(), a, offset,
width);
}
return a;
}

static constexpr int ggml_sycl_get_physical_warp_size() {
// TODO: needs to be changed for the old iGPU + dGPU case.
return WARP_SIZE;
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_max(float x) {
#pragma unroll
for (int offset = width / 2; offset > 0; offset >>= 1) {
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
sycl::ext::oneapi::this_work_item::get_sub_group(), x,
offset, width));
}
return x;
}
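
The width-templated helpers above all reduce across a sub-group. The float, sycl::float2, and sycl::half2 overloads use an XOR butterfly: each lane exchanges its value with a partner at distance `offset`, halving `offset` each step, so after log2(width) exchanges every lane holds the full sum (warp_reduce_max follows the same pattern with sycl::fmax). The int overload instead delegates to sycl::reduce_over_group over the whole sub-group; note that its `width` parameter is currently unused. Below is a minimal standalone sketch of the same butterfly pattern, assuming a SYCL 2020 compiler (e.g. icpx) and a device that supports 32-wide sub-groups; the kernel and names are illustrative, not from this PR:

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    sycl::queue q;
    float result = 0.0f;
    {
        sycl::buffer<float, 1> out(&result, 1);
        q.submit([&](sycl::handler & h) {
            sycl::accessor acc(out, h, sycl::write_only);
            h.parallel_for(
                sycl::nd_range<1>(sycl::range<1>(32), sycl::range<1>(32)),
                [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(32)]] {
                    auto sg = it.get_sub_group();
                    float x = 1.0f; // each of the 32 lanes contributes 1.0
                    // XOR butterfly: after 5 exchanges every lane holds the sum.
                    for (int offset = 16; offset > 0; offset >>= 1) {
                        x += sycl::permute_group_by_xor(sg, x, offset);
                    }
                    if (it.get_local_linear_id() == 0) {
                        acc[0] = x;
                    }
                });
        });
    } // buffer destructor copies the result back into `result`
    std::cout << "sum = " << result << '\n'; // expected: 32
    return 0;
}
```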

static __dpct_inline__ float warp_reduce_max(float x,
const sycl::nd_item<3>& item_ct1) {
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
/*
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
kernel that calls this function may be less than "32". The function
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
CPU device. Modify the size of the work-group to ensure that the value
of the right-most dimension is a multiple of "32".
*/
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
item_ct1.get_sub_group(), x, mask));
}
@@ -558,4 +602,18 @@ struct scope_op_debug_print {
std::string_view func_suffix;
};

static __dpct_inline__ float get_alibi_slope(const float max_bias,
const uint32_t h,
const uint32_t n_head_log2,
const float m0,
const float m1) {
if (max_bias <= 0.0f) {
return 1.0f;
}
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

return dpct::pow(base, exph);
}

#endif // GGML_SYCL_COMMON_HPP
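
For reference, get_alibi_slope returns the per-head ALiBi slope: base^exponent with base = m0 and exponent = h + 1 for the first n_head_log2 heads, and base = m1 with exponent = 2(h − n_head_log2) + 1 for the rest. The following is a host-side sketch that mirrors the device function; the m0/m1 definitions below follow the usual ggml soft-max setup and are an assumption here, not part of this diff:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side mirror of get_alibi_slope for a quick sanity check.
static float get_alibi_slope_ref(float max_bias, uint32_t h,
                                 uint32_t n_head_log2, float m0, float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2 * (h - n_head_log2) + 1;
    return std::pow(base, (float) exph);
}

int main() {
    const float    max_bias    = 8.0f; // typical ALiBi setting
    const uint32_t n_head      = 8;
    const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
    // Assumed to match the ggml soft-max setup:
    const float m0 = std::pow(2.0f, -max_bias / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    for (uint32_t h = 0; h < n_head; ++h) {
        std::printf("head %u: slope %g\n", (unsigned) h,
                    get_alibi_slope_ref(max_bias, h, n_head_log2, m0, m1));
    }
    // With n_head = 8 (a power of two) this prints the geometric sequence
    // 2^-1, 2^-2, ..., 2^-8 from the ALiBi paper.
    return 0;
}
```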
20 changes: 20 additions & 0 deletions ggml/src/ggml-sycl/dpct/helper.hpp
@@ -277,6 +277,26 @@ namespace dpct

} // namespace detail

// Copied from the DPCT header files.
/// dim3 is used to store 3 component dimensions.
class dim3 {
public:
unsigned x, y, z;

constexpr dim3(unsigned x = 1, unsigned y = 1, unsigned z = 1)
: x(x), y(y), z(z) {}

dim3(const sycl::id<3> &r) : dim3(r[2], r[1], r[0]) {}

operator sycl::range<3>() const { return sycl::range<3>(z, y, x); }
}; // class dim3

inline dim3 operator*(const dim3 &a, const dim3 &b) {
return dim3{a.x * b.x, a.y * b.y, a.z * b.z};
}
// End of copy from the DPCT header files.
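
The copied dim3 keeps CUDA-style (x, y, z) component order, while sycl::range<3> stores the slowest-varying dimension first; the conversion operator therefore reverses the components. A small hypothetical test (assumes the dpct::dim3 definition above is in scope, e.g. by including this header):

```cpp
#include <sycl/sycl.hpp>
#include <cassert>

int main() {
    dpct::dim3 grid(8, 4, 2);        // CUDA-style: x = 8, y = 4, z = 2
    sycl::range<3> r = grid;         // conversion reverses the component order
    assert(r[0] == 2 && r[1] == 4 && r[2] == 8);

    dpct::dim3 block(16, 2, 1);
    dpct::dim3 total = grid * block; // component-wise product
    assert(total.x == 128 && total.y == 8 && total.z == 2);
    return 0;
}
```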


/// Pitched 2D/3D memory data.
class pitched_data
{
25 changes: 13 additions & 12 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -87,6 +87,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
100 * prop.get_major_version() + 10 * prop.get_minor_version();
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
info.devices[i].smpbo = prop.get_local_mem_size();
}

for (int id = 0; id < info.device_count; ++id) {
@@ -3741,6 +3742,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_SOFT_MAX:
ggml_sycl_op_soft_max(ctx, dst);
break;
case GGML_OP_SOFT_MAX_BACK:
ggml_sycl_op_soft_max_back(ctx, dst);
break;
case GGML_OP_ROPE:
ggml_sycl_rope(ctx, dst);
break;
@@ -3778,6 +3782,7 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
return true;
} catch (sycl::exception & e) {
std::cerr << e.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
std::cerr << "Error OP "<<ggml_op_name(dst->op)<< std::endl;
std::exit(1);
}

@@ -4386,19 +4391,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_SOFT_MAX:
// TODO: support batching
if (op->src[0]->ne[3] != 1) {
return false;
}
// TODO: support attention sinks [TAG_ATTN_SINKS]
if (op->src[2]) {
return false;
}
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
case GGML_OP_DIAG_MASK_INF:
return true;
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_SOFT_MAX_BACK: {
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
return max_bias == 0.0f;
}
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
return true;
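
The SOFT_MAX_BACK guard above reads max_bias from the second float packed into op_params and only claims support when it is zero, i.e. when no ALiBi bias would have to be handled in the backward pass. A hypothetical standalone mirror of that check; the { scale, max_bias } layout matches what ggml_soft_max_ext packs for the forward op and is assumed to carry over here:

```cpp
#include <cstdint>
#include <cstring>

// Mirrors the guard: op_params is a blob of int32 words; float parameters are
// memcpy'd in and out to avoid aliasing issues (assumed layout: scale at
// index 0, max_bias at index 1).
static bool soft_max_back_supported(const int32_t * op_params) {
    float max_bias = 0.0f;
    std::memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
    return max_bias == 0.0f;
}

int main() {
    int32_t params[2] = {0, 0};
    const float scale = 1.0f, max_bias = 0.0f;
    std::memcpy(params + 0, &scale,    sizeof(float));
    std::memcpy(params + 1, &max_bias, sizeof(float));
    return soft_max_back_supported(params) ? 0 : 1; // exits 0: supported
}
```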