
Commit 420e3b3

NeoZhangJianyu and arthw committed
[SYCL] Optimize mul_mat for Q4_0 on Intel GPU (ggml-org#12035)
* opt performance by reorder for Intel GPU
* detect hw type and save opt feature, and print opt feature
* correct name
* support optimize graph once when compute graph, record the opt status in tensor->extra, make CI passed
* add env variable GGML_SYCL_DISABLE_OPT for debug
* use syclex::architecture replace the custom hw define, update the guide for GGML_SYCL_DISABLE_OPT
* add performance data
* mv getrows functions to separeted files
* fix global variables

---------

Co-authored-by: arthw <[email protected]>
1 parent 89b48a8 commit 420e3b3
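The first bullet of the commit message is the key change: Q4_0 weights are reordered so the SYCL kernels read them with friendlier memory access on Intel GPUs. The sketch below is only an illustration of that idea, not the layout used by this commit (the real reorder lives in backend sources outside this excerpt): interleaved block_q4_0 blocks are split into one contiguous run of packed 4-bit values followed by one contiguous run of scales.

    // Illustrative sketch only -- not the exact layout from this commit. It shows the
    // general idea of "reordering" interleaved Q4_0 blocks (one fp16 scale + 16 packed
    // nibbles each) into all quantized bytes first, then all scales, so neighbouring
    // work-items in a sub-group read neighbouring bytes.
    #include <cstdint>
    #include <cstring>

    constexpr int QK4_0 = 32;            // values per Q4_0 block (ggml constant)

    struct block_q4_0_ref {              // stand-in for ggml's interleaved block_q4_0
        uint16_t d;                      // fp16 scale, kept as raw bits here
        uint8_t  qs[QK4_0 / 2];          // 32 x 4-bit values packed into 16 bytes
    };

    // Hypothetical reorder: the qs regions of all blocks first, then all scales.
    void reorder_q4_0(const block_q4_0_ref * src, uint8_t * dst, size_t nblocks) {
        uint8_t  * qs_out = dst;
        uint16_t * d_out  = reinterpret_cast<uint16_t *>(dst + nblocks * (QK4_0 / 2));
        for (size_t b = 0; b < nblocks; ++b) {
            std::memcpy(qs_out + b * (QK4_0 / 2), src[b].qs, QK4_0 / 2);
            d_out[b] = src[b].d;
        }
    }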

File tree

16 files changed: +835, -315 lines


docs/backend/SYCL.md

Lines changed: 14 additions & 2 deletions
@@ -43,6 +43,16 @@ For CI and performance test summary, please refer to [llama.cpp CI for SYCL Back
 
 ## News
 
+- 2025.2
+  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
+    |GPU|Base tokens/s|Increased tokens/s|Percent|
+    |-|-|-|-|
+    |PVC 1550|39|73|+87%|
+    |Flex 170|39|50|+28%|
+    |Arc770|42|55|+30%|
+    |MTL|13|16|+23%|
+    |ARL-H|14|17|+21%|
+
 - 2024.11
   - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
 
@@ -101,8 +111,8 @@ SYCL backend supports Intel GPU Family:
 | Intel Data Center Max Series | Support | Max 1550, 1100 |
 | Intel Data Center Flex Series | Support | Flex 170 |
 | Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
-| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
-| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake |
+| Intel iGPU | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
 
 *Notes:*
 
@@ -697,6 +707,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name | Value | Function |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | GGML_SYCL_VISIBLE_DEVICES|id1,id2,...|It's like `CUDA_VISIBLE_DEVICES`, define the SYCL device ID list to visible. Like "0", "0,2", "2,1" |
 | ONEAPI_DEVICE_SELECTOR|Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector)|be used to limit the choice of devices available when the SYCL-using application is run|
@@ -725,6 +736,7 @@ The parameters about device choose of llama.cpp works with SYCL backend rule to
 |Multiple Device|`--split-mode=layer`|Default|
 
 
+
 ## Known Issues
 
 - `Split-mode:[row]` is not supported.
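For reference, the Percent column of the news table above follows directly from the Base and Increased token rates; truncating to whole percent reproduces the published numbers:

    // Recomputes the Percent column of the table above from its Base/Increased tokens/s.
    // Truncating to whole percent reproduces the published values (+87, +28, +30, +23, +21).
    #include <cstdio>

    int main() {
        struct { const char * gpu; double base; double opt; } rows[] = {
            { "PVC 1550", 39, 73 }, { "Flex 170", 39, 50 }, { "Arc770", 42, 55 },
            { "MTL", 13, 16 },      { "ARL-H", 14, 17 },
        };
        for (const auto & r : rows) {
            int pct = static_cast<int>((r.opt - r.base) / r.base * 100.0);  // truncation
            std::printf("%-8s +%d%%\n", r.gpu, pct);
        }
        return 0;
    }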

examples/sycl/run-llama2.sh

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh
 
 #export GGML_SYCL_DEBUG=1

ggml/src/ggml-sycl/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
+
 if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
 endif()

ggml/src/ggml-sycl/common.cpp

Lines changed: 71 additions & 29 deletions
@@ -91,7 +91,7 @@ void print_device_detail_part1(int id, sycl::device &device, std::string device_
            name.c_str(), global_mem_size);
 }
 
-void print_device_detail_part2(int id, sycl::device &device, std::string device_type) {
+void print_device_detail_part2(int id, sycl::device &device) {
 
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
@@ -103,6 +103,30 @@ void print_device_detail_part2(int id, sycl::device &device, std::string device_
            device.get_info<sycl::info::device::driver_version>().c_str());
 }
 
+void print_device_opt_feature(ggml_sycl_device_info &info) {
+    GGML_LOG_INFO("SYCL Optimization Feature:\n");
+    GGML_LOG_INFO(
+        "|ID| Device Type|Reorder|\n");
+    GGML_LOG_INFO(
+        "|--|-------------------|-------|\n");
+    std::map<std::string, size_t> DeviceNums;
+    int device_count = info.device_count;
+
+    for (int id = 0; id < device_count; ++id) {
+        printf("zjy id=%d\n", id);
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id = DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" << backend_type << ":" << std::to_string(type_id)
+                    << "]";
+        std::string device_type_s = device_type.str();
+        device_type_s = std::regex_replace(device_type_s, std::regex("ext_oneapi_"), "");
+        GGML_LOG_INFO("|%2d|%19s|%7s|\n", id, device_type_s.c_str(),
+                      info.infos[id].opt_feature.reorder ? "Y": "N");
+    }
+}
+
 void ggml_backend_sycl_print_sycl_devices() {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
     int device_count = dpct::dev_mgr::instance().device_count();
@@ -113,7 +137,6 @@ void ggml_backend_sycl_print_sycl_devices() {
     fprintf(stderr, "|--|-------------------|-----|---------------------------------------|---------------|\n");
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
         std::string backend_type = get_device_backend_and_type(device);
         int type_id=DeviceNums[backend_type]++;
         std::stringstream device_type;
@@ -127,64 +150,66 @@ void ggml_backend_sycl_print_sycl_devices() {
     fprintf(stderr, "|--|-----------------|--------------|------------|----------------------------------|\n");
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
         std::string backend_type = get_device_backend_and_type(device);
         int type_id=DeviceNums2[backend_type]++;
         std::stringstream device_type;
         device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
-        print_device_detail_part2(id, device, device_type.str());
+        print_device_detail_part2(id, device);
     }
 }
 
 static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id) try {
     static bool initialized = false;
-
+    static ggml_sycl_device_info info(main_gpu_id);
     if (!initialized) {
-        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
-
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__,
-                g_ggml_sycl_debug);
-
-#if defined(GGML_SYCL_F16)
-        fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
+        GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
+        GGML_LOG_INFO("Running with Environment Variables:\n");
+        GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
+        GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+        GGML_LOG_INFO("Build with Macros:\n");
+#if defined(GGML_SYCL_FORCE_MMQ)
+        GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
 #else
-        fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
+        GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: no\n");
 #endif
-
-#if defined(GGML_SYCL_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
+#if defined(GGML_SYCL_F16)
+        GGML_LOG_INFO("  GGML_SYCL_F16: yes\n");
 #else
-        fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
+        GGML_LOG_INFO("  GGML_SYCL_F16: no\n");
 #endif
 
 #if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+        GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
 #else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+        GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                 dpct::dev_mgr::instance().device_count()) !=
             0) {
            initialized = true;
-            return;
+            GGML_LOG_INFO("  g_all_sycl_device_count is wrong:%d\n",
+                          g_all_sycl_device_count);
+            return info;
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        ggml_backend_sycl_print_sycl_devices();
-        initialized = true;
-    }
 
-    static ggml_sycl_device_info info(main_gpu_id);
+        if (info.device_count == 0) {
+            GGML_LOG_INFO("%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
+                          __func__);
+            return info;
+        }
+        GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
 
-    if (info.device_count == 0) {
-        fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
-                __func__);
-        return info;
+        ggml_backend_sycl_print_sycl_devices();
+        print_device_opt_feature(info);
+        initialized = true;
     }
-    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
 
     return info;
+
 } catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
               << ", line:" << __LINE__ << std::endl;
@@ -245,3 +270,20 @@ catch (sycl::exception const &exc) {
               << ", line:" << __LINE__ << std::endl;
     std::exit(1);
 }
+
+
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            if (extra->events[i][is] != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
+            }
+        }
+        if (extra->data_device[i] != nullptr && streams.size()>0) {
+            ggml_sycl_set_device(i);
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+        }
+    }
+    delete extra;
+}
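One small technique worth noting in print_device_opt_feature() above is the std::map<std::string, size_t> counter, which numbers devices separately per backend type. A standalone illustration follows; the backend strings here are made-up examples, the real ones come from get_device_backend_and_type():

    // Standalone illustration of the per-backend numbering used in print_device_opt_feature():
    // operator[] value-initializes missing keys to 0, so each backend string gets its own
    // 0,1,2,... sequence and devices print as [level_zero:gpu:0], [level_zero:gpu:1], ...
    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, size_t> DeviceNums;
        const char * backends[] = { "level_zero:gpu", "level_zero:gpu", "opencl:cpu" };  // example strings
        for (const char * b : backends) {
            size_t type_id = DeviceNums[b]++;
            std::printf("[%s:%zu]\n", b, type_id);
        }
        return 0;
    }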

ggml/src/ggml-sycl/common.hpp

Lines changed: 12 additions & 4 deletions
@@ -38,7 +38,10 @@
 
 void ggml_sycl_host_free(void* ptr);
 
+
 extern int g_ggml_sycl_debug;
+extern int g_ggml_sycl_disable_optimize;
+
 #define GGML_SYCL_DEBUG(...) \
     do { \
         if (g_ggml_sycl_debug) \
@@ -237,20 +240,26 @@ struct ggml_tensor_extra_gpu {
     // tensors
     dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                           [GGML_SYCL_MAX_STREAMS];  // events for synchronizing multiple GPUs
+    optimize_feature optimized_feature;
 };
 
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
+
 struct ggml_backend_sycl_context {
     int device;
     std::string name;
+    optimize_feature opt_feature;
+    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
     explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int id) :
         device(id),
         name(GGML_SYCL_NAME + std::to_string(device)) {
-        for (int i=0;i<GGML_SYCL_MAX_STREAMS; i++){
-            qptrs[id][i] = sycl_device_info.infos[id].qptrs[i];
-        }
+        for (int i=0;i<GGML_SYCL_MAX_STREAMS; i++){
+            qptrs[id][i] = sycl_device_info.infos[id].qptrs[i];
+        }
+        opt_feature = sycl_device_info.infos[id].opt_feature;
     }
 
     queue_ptr stream(int id, int stream) {
@@ -672,5 +681,4 @@ bool gpu_has_xmx(sycl::device &dev);
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op);
-
 #endif // GGML_SYCL_COMMON_HPP
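The new opt_feature and optimized_graph members back the "optimize graph once when compute graph" step from the commit message. The graph-walking code itself is in ggml-sycl.cpp, which is not part of this excerpt; the sketch below is only a hypothetical illustration of the gating these fields and GGML_SYCL_DISABLE_OPT enable:

    // Hypothetical sketch, not the code from ggml-sycl.cpp: gate the one-time reorder on
    // the detected device feature, the GGML_SYCL_DISABLE_OPT override, and a per-context flag.
    static void maybe_optimize_graph(ggml_backend_sycl_context & ctx /*, ggml_cgraph * graph */) {
        if (ctx.optimized_graph) {
            return;                                            // already handled for this context
        }
        ctx.optimized_graph = true;
        if (g_ggml_sycl_disable_optimize || !ctx.opt_feature.reorder) {
            return;                                            // user opt-out or unsupported device
        }
        // ... walk the graph, reorder Q4_0 weight tensors, and record the decision in each
        //     tensor's extra (ggml_tensor_extra_gpu::optimized_feature.reorder) ...
    }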

ggml/src/ggml-sycl/convert.cpp

Lines changed: 33 additions & 4 deletions
@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
+                                             dpct::queue_ptr stream) {
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    int constexpr WARP_K = WARP_SIZE * QK4_0;
+    const int n_warp = (k + WARP_K - 1) / WARP_K;
+    GGML_ASSERT(k % 2 == 0);
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
+                                               sycl::range<3>(1, 1, WARP_SIZE),
+                                           sycl::range<3>(1, 1, WARP_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+                             dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
+                         });
+
+}
+
 template <typename dst_t>
 static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
     }
 }
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
     }
 }
 
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_row_q4_0_sycl;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_sycl;
         case GGML_TYPE_Q5_0:
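The reorder dequantizer added above launches one sub-group of WARP_SIZE work-items per WARP_K = WARP_SIZE * QK4_0 quantized values. The standalone arithmetic below reproduces that launch sizing for a concrete k; WARP_SIZE = 32 is assumed here for illustration (the real value comes from the backend headers), and QK4_0 = 32 is the ggml Q4_0 block size:

    // Reproduces the launch-size computation used by dequantize_row_q4_0_sycl_reorder.
    #include <cstdio>

    int main() {
        constexpr int WARP_SIZE = 32;   // assumed sub-group size for this example
        constexpr int QK4_0     = 32;   // ggml Q4_0 block size
        constexpr int WARP_K    = WARP_SIZE * QK4_0;           // 1024 values per warp

        const long k = 4096;                                    // e.g. one row of a 4096-wide weight
        const long n_warp = (k + WARP_K - 1) / WARP_K;          // ceil-div: 4 warps
        const long global_items = n_warp * WARP_SIZE;           // 128 work-items total

        std::printf("k=%ld -> n_warp=%ld, global work-items=%ld, local=%d\n",
                    k, n_warp, global_items, WARP_SIZE);
        return 0;
    }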

ggml/src/ggml-sycl/convert.hpp

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
 typedef to_t_sycl_t<float> to_fp32_sycl_t;
 typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
 
 #endif // GGML_SYCL_CONVERT_HPP
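With the extra ggml_tensor * parameter, call sites hand the destination tensor to the getter so it can inspect dst->src[0]->extra and select the reorder-aware Q4_0 path. A hedged call-site sketch (src0, dst, dst_f16 and stream are placeholders, not code from this commit, and the converter's argument list is assumed unchanged):

    // Hedged call-site sketch for the new two-argument getters; every identifier except the
    // two getter names and the tensor fields is a placeholder.
    ggml_tensor * src0 = dst->src[0];
    to_fp16_sycl_t to_fp16 = ggml_get_to_fp16_sycl(src0->type, dst);   // may pick the reorder kernel for Q4_0
    if (to_fp16 != nullptr) {
        // Assumed converter argument order: (src data, dst fp16 buffer, element count, queue).
        to_fp16(src0->data, dst_f16, ggml_nelements(src0), stream);
    }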

0 commit comments
