Skip to content

Commit 5cfde90

Browse files
Support optimizing the graph only once during graph computation; record the optimization status in tensor->extra; make CI pass
1 parent 63e5285 commit 5cfde90

File tree

6 files changed

+122
-91
lines changed

6 files changed

+122
-91
lines changed

ggml/src/ggml-sycl/common.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
9999
<< ", line:" << __LINE__ << std::endl;
100100
std::exit(1);
101101
}
102+
103+
104+
// Release all per-device resources held by a tensor's GPU extra:
// destroys any recorded events for every device/stream slot, frees the
// per-device USM allocations (only when a queue is available for that
// device), then deletes the extra itself.
//
// extra   - the per-tensor GPU bookkeeping struct to destroy; may be null,
//           in which case this is a no-op.
// streams - optional per-device queues used to free data_device[i]; the
//           header declares a default of {} so callers that only need the
//           events destroyed can omit it. data_device[i] is freed only
//           when index i is within streams' bounds (guards against a
//           caller passing fewer queues than device_count).
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
    if (extra == nullptr) {
        return;
    }
    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
            if (extra->events[i][is] != nullptr) {
                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
            }
        }
        // Free device memory only if a queue for this device was supplied;
        // comparing against streams.size() (not just non-empty) prevents an
        // out-of-bounds streams[i] access when fewer queues than devices
        // are passed in.
        if (extra->data_device[i] != nullptr && i < static_cast<int>(streams.size())) {
            ggml_sycl_set_device(i);
            SYCL_CHECK(
                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
        }
    }
    delete extra;
}

ggml/src/ggml-sycl/common.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
void* ggml_sycl_host_malloc(size_t size);
3838
void ggml_sycl_host_free(void* ptr);
3939

40+
4041
static int g_ggml_sycl_debug = 0;
4142
#define GGML_SYCL_DEBUG(...) \
4243
do { \
@@ -268,8 +269,11 @@ struct ggml_tensor_extra_gpu {
268269
// tensors
269270
dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
270271
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
272+
optimize_feature optimized_feature;
271273
};
272274

275+
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
276+
273277
inline optimize_feature check_gpu_optimize_feature(int hw_family) {
274278
optimize_feature opt;
275279
opt.reorder = ( hw_family==SYCL_HW_FAMILY_INTEL_PVC ||
@@ -283,6 +287,7 @@ struct ggml_backend_sycl_context {
283287
int device;
284288
std::string name;
285289
optimize_feature opt_feature;
290+
bool optimized_graph=false;
286291

287292
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
288293

ggml/src/ggml-sycl/convert.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,6 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
125125
}
126126
}
127127

128-
129128
template <typename dst_t>
130129
static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
131130
dpct::queue_ptr stream) {
@@ -472,10 +471,11 @@ static void convert_unary_sycl(const void *__restrict__ vx,
472471
}
473472
}
474473

475-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context & ctx) {
474+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
476475
switch (type) {
477476
case GGML_TYPE_Q4_0:
478-
if (ctx.opt_feature.reorder) {
477+
if (dst->src[0]->extra &&
478+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
479479
return dequantize_row_q4_0_sycl_reorder;
480480
} else {
481481
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
@@ -523,10 +523,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context &
523523
}
524524
}
525525

526-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_backend_sycl_context & ctx) {
526+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
527527
switch (type) {
528528
case GGML_TYPE_Q4_0:
529-
if (ctx.opt_feature.reorder) {
529+
if (dst->src[0]->extra &&
530+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
530531
return dequantize_row_q4_0_sycl_reorder;
531532
} else {
532533
return dequantize_row_q4_0_sycl;

ggml/src/ggml-sycl/convert.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
2121
typedef to_t_sycl_t<float> to_fp32_sycl_t;
2222
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
2323

24-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context & ctx);
25-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_backend_sycl_context & ctx);
24+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
25+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
2626

2727
#endif // GGML_SYCL_CONVERT_HPP

ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include "dequantize.hpp"
44
#include "presets.hpp"
55

6-
76
static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
87
const sycl::half *x = (const sycl::half *)vx;
98

@@ -91,7 +90,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
9190
}
9291
}
9392

94-
template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder>
93+
template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder>
9594
static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
9695
const sycl::nd_item<3> &item_ct1) {
9796
// qk = quantized weights per x block
@@ -134,7 +133,7 @@ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const d
134133
// dequantize
135134
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
136135
dfloat2 v;
137-
dequantize_kernel_recorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
136+
dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
138137

139138
// matrix multiplication
140139
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
@@ -165,7 +164,7 @@ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const d
165164
// dequantize
166165
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
167166
dfloat2 v;
168-
dequantize_kernel_recorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
167+
dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
169168

170169
// matrix multiplication
171170
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
@@ -865,7 +864,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
865864
}
866865
}
867866

868-
869867
static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
870868
float *dst, const int ncols,
871869
const int nrows,
@@ -1082,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10821080

10831081
const int64_t ne00 = src0->ne[0];
10841082
const int64_t row_diff = row_high - row_low;
1085-
10861083
GGML_ASSERT(src1->type == GGML_TYPE_F32);
10871084
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
10881085
#ifdef GGML_SYCL_F16
@@ -1096,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10961093

10971094
if (src1_convert_f16) {
10981095
src1_dfloat = src1_dfloat_a.alloc(ne00);
1099-
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, ctx);
1096+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
11001097
GGML_ASSERT(to_fp16_sycl != nullptr);
11011098
to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
11021099
}
@@ -1106,7 +1103,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
11061103

11071104
switch (src0->type) {
11081105
case GGML_TYPE_Q4_0:
1109-
if (ctx.opt_feature.reorder) {
1106+
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
1107+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
11101108
dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
11111109
} else {
11121110
dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);

0 commit comments

Comments
 (0)