Skip to content

Commit 5cfde90

Browse files
Support optimizing the graph only once during graph computation; record the optimization status in tensor->extra; make CI pass
1 parent 63e5285 commit 5cfde90

File tree

6 files changed

+122
-91
lines changed

6 files changed

+122
-91
lines changed

ggml/src/ggml-sycl/common.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
9999
<< ", line:" << __LINE__ << std::endl;
100100
std::exit(1);
101101
}
102+
103+
104+
// Release all per-device resources held by a tensor's GPU extra:
// destroys any recorded events for every device/stream slot, frees the
// per-device USM allocations (only when a queue is available for that
// device), then deletes the extra itself.
//
// extra   - the per-tensor GPU bookkeeping struct to destroy; may be null,
//           in which case this is a no-op.
// streams - optional per-device queues used to free data_device[i]; the
//           header declares a default of {} so callers that only need the
//           events destroyed can omit it. data_device[i] is freed only
//           when index i is within streams' bounds (guards against a
//           caller passing fewer queues than device_count).
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
    if (extra == nullptr) {
        return;
    }
    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
            if (extra->events[i][is] != nullptr) {
                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
            }
        }
        // Free device memory only if a queue for this device was supplied;
        // comparing against streams.size() (not just non-empty) prevents an
        // out-of-bounds streams[i] access when fewer queues than devices
        // are passed in.
        if (extra->data_device[i] != nullptr && i < static_cast<int>(streams.size())) {
            ggml_sycl_set_device(i);
            SYCL_CHECK(
                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
        }
    }
    delete extra;
}

ggml/src/ggml-sycl/common.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
void* ggml_sycl_host_malloc(size_t size);
3838
void ggml_sycl_host_free(void* ptr);
3939

40+
4041
static int g_ggml_sycl_debug = 0;
4142
#define GGML_SYCL_DEBUG(...) \
4243
do { \
@@ -268,8 +269,11 @@ struct ggml_tensor_extra_gpu {
268269
// tensors
269270
dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
270271
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
272+
optimize_feature optimized_feature;
271273
};
272274

275+
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
276+
273277
inline optimize_feature check_gpu_optimize_feature(int hw_family) {
274278
optimize_feature opt;
275279
opt.reorder = ( hw_family==SYCL_HW_FAMILY_INTEL_PVC ||
@@ -283,6 +287,7 @@ struct ggml_backend_sycl_context {
283287
int device;
284288
std::string name;
285289
optimize_feature opt_feature;
290+
bool optimized_graph=false;
286291

287292
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
288293

ggml/src/ggml-sycl/convert.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,6 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
125125
}
126126
}
127127

128-
129128
template <typename dst_t>
130129
static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
131130
dpct::queue_ptr stream) {
@@ -472,10 +471,11 @@ static void convert_unary_sycl(const void *__restrict__ vx,
472471
}
473472
}
474473

475-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context & ctx) {
474+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
476475
switch (type) {
477476
case GGML_TYPE_Q4_0:
478-
if (ctx.opt_feature.reorder) {
477+
if (dst->src[0]->extra &&
478+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
479479
return dequantize_row_q4_0_sycl_reorder;
480480
} else {
481481
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
@@ -523,10 +523,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context &
523523
}
524524
}
525525

526-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_backend_sycl_context & ctx) {
526+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
527527
switch (type) {
528528
case GGML_TYPE_Q4_0:
529-
if (ctx.opt_feature.reorder) {
529+
if (dst->src[0]->extra &&
530+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
530531
return dequantize_row_q4_0_sycl_reorder;
531532
} else {
532533
return dequantize_row_q4_0_sycl;

ggml/src/ggml-sycl/convert.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
2121
typedef to_t_sycl_t<float> to_fp32_sycl_t;
2222
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
2323

24-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_backend_sycl_context & ctx);
25-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_backend_sycl_context & ctx);
24+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
25+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
2626

2727
#endif // GGML_SYCL_CONVERT_HPP

ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include "dequantize.hpp"
44
#include "presets.hpp"
55

6-
76
static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
87
const sycl::half *x = (const sycl::half *)vx;
98

@@ -91,7 +90,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
9190
}
9291
}
9392

94-
template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder>
93+
template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder>
9594
static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
9695
const sycl::nd_item<3> &item_ct1) {
9796
// qk = quantized weights per x block
@@ -134,7 +133,7 @@ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const d
134133
// dequantize
135134
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
136135
dfloat2 v;
137-
dequantize_kernel_recorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
136+
dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
138137

139138
// matrix multiplication
140139
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
@@ -165,7 +164,7 @@ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const d
165164
// dequantize
166165
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
167166
dfloat2 v;
168-
dequantize_kernel_recorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
167+
dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
169168

170169
// matrix multiplication
171170
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
@@ -865,7 +864,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
865864
}
866865
}
867866

868-
869867
static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
870868
float *dst, const int ncols,
871869
const int nrows,
@@ -1082,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10821080

10831081
const int64_t ne00 = src0->ne[0];
10841082
const int64_t row_diff = row_high - row_low;
1085-
10861083
GGML_ASSERT(src1->type == GGML_TYPE_F32);
10871084
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
10881085
#ifdef GGML_SYCL_F16
@@ -1096,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10961093

10971094
if (src1_convert_f16) {
10981095
src1_dfloat = src1_dfloat_a.alloc(ne00);
1099-
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, ctx);
1096+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
11001097
GGML_ASSERT(to_fp16_sycl != nullptr);
11011098
to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
11021099
}
@@ -1106,7 +1103,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
11061103

11071104
switch (src0->type) {
11081105
case GGML_TYPE_Q4_0:
1109-
if (ctx.opt_feature.reorder) {
1106+
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
1107+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
11101108
dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
11111109
} else {
11121110
dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);

0 commit comments

Comments
 (0)