
Commit 5ed3357 (1 parent: ff4bad9)

opencl: add copy_to_contiguous and utilize mm kernels


ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 137 additions & 15 deletions
@@ -233,6 +233,33 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }

+// cl buffer
+struct ggml_cl_buffer {
+    cl_mem buffer;
+    size_t size;
+
+    ggml_cl_buffer()
+        : buffer(NULL), size(0) {}
+
+    ~ggml_cl_buffer() {
+        if (buffer) {
+            CL_CHECK(clReleaseMemObject(buffer));
+        }
+    }
+
+    void allocate(cl_context context, size_t new_size) {
+        if (new_size > size) {
+            size = new_size;
+            if (buffer) {
+                CL_CHECK(clReleaseMemObject(buffer));
+            }
+            cl_int err;
+            buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+            CL_CHECK(err);
+        }
+    }
+};
+
 // Profiling
 struct ProfilingInfo {
     std::string op_name;
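Note on the allocation pattern above: ggml_cl_buffer reallocates only when the requested size exceeds the current capacity and never shrinks, so the scratch buffers added to the backend context grow to the largest size seen and are reused across graph evaluations; the destructor releases the cl_mem via clReleaseMemObject. Below is a minimal host-memory sketch of the same grow-only pattern, with malloc/free standing in for clCreateBuffer/clReleaseMemObject (the type and names are illustrative, not part of the commit):

// Host-memory sketch of the grow-only allocation pattern used by ggml_cl_buffer.
// malloc/free stand in for clCreateBuffer/clReleaseMemObject.
#include <cstdio>
#include <cstdlib>

struct grow_only_buffer {
    void * data = nullptr;
    size_t size = 0;

    ~grow_only_buffer() { free(data); }

    // Reallocate only when the request exceeds the current capacity, so
    // repeated calls with smaller sizes reuse the existing allocation.
    void allocate(size_t new_size) {
        if (new_size > size) {
            free(data);
            data = malloc(new_size);
            size = new_size;
        }
    }
};

int main() {
    grow_only_buffer buf;
    buf.allocate(1024);   // allocates 1024 bytes
    buf.allocate(512);    // no-op: existing allocation is large enough
    buf.allocate(4096);   // frees the old block and allocates 4096 bytes
    printf("capacity: %zu bytes\n", buf.size);
    return 0;
}

The trade-off of the grow-only policy is that one oversized matmul keeps that much device memory reserved for the lifetime of the backend context, in exchange for avoiding a fresh allocation on every call.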
@@ -346,6 +373,9 @@ struct ggml_backend_opencl_context {
     cl_context context;
     cl_command_queue queue;

+    ggml_cl_buffer prealloc_src0;
+    ggml_cl_buffer prealloc_src1;
+
     cl_program program_add;
     cl_program program_add_id;
     cl_program program_clamp;
@@ -4240,6 +4270,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg
            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }

+// Copy a noncontiguous tensor to contiguous tensor. ne[] remains the same but
+// nb[] is recalculated such that tensor is contiguous.
+static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
+                                       cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    const int tensor_type_size = ggml_type_size(src->type);
+
+    const int ne00 = src->ne[0];
+    const int ne01 = src->ne[1];
+    const int ne02 = src->ne[2];
+    const int ne03 = src->ne[3];
+
+    const cl_ulong nb00 = src->nb[0];
+    const cl_ulong nb01 = src->nb[1];
+    const cl_ulong nb02 = src->nb[2];
+    const cl_ulong nb03 = src->nb[3];
+
+    const int ne0 = src->ne[0];
+    const int ne1 = src->ne[1];
+    const int ne2 = src->ne[2];
+    const int ne3 = src->ne[3];
+
+    nb0 = tensor_type_size;
+    nb1 = tensor_type_size*ne00;
+    nb2 = tensor_type_size*ne00*ne01;
+    nb3 = tensor_type_size*ne00*ne01*ne02;
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
+
+    cl_ulong offset0 = extra->offset + src->view_offs;
+    cl_ulong offsetd = 0;
+
+    cl_kernel kernel;
+
+    switch (src->type) {
+        case GGML_TYPE_F32:
+            kernel = backend_ctx->kernel_cpy_f32_f32;
+            break;
+        case GGML_TYPE_F16:
+            kernel = backend_ctx->kernel_cpy_f16_f16;
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
+
+    const int nth = MIN(64, ne00);
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
+}
+
 static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     UNUSED(backend);
     UNUSED(src0);
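The helper above launches the existing cpy_f32_f32 / cpy_f16_f16 kernels to pack the source into dst and reports the packed strides back through the nb0..nb3 references; the launch uses nth = MIN(64, ne00) work-items per row and a {ne01*nth, ne02, ne03} global range. Below is a standalone sketch of just the stride recomputation, with plain uint64_t in place of cl_ulong (the helper name and example shape are illustrative):

// Sketch of the stride recomputation done by ggml_cl_copy_to_contiguous:
// ne[] is kept and nb[] is rebuilt so the packed copy is contiguous.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static void contiguous_strides(const int64_t ne[4], size_t type_size, uint64_t nb[4]) {
    nb[0] = type_size;                          // adjacent elements along dim 0
    nb[1] = type_size * ne[0];                  // one row
    nb[2] = type_size * ne[0] * ne[1];          // one matrix
    nb[3] = type_size * ne[0] * ne[1] * ne[2];  // one batch
}

int main() {
    // Example: an F32 tensor with ne = {64, 8, 4, 2}.
    const int64_t ne[4] = {64, 8, 4, 2};
    uint64_t nb[4];
    contiguous_strides(ne, sizeof(float), nb);
    printf("nb = {%llu, %llu, %llu, %llu}\n",
           (unsigned long long)nb[0], (unsigned long long)nb[1],
           (unsigned long long)nb[2], (unsigned long long)nb[3]);
    // Expected: {4, 256, 2048, 8192}
    return 0;
}

With these strides, element (i0, i1, i2, i3) of the packed copy lives at byte offset i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3, which is the layout the local-memory mul_mm kernels expect.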
@@ -6585,20 +6690,20 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     const int ne02 = src0 ? src0->ne[2] : 0;
     const int ne03 = src0 ? src0->ne[3] : 0;

-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    cl_ulong nb03 = src0 ? src0->nb[3] : 0;

     const int ne10 = src1 ? src1->ne[0] : 0;
     const int ne11 = src1 ? src1->ne[1] : 0;
     const int ne12 = src1 ? src1->ne[2] : 0;
     const int ne13 = src1 ? src1->ne[3] : 0;

-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+    cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    cl_ulong nb13 = src1 ? src1->nb[3] : 0;

     const int ne0 = dst ? dst->ne[0] : 0;
     const int ne1 = dst ? dst->ne[1] : 0;
@@ -6916,11 +7021,28 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

     // GEMM using local memory
     // Current BK = 16, so ne00 % 16 == 0
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1t == GGML_TYPE_F32 &&
+    if (src1t == GGML_TYPE_F32 &&
         ne00 % 16 == 0 &&
         ne11 > 1) {
+        cl_mem mem_src0 = extra0->data_device;
+        cl_mem mem_src1 = extra1->data_device;
+
+        if (!ggml_is_contiguous(src0)) {
+            backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+            ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                nb00, nb01, nb02, nb03);
+            mem_src0 = backend_ctx->prealloc_src0.buffer;
+            offset0 = 0;
+        }
+
+        if (!ggml_is_contiguous(src1)) {
+            backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+            ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                nb10, nb11, nb12, nb13);
+            mem_src1 = backend_ctx->prealloc_src1.buffer;
+            offset1 = 0;
+        }
+
         switch(src0t) {
             case GGML_TYPE_F32: {
                 kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
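This is also why the nb00..nb13 strides lose their const qualifier in the earlier hunk: ggml_cl_copy_to_contiguous overwrites them with the packed strides, and the corresponding offsets are reset to 0 because the packed data starts at the beginning of the scratch buffer. Below is a host-side sketch of that staging decision, with plain pointers and std::vector standing in for cl_mem and the preallocated ggml_cl_buffer (all names are illustrative, not part of the commit):

// Sketch of the staging decision: a contiguous source is handed to the mul_mm
// kernel as-is, a noncontiguous one is packed into a reusable scratch buffer
// and its offset reset to 0.
#include <cstdio>
#include <vector>

struct staged_input {
    const float * mem;   // stands in for the cl_mem passed to the kernel
    size_t        offset;
};

// Pack `src` into `scratch` when it is not contiguous; otherwise use it as-is.
static staged_input stage(const float * src, size_t offset, bool contiguous,
                          std::vector<float> & scratch, size_t nelems) {
    if (contiguous) {
        return { src, offset };
    }
    scratch.resize(nelems);  // grow-only in the real code (prealloc_src0/src1)
    // The real implementation launches a cpy kernel here; this sketch only
    // models that the data now starts at offset 0 of the scratch buffer.
    return { scratch.data(), 0 };
}

int main() {
    std::vector<float> src(256, 1.0f), scratch;
    staged_input a = stage(src.data(), 64, /*contiguous=*/true,  scratch, 256);
    staged_input b = stage(src.data(), 64, /*contiguous=*/false, scratch, 256);
    printf("contiguous:    offset=%zu (original buffer)\n", a.offset);
    printf("noncontiguous: offset=%zu (scratch buffer)\n", b.offset);
    return 0;
}

In the actual commit the copy is performed on the device by the cpy kernels, so no host transfer is involved; only the cl_mem handle and offset fed to the mul_mm kernels change.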
@@ -6930,9 +7052,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;

-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
                 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
                 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
@@ -6965,9 +7087,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;

-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
                 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
                 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
