@@ -233,6 +233,33 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }
 
+// cl buffer
+struct ggml_cl_buffer {
+    cl_mem buffer;
+    size_t size;
+
+    ggml_cl_buffer()
+        : buffer(NULL), size(0) {}
+
+    ~ggml_cl_buffer() {
+        if (buffer) {
+            CL_CHECK(clReleaseMemObject(buffer));
+        }
+    }
+
+    void allocate(cl_context context, size_t new_size) {
+        if (new_size > size) {
+            size = new_size;
+            if (buffer) {
+                CL_CHECK(clReleaseMemObject(buffer));
+            }
+            cl_int err;
+            buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+            CL_CHECK(err);
+        }
+    }
+};
+
 // Profiling
 struct ProfilingInfo {
     std::string op_name;
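The new ggml_cl_buffer wrapper grows its device buffer lazily and never shrinks it: a request no larger than the current size reuses the existing cl_mem, while a larger request releases the old buffer and creates a bigger one, so the size converges on the largest request seen. A minimal usage sketch of that behavior, assuming a valid cl_context named context (hypothetical here) and the file's CL_CHECK macro:

    ggml_cl_buffer scratch;
    scratch.allocate(context, 1024); // first call: creates a 1 KiB buffer
    scratch.allocate(context, 512);  // smaller request: existing buffer is reused, no reallocation
    scratch.allocate(context, 4096); // larger request: old buffer released, 4 KiB buffer created
    // the destructor releases the cl_mem when scratch goes out of scope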
@@ -346,6 +373,9 @@ struct ggml_backend_opencl_context {
     cl_context context;
     cl_command_queue queue;
 
+    ggml_cl_buffer prealloc_src0;
+    ggml_cl_buffer prealloc_src1;
+
     cl_program program_add;
     cl_program program_add_id;
     cl_program program_clamp;
@@ -4240,6 +4270,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg
            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
+// Copy a noncontiguous tensor to a contiguous tensor. ne[] remains the same but
+// nb[] is recalculated such that the tensor is contiguous.
+static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
+                                       cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    const int tensor_type_size = ggml_type_size(src->type);
+
+    const int ne00 = src->ne[0];
+    const int ne01 = src->ne[1];
+    const int ne02 = src->ne[2];
+    const int ne03 = src->ne[3];
+
+    const cl_ulong nb00 = src->nb[0];
+    const cl_ulong nb01 = src->nb[1];
+    const cl_ulong nb02 = src->nb[2];
+    const cl_ulong nb03 = src->nb[3];
+
+    const int ne0 = src->ne[0];
+    const int ne1 = src->ne[1];
+    const int ne2 = src->ne[2];
+    const int ne3 = src->ne[3];
+
+    nb0 = tensor_type_size;
+    nb1 = tensor_type_size*ne00;
+    nb2 = tensor_type_size*ne00*ne01;
+    nb3 = tensor_type_size*ne00*ne01*ne02;
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
+
+    cl_ulong offset0 = extra->offset + src->view_offs;
+    cl_ulong offsetd = 0;
+
+    cl_kernel kernel;
+
+    switch (src->type) {
+        case GGML_TYPE_F32:
+            kernel = backend_ctx->kernel_cpy_f32_f32;
+            break;
+        case GGML_TYPE_F16:
+            kernel = backend_ctx->kernel_cpy_f16_f16;
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &dst));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne1));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne2));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne3));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
+
+    const int nth = MIN(64, ne00);
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
+}
+
 static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     UNUSED(backend);
     UNUSED(src0);
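The stride recomputation above follows the standard dense layout: each byte stride is the previous stride multiplied by the previous extent, starting from the element size. A self-contained sketch of that arithmetic (the helper name and signature are illustrative, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // Dense (contiguous) byte strides for a 4-D tensor with extents ne[4] and
    // element size ts: nb0 = ts, nb1 = ts*ne0, nb2 = ts*ne0*ne1, nb3 = ts*ne0*ne1*ne2.
    static void dense_strides(const int64_t ne[4], size_t ts, uint64_t nb[4]) {
        nb[0] = ts;
        for (int i = 1; i < 4; ++i) {
            nb[i] = nb[i - 1] * (uint64_t) ne[i - 1];
        }
    }

The copy kernel is then launched with one work-group per row of the innermost dimension (ne01*ne02*ne03 groups in total) and MIN(64, ne00) work-items per group.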
@@ -6585,20 +6690,20 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     const int ne02 = src0 ? src0->ne[2] : 0;
     const int ne03 = src0 ? src0->ne[3] : 0;
 
-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     const int ne10 = src1 ? src1->ne[0] : 0;
     const int ne11 = src1 ? src1->ne[1] : 0;
     const int ne12 = src1 ? src1->ne[2] : 0;
     const int ne13 = src1 ? src1->ne[3] : 0;
 
-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+    cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    cl_ulong nb13 = src1 ? src1->nb[3] : 0;
 
     const int ne0 = dst ? dst->ne[0] : 0;
     const int ne1 = dst ? dst->ne[1] : 0;
@@ -6916,11 +7021,28 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     // GEMM using local memory
     // Current BK = 16, so ne00 % 16 == 0
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1t == GGML_TYPE_F32 &&
+    if (src1t == GGML_TYPE_F32 &&
         ne00 % 16 == 0 &&
         ne11 > 1) {
+        cl_mem mem_src0 = extra0->data_device;
+        cl_mem mem_src1 = extra1->data_device;
+
+        if (!ggml_is_contiguous(src0)) {
+            backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+            ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                nb00, nb01, nb02, nb03);
+            mem_src0 = backend_ctx->prealloc_src0.buffer;
+            offset0 = 0;
+        }
+
+        if (!ggml_is_contiguous(src1)) {
+            backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+            ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                nb10, nb11, nb12, nb13);
+            mem_src1 = backend_ctx->prealloc_src1.buffer;
+            offset1 = 0;
+        }
+
         switch (src0t) {
             case GGML_TYPE_F32: {
                 kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
@@ -6930,9 +7052,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &mem_src0));
                 CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &mem_src1));
                 CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
                 CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
@@ -6965,9 +7087,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &mem_src0));
                 CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &mem_src1));
                 CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
                 CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
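With the contiguity requirement dropped from the branch condition, the local-memory GEMM path now handles noncontiguous inputs by compacting them into the preallocated scratch buffers first; the kernel then reads the compacted copy at offset 0 instead of the tensor's original buffer and offset (which is also why the nb strides in ggml_cl_mul_mat are no longer const: the copy helper rewrites them for the dense layout). A simplified, self-contained sketch of that selection logic, with stand-in types in place of cl_mem and cl_ulong:

    #include <cstddef>

    struct gemm_input { void * mem; size_t offset; };

    // Use the tensor's own device buffer if it is already contiguous; otherwise
    // fall back to a scratch buffer holding a compacted copy, which starts at offset 0.
    static gemm_input pick_gemm_input(bool contiguous, void * original_mem, size_t original_offset, void * scratch_mem) {
        if (contiguous) {
            return { original_mem, original_offset };
        }
        return { scratch_mem, 0 };
    }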