@@ -444,19 +444,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
444444 backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
445445 backend_ctx->adreno_gen = get_adreno_gpu_gen (default_device->name );
446446
447- // Default wave size is 128, A8x uses 64.
448- if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
449- backend_ctx->adreno_wave_size = 64 ;
450- } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
451- backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
452- backend_ctx->adreno_wave_size = 128 ;
453- } else {
454- backend_ctx->adreno_wave_size = 128 ;
455- GGML_LOG_WARN (" ggml_opencl: Unsupported Adreno GPU: %s, "
456- " using wave size %d, "
457- " may not work as expected\n " ,
458- backend_ctx->device_name .c_str (), backend_ctx->adreno_wave_size );
459- }
447+ // Use wave size of 64 for all Adreno GPUs.
448+ backend_ctx->adreno_wave_size = 64 ;
460449 } else if (strstr (default_device->name , " Intel" )) {
461450 backend_ctx->gpu_family = GPU_FAMILY::INTEL;
462451 } else {
@@ -1376,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
13761365 int M = tensor->ne [1 ]; // ne01
13771366 int K = tensor->ne [0 ]; // ne00
13781367
1368+ // For matrix-vector multiplication kernel, we assume K is a multiple of 32
1369+ GGML_ASSERT (K % 32 == 0 );
1370+ // For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
1371+ GGML_ASSERT (M % 4 == 0 );
1372+
13791373 // transpose is out of place, so we need to allocate transposed buffers
13801374 // <----------------------------------------------------------------------------------> //
13811375 // use sub_buffer of max buffer size instead
@@ -1416,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14161410 cl_mem qT_d_image1D;
14171411 cl_mem dT_d_image1D;
14181412
1419- cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
1413+ cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14201414 cl_image_desc img_desc_1d;
14211415
14221416 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14231417 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1424- img_desc_1d.image_width = M * K / 8 / 4 ;
1418+ img_desc_1d.image_width = M * K / 4 / 4 ;
14251419 img_desc_1d.buffer = extra->q ;
14261420 q_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14271421 CL_CHECK (err);
14281422
1429- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1423+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14301424 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14311425 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1432- img_desc_1d.image_width = M * K / 8 / 4 ;
1426+ img_desc_1d.image_width = M * K / 4 / 4 ;
14331427 img_desc_1d.buffer = qT_d;
14341428 qT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14351429 CL_CHECK (err);
14361430
1437- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1431+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14381432 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14391433 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1440- img_desc_1d.image_width = M * K / 32 / 4 / 2 ;
1434+ img_desc_1d.image_width = M * K / 32 / 4 ;
14411435 img_desc_1d.buffer = extra->d ;
14421436 d_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14431437 CL_CHECK (err);
14441438
1445- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1439+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14461440 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14471441 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1448- img_desc_1d.image_width = M * K / 32 / 4 / 2 ;
1442+ img_desc_1d.image_width = M * K / 32 / 4 ;
14491443 img_desc_1d.buffer = dT_d;
14501444 dT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14511445 CL_CHECK (err);
@@ -1454,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14541448 // set up and call the transpose kernels
14551449 // <----------------------------------------------------------------------------------> //
14561450 // weights
1457- int height_q = M / 8 ;
1458- int width_q = K / 8 / 4 ;
1451+ int height_q = M / 4 ;
1452+ int width_q = K / 4 / 4 ;
14591453 kernel = backend_ctx->kernel_transpose_16 ;
14601454
14611455 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &q_d_image1D));
@@ -1469,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14691463 CL_CHECK (clWaitForEvents (1 , &evt));
14701464
14711465 // scales
1472- int height_s = M / 8 ;
1473- int width_s = K / 32 / 8 ;
1466+ int height_s = M / 4 ;
1467+ int width_s = K / 32 / 4 ;
14741468
14751469 kernel = backend_ctx->kernel_transpose_16 ;
14761470 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &d_d_image1D));
@@ -1864,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
18641858 void * buf_d;
18651859#endif
18661860
1867- #ifdef GGML_USE_OPENCL
18681861 // Make sure everything is done.
18691862 CL_CHECK(clFinish(queue));
18701863
@@ -1900,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
19001893 extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
19011894 CL_CHECK(clFinish(queue));
19021895#endif // GGML_OPENCL_SOA_Q
1903- #endif // GGML_USE_OPENCL
19041896
19051897 // Open file and dump.
19061898 char fname[512];
@@ -2865,6 +2857,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
28652857 CL_CHECK (status);
28662858
28672859 int height_B = N/4 ;
2860+ if (height_B == 0 ) {
2861+ height_B = 1 ;
2862+ }
28682863 int width_B = K/4 ;
28692864 int padded_height_B = (N + padding)/4 ;
28702865
@@ -3013,11 +3008,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
30133008 }
30143009
30153010 if (N == 1 ) {
3016- local_work_size[0 ] = backend_ctx->adreno_wave_size ; // localsize
3011+ size_t wavesize = backend_ctx->adreno_wave_size ;
3012+ local_work_size[0 ] = wavesize; // localsize
30173013 local_work_size[1 ] = 4 ; // reduce factor
30183014 local_work_size[2 ] = 1 ;
30193015
3020- global_work_size[0 ] = M / 2 ;
3016+ global_work_size[0 ] = ((( M / 2 ) + wavesize - 1 ) / wavesize) * wavesize ;
30213017 global_work_size[1 ] = 4 ; // reduce factor
30223018 global_work_size[2 ] = 1 ;
30233019 }
0 commit comments