@@ -1365,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
13651365 int M = tensor->ne [1 ]; // ne01
13661366 int K = tensor->ne [0 ]; // ne00
13671367
1368+ // For matrix-vector multiplication kernel, we assume K is a multiple of 32
1369+ GGML_ASSERT (K % 32 == 0 );
1370+ // For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
1371+ GGML_ASSERT (M % 4 == 0 );
1372+
13681373 // transpose is out of place, so we need to allocate transposed buffers
13691374 // <----------------------------------------------------------------------------------> //
13701375 // use sub_buffer of max buffer size instead
@@ -1405,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14051410 cl_mem qT_d_image1D;
14061411 cl_mem dT_d_image1D;
14071412
1408- cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
1413+ cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14091414 cl_image_desc img_desc_1d;
14101415
14111416 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14121417 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1413- img_desc_1d.image_width = M * K / 8 / 4 ;
1418+ img_desc_1d.image_width = M * K / 4 / 4 ;
14141419 img_desc_1d.buffer = extra->q ;
14151420 q_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14161421 CL_CHECK (err);
14171422
1418- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1423+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14191424 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14201425 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1421- img_desc_1d.image_width = M * K / 8 / 4 ;
1426+ img_desc_1d.image_width = M * K / 4 / 4 ;
14221427 img_desc_1d.buffer = qT_d;
14231428 qT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14241429 CL_CHECK (err);
14251430
1426- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1431+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14271432 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14281433 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1429- img_desc_1d.image_width = M * K / 32 / 4 / 2 ;
1434+ img_desc_1d.image_width = M * K / 32 / 4 ;
14301435 img_desc_1d.buffer = extra->d ;
14311436 d_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14321437 CL_CHECK (err);
14331438
1434- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1439+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
14351440 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14361441 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1437- img_desc_1d.image_width = M * K / 32 / 4 / 2 ;
1442+ img_desc_1d.image_width = M * K / 32 / 4 ;
14381443 img_desc_1d.buffer = dT_d;
14391444 dT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14401445 CL_CHECK (err);
@@ -1443,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14431448 // set up and call the transpose kernels
14441449 // <----------------------------------------------------------------------------------> //
14451450 // weights
1446- int height_q = M / 8 ;
1447- int width_q = K / 8 / 4 ;
1451+ int height_q = M / 4 ;
1452+ int width_q = K / 4 / 4 ;
14481453 kernel = backend_ctx->kernel_transpose_16 ;
14491454
14501455 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &q_d_image1D));
@@ -1458,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14581463 CL_CHECK (clWaitForEvents (1 , &evt));
14591464
14601465 // scales
1461- int height_s = M / 8 ;
1462- int width_s = K / 32 / 8 ;
1466+ int height_s = M / 4 ;
1467+ int width_s = K / 32 / 4 ;
14631468
14641469 kernel = backend_ctx->kernel_transpose_16 ;
14651470 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &d_d_image1D));
@@ -1853,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
18531858 void * buf_d;
18541859#endif
18551860
1856- #ifdef GGML_USE_OPENCL
18571861 // Make sure everything is done.
18581862 CL_CHECK(clFinish(queue));
18591863
@@ -1889,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
18891893 extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
18901894 CL_CHECK(clFinish(queue));
18911895#endif // GGML_OPENCL_SOA_Q
1892- #endif // GGML_USE_OPENCL
18931896
18941897 // Open file and dump.
18951898 char fname[512];
0 commit comments