@@ -1365,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1365
1365
int M = tensor->ne [1 ]; // ne01
1366
1366
int K = tensor->ne [0 ]; // ne00
1367
1367
1368
+ // For matrix-vector multiplication kernel, we assume K is a multiple of 32
1369
+ GGML_ASSERT (K % 32 == 0 );
1370
+ // For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
1371
+ GGML_ASSERT (M % 4 == 0 );
1372
+
1368
1373
// transpose is out of place, so we need to allocate transposed buffers
1369
1374
// <----------------------------------------------------------------------------------> //
1370
1375
// use sub_buffer of max buffer size instead
@@ -1405,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1405
1410
cl_mem qT_d_image1D;
1406
1411
cl_mem dT_d_image1D;
1407
1412
1408
- cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
1413
+ cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
1409
1414
cl_image_desc img_desc_1d;
1410
1415
1411
1416
memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
1412
1417
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1413
- img_desc_1d.image_width = M * K / 8 / 4 ;
1418
+ img_desc_1d.image_width = M * K / 4 / 4 ;
1414
1419
img_desc_1d.buffer = extra->q ;
1415
1420
q_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
1416
1421
CL_CHECK (err);
1417
1422
1418
- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1423
+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
1419
1424
memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
1420
1425
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1421
- img_desc_1d.image_width = M * K / 8 / 4 ;
1426
+ img_desc_1d.image_width = M * K / 4 / 4 ;
1422
1427
img_desc_1d.buffer = qT_d;
1423
1428
qT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
1424
1429
CL_CHECK (err);
1425
1430
1426
- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1431
+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
1427
1432
memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
1428
1433
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1429
- img_desc_1d.image_width = M * K / 32 / 4 / 2 ;
1434
+ img_desc_1d.image_width = M * K / 32 / 4 ;
1430
1435
img_desc_1d.buffer = extra->d ;
1431
1436
d_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
1432
1437
CL_CHECK (err);
1433
1438
1434
- img_fmt_1d = { CL_RGBA, CL_FLOAT };
1439
+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
1435
1440
memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
1436
1441
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1437
- img_desc_1d.image_width = M * K / 32 / 4 / 2 ;
1442
+ img_desc_1d.image_width = M * K / 32 / 4 ;
1438
1443
img_desc_1d.buffer = dT_d;
1439
1444
dT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
1440
1445
CL_CHECK (err);
@@ -1443,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1443
1448
// set up and call the transpose kernels
1444
1449
// <----------------------------------------------------------------------------------> //
1445
1450
// weights
1446
- int height_q = M / 8 ;
1447
- int width_q = K / 8 / 4 ;
1451
+ int height_q = M / 4 ;
1452
+ int width_q = K / 4 / 4 ;
1448
1453
kernel = backend_ctx->kernel_transpose_16 ;
1449
1454
1450
1455
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &q_d_image1D));
@@ -1458,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1458
1463
CL_CHECK (clWaitForEvents (1 , &evt));
1459
1464
1460
1465
// scales
1461
- int height_s = M / 8 ;
1462
- int width_s = K / 32 / 8 ;
1466
+ int height_s = M / 4 ;
1467
+ int width_s = K / 32 / 4 ;
1463
1468
1464
1469
kernel = backend_ctx->kernel_transpose_16 ;
1465
1470
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &d_d_image1D));
@@ -1853,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
1853
1858
void * buf_d;
1854
1859
#endif
1855
1860
1856
- #ifdef GGML_USE_OPENCL
1857
1861
// Make sure everything is done.
1858
1862
CL_CHECK(clFinish(queue));
1859
1863
@@ -1889,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
1889
1893
extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
1890
1894
CL_CHECK(clFinish(queue));
1891
1895
#endif // GGML_OPENCL_SOA_Q
1892
- #endif // GGML_USE_OPENCL
1893
1896
1894
1897
// Open file and dump.
1895
1898
char fname[512];
0 commit comments