@@ -444,19 +444,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
444444        backend_ctx->gpu_family  = GPU_FAMILY::ADRENO;
445445        backend_ctx->adreno_gen  = get_adreno_gpu_gen (default_device->name );
446446
447-         //  Default wave size is 128, A8x uses 64.
448-         if  (backend_ctx->adreno_gen  == ADRENO_GPU_GEN::A8X) {
449-             backend_ctx->adreno_wave_size  = 64 ;
450-         } else  if  (backend_ctx->adreno_gen  == ADRENO_GPU_GEN::A7X ||
451-                    backend_ctx->adreno_gen  == ADRENO_GPU_GEN::X1E) {
452-             backend_ctx->adreno_wave_size  = 128 ;
453-         } else  {
454-             backend_ctx->adreno_wave_size  = 128 ;
455-             GGML_LOG_WARN (" ggml_opencl: Unsupported Adreno GPU: %s, " 
456-                 " using wave size %d, " 
457-                 " may not work as expected\n "  ,
458-                 backend_ctx->device_name .c_str (), backend_ctx->adreno_wave_size );
459-         }
447+         //  Use wave size of 64 for all Adreno GPUs.
448+         backend_ctx->adreno_wave_size  = 64 ;
460449    } else  if  (strstr (default_device->name , " Intel"  )) {
461450        backend_ctx->gpu_family  = GPU_FAMILY::INTEL;
462451    } else  {
@@ -1376,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
13761365        int  M = tensor->ne [1 ];   //  ne01
13771366        int  K = tensor->ne [0 ];   //  ne00
13781367
1368+         // For matrix-vector multiplication kernel, we assume K is a multiple of 32
1369+         GGML_ASSERT (K % 32  == 0 );
1370+         // For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
1371+         GGML_ASSERT (M % 4  == 0 );
1372+ 
13791373        //  transpose is out of place, so we need to allocate transposed buffers
13801374        //  <----------------------------------------------------------------------------------> //
13811375        //  use sub_buffer of max buffer size instead
@@ -1416,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14161410        cl_mem qT_d_image1D;
14171411        cl_mem dT_d_image1D;
14181412
1419-         cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT  };
1413+         cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT  };
14201414        cl_image_desc img_desc_1d;
14211415
14221416        memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14231417        img_desc_1d.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1424-         img_desc_1d.image_width  = M * K / 8  / 4 ;
1418+         img_desc_1d.image_width  = M * K / 4  / 4 ;
14251419        img_desc_1d.buffer  = extra->q ;
14261420        q_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14271421        CL_CHECK (err);
14281422
1429-         img_fmt_1d = { CL_RGBA, CL_FLOAT  };
1423+         img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT  };
14301424        memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14311425        img_desc_1d.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1432-         img_desc_1d.image_width  = M * K / 8  / 4 ;
1426+         img_desc_1d.image_width  = M * K / 4  / 4 ;
14331427        img_desc_1d.buffer  = qT_d;
14341428        qT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14351429        CL_CHECK (err);
14361430
1437-         img_fmt_1d = { CL_RGBA, CL_FLOAT  };
1431+         img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT  };
14381432        memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14391433        img_desc_1d.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1440-         img_desc_1d.image_width  = M * K / 32  / 4  /  2 ;
1434+         img_desc_1d.image_width  = M * K / 32  / 4 ;
14411435        img_desc_1d.buffer  = extra->d ;
14421436        d_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14431437        CL_CHECK (err);
14441438
1445-         img_fmt_1d = { CL_RGBA, CL_FLOAT  };
1439+         img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT  };
14461440        memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
14471441        img_desc_1d.image_type  = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1448-         img_desc_1d.image_width  = M * K / 32  / 4  /  2 ;
1442+         img_desc_1d.image_width  = M * K / 32  / 4 ;
14491443        img_desc_1d.buffer  = dT_d;
14501444        dT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
14511445        CL_CHECK (err);
@@ -1454,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14541448        //  set up and call the transpose kernels
14551449        //  <----------------------------------------------------------------------------------> //
14561450        //  weights
1457-         int  height_q = M / 8 ;
1458-         int  width_q = K / 8  / 4 ;
1451+         int  height_q = M / 4 ;
1452+         int  width_q = K / 4  / 4 ;
14591453        kernel = backend_ctx->kernel_transpose_16 ;
14601454
14611455        CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &q_d_image1D));
@@ -1469,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14691463        CL_CHECK (clWaitForEvents (1 , &evt));
14701464
14711465        //  scales
1472-         int  height_s = M / 8 ;
1473-         int  width_s = K / 32  / 8 ;
1466+         int  height_s = M / 4 ;
1467+         int  width_s = K / 32  / 4 ;
14741468
14751469        kernel = backend_ctx->kernel_transpose_16 ;
14761470        CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &d_d_image1D));
@@ -1864,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
18641858    void * buf_d;
18651859#endif
18661860
1867- #ifdef GGML_USE_OPENCL
18681861    // Make sure everything is done.
18691862    CL_CHECK(clFinish(queue));
18701863
@@ -1900,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
19001893        extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
19011894    CL_CHECK(clFinish(queue));
19021895#endif // GGML_OPENCL_SOA_Q
1903- #endif // GGML_USE_OPENCL
19041896
19051897    // Open file and dump.
19061898    char fname[512];
@@ -2865,6 +2857,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
28652857            CL_CHECK (status);
28662858
28672859            int  height_B = N/4 ;
2860+             if  (height_B == 0 ) {
2861+                 height_B = 1 ;
2862+             }
28682863            int  width_B = K/4 ;
28692864            int  padded_height_B = (N + padding)/4 ;
28702865
@@ -3013,11 +3008,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
30133008        }
30143009
30153010        if  (N == 1 ) {
3016-             local_work_size[0 ] = backend_ctx->adreno_wave_size ; //  localsize
3011+             size_t  wavesize = backend_ctx->adreno_wave_size ;
3012+             local_work_size[0 ] = wavesize; //  localsize
30173013            local_work_size[1 ] = 4 ; //  reduce factor
30183014            local_work_size[2 ] = 1 ;
30193015
3020-             global_work_size[0 ] = M / 2 ;
3016+             global_work_size[0 ] = ((( M / 2 ) + wavesize -  1 ) / wavesize) * wavesize ;
30213017            global_work_size[1 ] = 4 ; //  reduce factor
30223018            global_work_size[2 ] = 1 ;
30233019        }
0 commit comments