@@ -74,6 +74,7 @@ struct ggml_cl_version {
     cl_uint minor = 0;
 };

+
 struct ggml_cl_compiler_version {
     ADRENO_CL_COMPILER_TYPE type;
     int major = -1;
@@ -91,6 +92,14 @@ struct ggml_cl_compiler_version {
     }
 };

+static size_t align_to(size_t value, size_t to_alignment) {
+    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
+
 // Parses a version string of form "XX.YY". On an error returns ggml_cl_version with all zeroes.
 static ggml_cl_version parse_cl_version(std::string_view str) {
     size_t major_str_begin = 0;
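The new align_to() helper rounds a byte count up to the next multiple of a power-of-two alignment; it is used further down to place sub-buffer origins on aligned boundaries. A minimal, self-contained sketch of the same rounding rule (the name align_to_sketch and the assert-based checks are illustrative, not part of this change):

#include <cassert>
#include <cstddef>

// Same formula as align_to() above, with GGML_ASSERT swapped for <cassert>
// so the sketch compiles outside ggml.
static size_t align_to_sketch(size_t value, size_t to_alignment) {
    assert(to_alignment && (to_alignment & (to_alignment - 1)) == 0);
    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
}

int main() {
    assert(align_to_sketch(0,   128) == 0);    // already aligned: unchanged
    assert(align_to_sketch(1,   128) == 128);  // always rounds up, never down
    assert(align_to_sketch(129, 128) == 256);
    return 0;
}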
@@ -248,6 +257,8 @@ struct ggml_backend_opencl_context {

     int adreno_wave_size;

+    cl_bool non_uniform_workgroups;
+
     cl_context context;
     cl_command_queue queue;

@@ -1397,6 +1408,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
         svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");

+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+        &backend_ctx->non_uniform_workgroups, 0));
+
     // Print out configurations
 #ifdef GGML_OPENCL_SOA_Q
     GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
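CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT is an OpenCL 3.0 device query returning a cl_bool; the result is cached on the backend context and consulted before the kernel launches below. A hedged standalone probe, assuming device is an already-discovered cl_device_id (the helper name is illustrative, not part of this change):

#include <CL/cl.h>
#include <cstdio>

static void print_non_uniform_support(cl_device_id device) {
    cl_bool supported = CL_FALSE;
    // On pre-3.0 headers/drivers the query may fail; treat that as "not supported".
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
                                 sizeof(supported), &supported, NULL);
    printf("non-uniform work-groups: %s\n",
           (err == CL_SUCCESS && supported) ? "true" : "false");
}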
@@ -2058,15 +2072,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // The original tensor memory is divided into scales and quants, i.e.,
         // we first store scales, then quants.
         // Create subbuffer for scales.
-        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
         region.size = size_d;
         extra->d = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
+        auto previous_origin = region.origin;

         // Create subbuffer for quants.
-        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
         region.size = size_q;
         extra->q = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
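Both sub-buffer origins are now rounded up with align_to() because clCreateSubBuffer fails with CL_MISALIGNED_SUB_BUFFER_OFFSET when the origin is not a multiple of the device's base-address alignment (backend_ctx->alignment, queried from the device elsewhere in this file). An illustrative helper showing the same idea in isolation (create_aligned_subbuffer is hypothetical, not part of this change):

#include <CL/cl.h>

static cl_mem create_aligned_subbuffer(cl_device_id device, cl_mem parent,
                                       size_t origin, size_t size, cl_int * err) {
    // CL_DEVICE_MEM_BASE_ADDR_ALIGN reports the required alignment in *bits*.
    cl_uint align_bits = 0;
    clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align_bits), &align_bits, NULL);
    size_t align_bytes = align_bits ? align_bits / 8 : 1;

    cl_buffer_region region;
    region.origin = ((origin + align_bytes - 1) / align_bytes) * align_bytes;  // round up
    region.size   = size;

    // With an unaligned origin this call would return CL_MISALIGNED_SUB_BUFFER_OFFSET.
    return clCreateSubBuffer(parent, CL_MEM_READ_WRITE,
                             CL_BUFFER_CREATE_TYPE_REGION, &region, err);
}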
@@ -2942,14 +2957,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
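The fallback is needed because, without non-uniform work-group support, clEnqueueNDRangeKernel rejects launches whose global size is not an exact multiple of the local size; passing NULL for local_work_size lets the runtime pick a valid one instead. The same guard is repeated in the mul/silu/relu/clamp/scale/diag_mask_inf hunks that follow. A hedged sketch of the decision as a free function (pick_local_work_size is illustrative, not how the backend factors it):

#include <cstddef>

static const size_t * pick_local_work_size(size_t n, const size_t local[3],
                                           bool non_uniform_supported) {
    // Uneven launch on a device without non-uniform work-groups: let the
    // driver choose by returning NULL for local_work_size.
    if (n % local[0] != 0 && !non_uniform_supported) {
        return nullptr;
    }
    return local;
}

// e.g. pick_local_work_size(n, local_work_size, backend_ctx->non_uniform_workgroups)
// returns either local_work_size or NULL, mirroring local_work_size_ptr above.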
@@ -3077,14 +3097,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
@@ -3233,14 +3258,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -3273,14 +3303,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -3320,14 +3355,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -4230,14 +4270,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -4418,14 +4463,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
         size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     }
 }