@@ -74,6 +74,7 @@ struct ggml_cl_version {
7474 cl_uint minor = 0 ;
7575};
7676
77+
7778struct ggml_cl_compiler_version {
7879 ADRENO_CL_COMPILER_TYPE type;
7980 int major = -1 ;
@@ -91,6 +92,14 @@ struct ggml_cl_compiler_version {
9192 }
9293};
9394
95+ static size_t align_to (size_t value, size_t to_alignment) {
96+ GGML_ASSERT (to_alignment && " Invalid alignment (must be non-zero)" );
97+ GGML_ASSERT ((to_alignment & (to_alignment - 1 )) == 0 && " to_alignment must be power-of-two" );
98+
99+ return ((value + to_alignment - 1 ) / to_alignment) * to_alignment;
100+ }
101+
102+
94103// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
95104static ggml_cl_version parse_cl_version (std::string_view str) {
96105 size_t major_str_begin = 0 ;
@@ -248,6 +257,8 @@ struct ggml_backend_opencl_context {
248257
249258 int adreno_wave_size;
250259
260+ cl_bool non_uniform_workgroups;
261+
251262 cl_context context;
252263 cl_command_queue queue;
253264
@@ -1397,6 +1408,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
13971408 GGML_LOG_INFO (" ggml_opencl: SVM atomics support: %s\n " ,
13981409 svm_caps & CL_DEVICE_SVM_ATOMICS ? " true" : " false" );
13991410
1411+ if (opencl_c_version.major >= 3 ) {
1412+ CL_CHECK (clGetDeviceInfo (device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof (cl_bool),
1413+ &backend_ctx->non_uniform_workgroups , 0 ));
1414+ } else {
1415+ GGML_ASSERT (opencl_c_version.major == 2 );
1416+ // Non-uniform workgroup sizes is mandatory feature in v2.x.
1417+ backend_ctx->non_uniform_workgroups = true ;
1418+ }
1419+
14001420 // Print out configurations
14011421#ifdef GGML_OPENCL_SOA_Q
14021422 GGML_LOG_INFO (" ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n " );
@@ -2058,15 +2078,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
20582078 // The original tensor memory is divided into scales and quants, i.e.,
20592079 // we first store scales, then quants.
20602080 // Create subbuffer for scales.
2061- region.origin = extra_orig->offset + tensor->view_offs + offset;
2081+ region.origin = align_to ( extra_orig->offset + tensor->view_offs + offset, backend_ctx-> alignment ) ;
20622082 region.size = size_d;
20632083 extra->d = clCreateSubBuffer (
20642084 extra_orig->data_device , CL_MEM_READ_WRITE,
20652085 CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
20662086 CL_CHECK (err);
2087+ auto previous_origin = region.origin ;
20672088
20682089 // Create subbuffer for quants.
2069- region.origin = extra_orig-> offset + tensor-> view_offs + offset + size_d ;
2090+ region.origin = align_to (previous_origin + size_d, backend_ctx-> alignment ) ;
20702091 region.size = size_q;
20712092 extra->q = clCreateSubBuffer (
20722093 extra_orig->data_device , CL_MEM_READ_WRITE,
@@ -2942,14 +2963,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
29422963 size_t global_work_size[] = {(size_t )n, 1 , 1 };
29432964 size_t local_work_size[] = {64 , 1 , 1 };
29442965
2966+ size_t * local_work_size_ptr = local_work_size;
2967+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
2968+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
2969+ }
2970+
29452971#ifdef GGML_OPENCL_PROFILING
29462972 cl_event evt;
2947- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2973+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
29482974
29492975 g_profiling_info.emplace_back ();
2950- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2976+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
29512977#else
2952- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2978+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
29532979#endif
29542980 } else {
29552981 unsigned int nth = MIN (64 , ne0);
@@ -3077,14 +3103,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
30773103 size_t global_work_size[] = {(size_t )n, 1 , 1 };
30783104 size_t local_work_size[] = {64 , 1 , 1 };
30793105
3106+ size_t * local_work_size_ptr = local_work_size;
3107+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3108+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3109+ }
3110+
30803111#ifdef GGML_OPENCL_PROFILING
30813112 cl_event evt;
3082- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3113+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
30833114
30843115 g_profiling_info.emplace_back ();
3085- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3116+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
30863117#else
3087- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3118+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
30883119#endif
30893120 } else {
30903121 unsigned int nth = MIN (64 , ne0);
@@ -3233,14 +3264,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
32333264 size_t global_work_size[] = {(size_t )n, 1 , 1 };
32343265 size_t local_work_size[] = {64 , 1 , 1 };
32353266
3267+ size_t * local_work_size_ptr = local_work_size;
3268+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3269+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3270+ }
3271+
32363272#ifdef GGML_OPENCL_PROFILING
32373273 cl_event evt;
3238- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3274+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
32393275
32403276 g_profiling_info.emplace_back ();
3241- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3277+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
32423278#else
3243- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3279+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
32443280#endif
32453281}
32463282
@@ -3273,14 +3309,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
32733309 size_t global_work_size[] = {(size_t )n, 1 , 1 };
32743310 size_t local_work_size[] = {64 , 1 , 1 };
32753311
3312+ size_t * local_work_size_ptr = local_work_size;
3313+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3314+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3315+ }
3316+
32763317#ifdef GGML_OPENCL_PROFILING
32773318 cl_event evt;
3278- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3319+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
32793320
32803321 g_profiling_info.emplace_back ();
3281- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3322+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
32823323#else
3283- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3324+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
32843325#endif
32853326}
32863327
@@ -3320,14 +3361,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
33203361 size_t global_work_size[] = {(size_t )n, 1 , 1 };
33213362 size_t local_work_size[] = {64 , 1 , 1 };
33223363
3364+ size_t * local_work_size_ptr = local_work_size;
3365+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
3366+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
3367+ }
3368+
33233369#ifdef GGML_OPENCL_PROFILING
33243370 cl_event evt;
3325- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3371+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
33263372
33273373 g_profiling_info.emplace_back ();
3328- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3374+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
33293375#else
3330- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3376+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
33313377#endif
33323378}
33333379
@@ -4230,14 +4276,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
42304276 size_t global_work_size[] = {(size_t )n, 1 , 1 };
42314277 size_t local_work_size[] = {64 , 1 , 1 };
42324278
4279+ size_t * local_work_size_ptr = local_work_size;
4280+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
4281+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
4282+ }
4283+
42334284#ifdef GGML_OPENCL_PROFILING
42344285 cl_event evt;
4235- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
4286+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
42364287
42374288 g_profiling_info.emplace_back ();
4238- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
4289+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
42394290#else
4240- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
4291+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
42414292#endif
42424293}
42434294
@@ -4418,14 +4469,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
44184469 size_t global_work_size[] = {(size_t )ne00, (size_t )ne01, (size_t )ne02};
44194470 size_t local_work_size[] = {64 , 1 , 1 };
44204471
4472+ size_t * local_work_size_ptr = local_work_size;
4473+ if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups ) {
4474+ local_work_size_ptr = nullptr ; // Let driver choose the work-group sizes.
4475+ }
4476+
44214477#ifdef GGML_OPENCL_PROFILING
44224478 cl_event evt;
4423- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
4479+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
44244480
44254481 g_profiling_info.emplace_back ();
4426- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
4482+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
44274483#else
4428- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
4484+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
44294485#endif
44304486 }
44314487}
0 commit comments