@@ -69,6 +69,13 @@ struct ggml_cl_version {
6969    cl_uint minor = 0 ;
7070};
7171
72+ static  size_t  align_to (size_t  value, size_t  to_alignment) {
73+     GGML_ASSERT (to_alignment && " Invalid alignment (must be non-zero)"  );
74+     GGML_ASSERT ((to_alignment & (to_alignment - 1 )) == 0  && " to_alignment must be power-of-two"  );
75+ 
76+     return  ((value + to_alignment - 1 ) / to_alignment) * to_alignment;
77+ }
78+ 
7279//  Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
7380static  ggml_cl_version parse_cl_version (std::string_view str) {
7481    size_t  major_str_begin = 0 ;
@@ -218,6 +225,8 @@ struct ggml_backend_opencl_context {
218225
219226    int  adreno_wave_size;
220227
228+     cl_bool non_uniform_workgroups;
229+ 
221230    cl_context context;
222231    cl_command_queue queue;
223232
@@ -655,6 +664,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
655664    GGML_LOG_INFO (" ggml_opencl: SVM atomics support: %s\n "  ,
656665        svm_caps & CL_DEVICE_SVM_ATOMICS ? " true"   : " false"  );
657666
667+     CL_CHECK (clGetDeviceInfo (device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof (cl_bool),
668+                              &backend_ctx->non_uniform_workgroups , 0 ));
669+ 
658670    //  Print out configurations
659671#ifdef  GGML_OPENCL_SOA_Q
660672    GGML_LOG_INFO (" ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n "  );
@@ -1546,15 +1558,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
15461558        //  The original tensor memory is divided into scales and quants, i.e.,
15471559        //  we first store scales, then quants.
15481560        //  Create subbuffer for scales.
1549-         region.origin  = extra_orig->offset  + tensor->view_offs  + offset;
1561+         region.origin  = align_to ( extra_orig->offset  + tensor->view_offs  + offset, backend_ctx-> alignment ) ;
15501562        region.size  = size_d;
15511563        extra->d  = clCreateSubBuffer (
15521564            extra_orig->data_device , CL_MEM_READ_WRITE,
15531565            CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
15541566        CL_CHECK (err);
1567+         auto  previous_origin = region.origin ;
15551568
15561569        //  Create subbuffer for quants.
1557-         region.origin  = extra_orig-> offset  + tensor-> view_offs  + offset + size_d ;
1570+         region.origin  = align_to (previous_origin  + size_d, backend_ctx-> alignment ) ;
15581571        region.size  = size_q;
15591572        extra->q  = clCreateSubBuffer (
15601573            extra_orig->data_device , CL_MEM_READ_WRITE,
@@ -2430,14 +2443,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
24302443        size_t  global_work_size[] = {(size_t )n, 1 , 1 };
24312444        size_t  local_work_size[] = {64 , 1 , 1 };
24322445
2446+         size_t  * local_work_size_ptr = local_work_size;
2447+         if  (n % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
2448+             local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
2449+         }
2450+ 
24332451#ifdef  GGML_OPENCL_PROFILING
24342452        cl_event evt;
2435-         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2453+         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
24362454
24372455        g_profiling_info.emplace_back ();
2438-         populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2456+         populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
24392457#else 
2440-         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2458+         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
24412459#endif 
24422460    } else  {
24432461        unsigned  int  nth = MIN (64 , ne0);
@@ -2565,14 +2583,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
25652583        size_t  global_work_size[] = {(size_t )n, 1 , 1 };
25662584        size_t  local_work_size[] = {64 , 1 , 1 };
25672585
2586+         size_t  * local_work_size_ptr = local_work_size;
2587+         if  (n % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
2588+             local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
2589+         }
2590+ 
25682591#ifdef  GGML_OPENCL_PROFILING
25692592        cl_event evt;
2570-         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2593+         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
25712594
25722595        g_profiling_info.emplace_back ();
2573-         populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2596+         populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
25742597#else 
2575-         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2598+         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
25762599#endif 
25772600    } else  {
25782601        unsigned  int  nth = MIN (64 , ne0);
@@ -2721,14 +2744,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
27212744    size_t  global_work_size[] = {(size_t )n, 1 , 1 };
27222745    size_t  local_work_size[] = {64 , 1 , 1 };
27232746
2747+     size_t  * local_work_size_ptr = local_work_size;
2748+     if  (n % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
2749+         local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
2750+     }
2751+ 
27242752#ifdef  GGML_OPENCL_PROFILING
27252753    cl_event evt;
2726-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2754+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
27272755
27282756    g_profiling_info.emplace_back ();
2729-     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2757+     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
27302758#else 
2731-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2759+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
27322760#endif 
27332761}
27342762
@@ -2761,14 +2789,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
27612789    size_t  global_work_size[] = {(size_t )n, 1 , 1 };
27622790    size_t  local_work_size[] = {64 , 1 , 1 };
27632791
2792+     size_t  * local_work_size_ptr = local_work_size;
2793+     if  (n % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
2794+         local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
2795+     }
2796+ 
27642797#ifdef  GGML_OPENCL_PROFILING
27652798    cl_event evt;
2766-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2799+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
27672800
27682801    g_profiling_info.emplace_back ();
2769-     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2802+     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
27702803#else 
2771-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2804+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
27722805#endif 
27732806}
27742807
@@ -2808,14 +2841,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
28082841    size_t  global_work_size[] = {(size_t )n, 1 , 1 };
28092842    size_t  local_work_size[] = {64 , 1 , 1 };
28102843
2844+     size_t  * local_work_size_ptr = local_work_size;
2845+     if  (n % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
2846+         local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
2847+     }
2848+ 
28112849#ifdef  GGML_OPENCL_PROFILING
28122850    cl_event evt;
2813-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
2851+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
28142852
28152853    g_profiling_info.emplace_back ();
2816-     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
2854+     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
28172855#else 
2818-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
2856+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
28192857#endif 
28202858}
28212859
@@ -3711,14 +3749,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
37113749    size_t  global_work_size[] = {(size_t )n, 1 , 1 };
37123750    size_t  local_work_size[] = {64 , 1 , 1 };
37133751
3752+     size_t  * local_work_size_ptr = local_work_size;
3753+     if  (n % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
3754+         local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
3755+     }
3756+ 
37143757#ifdef  GGML_OPENCL_PROFILING
37153758    cl_event evt;
3716-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3759+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
37173760
37183761    g_profiling_info.emplace_back ();
3719-     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3762+     populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
37203763#else 
3721-     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3764+     CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
37223765#endif 
37233766}
37243767
@@ -3899,14 +3942,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
38993942        size_t  global_work_size[] = {(size_t )ne00, (size_t )ne01, (size_t )ne02};
39003943        size_t  local_work_size[] = {64 , 1 , 1 };
39013944
3945+         size_t  * local_work_size_ptr = local_work_size;
3946+         if  (ne00 % 64  != 0  && !backend_ctx->non_uniform_workgroups ) {
3947+             local_work_size_ptr = nullptr ;  //  Let driver choose the work-group sizes.
3948+         }
3949+ 
39023950#ifdef  GGML_OPENCL_PROFILING
39033951        cl_event evt;
3904-         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , &evt));
3952+         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , &evt));
39053953
39063954        g_profiling_info.emplace_back ();
3907-         populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size , dst);
3955+         populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size_ptr , dst);
39083956#else 
3909-         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size , 0 , NULL , NULL ));
3957+         CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size_ptr , 0 , NULL , NULL ));
39103958#endif 
39113959    }
39123960}
0 commit comments