@@ -300,7 +300,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_norm;
-    cl_program program_group_norm; // Added for group_norm
+    cl_program program_group_norm;
     cl_program program_repeat;
     cl_program program_pad;
     cl_program program_unary;
@@ -328,13 +328,13 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_tanh_f16_nd;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
-    cl_kernel kernel_group_norm; // Added for group_norm
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_repeat;
     cl_kernel kernel_pad;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32_contiguous; // Added for concat
-    cl_kernel kernel_concat_f32_non_contiguous; // Added for concat
+    cl_kernel kernel_concat_f32_contiguous;
+    cl_kernel kernel_concat_f32_non_contiguous;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_rms_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
@@ -854,21 +854,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     // group_norm
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
-        // Assuming group_norm.cl.h will be created similarly if embedding
-        // For now, direct include from norm.cl.h implies group_norm kernel is in norm.cl
         const std::string kernel_src {
             #include "norm.cl.h"
         };
 #else
-        // Assuming group_norm kernel is now part of norm.cl as per previous step
         const std::string kernel_src = read_file("norm.cl");
 #endif
-        // If group_norm is in a separate file, adjust program creation:
-        // backend_ctx->program_group_norm =
-        //     build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        // CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
-        // Since it's added to norm.cl, reuse program_norm
-        // Since it's added to norm.cl, reuse program_norm
         CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_norm, "kernel_group_norm", &err), err));
         GGML_LOG_CONT(".");
     }
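
One thing worth noting in the hunk above: norm.cl is compiled once into program_norm, and both kernel_norm and kernel_group_norm are then created from that single program object. A minimal sketch of the pattern, assuming ctx, dev, norm_src, and compile_opts are already set up, and with build_program_from_source standing in for the usual clCreateProgramWithSource + clBuildProgram sequence:

    // One compile, several kernel handles: clBuildProgram dominates startup
    // cost, while each clCreateKernel call is comparatively cheap.
    cl_int err;
    cl_program program_norm =
        build_program_from_source(ctx, dev, norm_src.c_str(), compile_opts);
    cl_kernel kernel_norm       = clCreateKernel(program_norm, "kernel_norm", &err);
    cl_kernel kernel_group_norm = clCreateKernel(program_norm, "kernel_group_norm", &err);
    // The kernel names must match __kernel functions defined in norm.cl.
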
@@ -951,7 +942,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         backend_ctx->program_upscale =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
         CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
-        // Try to load bilinear kernel from the same program
         if (backend_ctx->program_upscale) {
             cl_int err_bilinear;
             backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
@@ -974,13 +964,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     // concat
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
-        // Assuming concat.cl.h will be created if embedding concat kernels
-        // For now, assuming concat.cl is a separate file or its content is available
         const std::string kernel_src {
-            #include "concat.cl.h" // Placeholder if you create this embedded header
+            #include "concat.cl.h"
         };
 #else
-        // Assuming concat kernels are in concat.cl
+
         const std::string kernel_src = read_file("concat.cl");
 #endif
         if (!kernel_src.empty()) {
@@ -1002,20 +990,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "tsembd.cl.h" // Assuming tsembd.cl.h if embedding
+            #include "tsembd.cl.h"
         };
 #else
-        // Assuming tsembd kernel is in tsembd.cl (or norm.cl if you added it there)
-        const std::string kernel_src = read_file("tsembd.cl"); // Or "norm.cl"
+
+        const std::string kernel_src = read_file("tsembd.cl");
 #endif
         if (!kernel_src.empty()) {
-            // Check if program_tsembd should reuse program_norm or be a new one
-            // If tsembd.cl is separate:
             backend_ctx->program_tsembd =
                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
             CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
-            // If kernel_timestep_embedding is in norm.cl, then:
-            // CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_norm, "kernel_timestep_embedding", &err), err));
             GGML_LOG_CONT(".");
         } else {
             GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
@@ -2070,7 +2054,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
         case GGML_OP_PAD:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
-                   op->src[0]->ne[3] == 1 && op->ne[3] == 1; // Only 3D tensors for now
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
         case GGML_OP_GROUP_NORM:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         case GGML_OP_UPSCALE:
@@ -3659,7 +3643,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
         kernel = backend_ctx->kernel_tanh_f16_nd;
     } else {
         GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
-        return;
     }
     GGML_ASSERT(kernel != nullptr);

@@ -3700,14 +3683,11 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     global_work_size[1] = (size_t)ne11;
     global_work_size[2] = (size_t)ne12;

-    // Determine appropriate local work size. Max 256 total threads per workgroup.
-    // Try to make it somewhat balanced.
     size_t lws0 = 16, lws1 = 4, lws2 = 1;
     if (ne10 < 16) lws0 = ne10;
     if (ne11 < 4) lws1 = ne11;
-    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1; // Ensure lws2 is at least 1 if ne12 > 0
+    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;

-    // Ensure total local size is not too large
     while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
     while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
     while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
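
With the defaults above ({16, 4, 1}, product 64) the three halving loops never fire; they are purely defensive and only engage if the starting sizes exceed 256 work-items. The same capping logic as a standalone sketch, where the 256 ceiling is an assumed conservative limit rather than a queried CL_DEVICE_MAX_WORK_GROUP_SIZE:

    // Shrink a 3D local work size until its product fits under max_total,
    // halving dimension 0 first, then 1, then 2.
    static void clamp_local_size(size_t lws[3], size_t max_total) {
        while (lws[0] * lws[1] * lws[2] > max_total && lws[0] > 1) lws[0] /= 2;
        while (lws[0] * lws[1] * lws[2] > max_total && lws[1] > 1) lws[1] /= 2;
        while (lws[0] * lws[1] * lws[2] > max_total && lws[2] > 1) lws[2] /= 2;
    }
    // Example: {256, 4, 1} has product 1024; two halvings of lws[0] give
    // {64, 4, 1}, whose product of 256 fits exactly.
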
@@ -3720,7 +3700,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
         if (global_work_size[0] % local_work_size[0] != 0 ||
             global_work_size[1] % local_work_size[1] != 0 ||
             global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL; // Let runtime decide if padding is not perfect
+            local_work_size_ptr = NULL;
         }
     }
     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
@@ -4177,7 +4157,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg

     const int ne00_src = src0->ne[0];
     const int ne01_src = src0->ne[1];
-    // ne02_src, ne03_src are not passed to bilinear kernel directly

     const int ne10_dst = dst->ne[0];
     const int ne11_dst = dst->ne[1];
@@ -4263,8 +4242,6 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

     if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
         GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
-        // Fallback or error handling would be needed here in a real scenario,
-        // for now, it will likely lead to an assertion or error later if not handled.
         return;
     }

@@ -4281,21 +4258,16 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

     if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
         if (dim == 3) {
-            // Handle dim 3 for contiguous with two clEnqueueCopyBuffer calls
-            // (or clEnqueueCopyBufferRect if strides were complex but still block-copyable)
-            // This matches the CUDA logic of using cudaMemcpyAsync for dim 3.
+
             size_t nbytes_src0 = ggml_nbytes(src0);
             size_t nbytes_src1 = ggml_nbytes(src1);

-            // Copy src0
             CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
                                          off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
-            // Copy src1
             CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
                                          off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
         } else {
-            // Use specialized contiguous kernel: kernel_concat_f32_contiguous
-            // This kernel is designed to be called in a loop for the 4th dimension (i3)
+
             cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
             size_t global_work_size[3];

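
The dim == 3 fast path leans on ggml's memory layout: in a contiguous tensor, i3 is the slowest-varying index, so concatenating along it is a plain byte append of src1 after src0. Worked numbers, with illustrative shapes and F32 elements:

    // src0: ne = {4, 3, 2, 2} -> ggml_nbytes(src0) =  48 * 4 = 192 bytes
    // src1: ne = {4, 3, 2, 5} -> ggml_nbytes(src1) = 120 * 4 = 480 bytes
    // dst : ne = {4, 3, 2, 7}
    //
    // dst bytes [off_dst,       off_dst + 192) <- all of src0
    // dst bytes [off_dst + 192, off_dst + 672) <- all of src1
    //
    // Hence the two clEnqueueCopyBuffer calls above; concatenation along any
    // other dim interleaves the sources and goes through the concat kernels.
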
@@ -4304,8 +4276,6 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
                 cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
                 cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);

-                // Kernel expects 3D slice dimensions.
-                // src0->ne[0..2], src1->ne[0..2], dst->ne[0..2]
                 int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
                 int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
                 int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
@@ -4335,15 +4305,11 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
             }
         }
     } else {
-        // Use generic non-contiguous kernel: kernel_concat_f32_non_contiguous
         cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;

         long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
         cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];

-        // src1 dimensions (ne10-ne13) are not explicitly passed to this OpenCL kernel,
-        // as the logic inside the kernel derives necessary src1 indices based on dst indices and src0 dimensions.
-        // Strides for src1 are important.
         cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];

         long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
@@ -4381,16 +4347,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
         CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
         CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));

-        // Global work size is based on dst's dimensions ne[1], ne[2], ne[3].
-        // Local work size for the 0th dimension is handled inside the kernel loop.
-        // A common local size for the first dimension.
         size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
                                          d_ne2 > 0 ? (size_t)d_ne2 : 1,
                                          d_ne3 > 0 ? (size_t)d_ne3 : 1 };

-        // Using NULL for local_work_size lets the OpenCL runtime decide,
-        // or effectively makes the i0 loop in the kernel serial per work-item.
-        // This change is paired with a kernel modification to iterate i0 serially.
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
     }
 }
@@ -4419,7 +4379,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor

     const int logical_dim = dst->op_params[0];
     const int max_period = dst->op_params[1];
-    const int dst_nb1_bytes = dst->nb[1]; // This is already in bytes
+    const int dst_nb1_bytes = dst->nb[1];

     cl_kernel kernel = backend_ctx->kernel_timestep_embedding;

@@ -4431,33 +4391,21 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));

-    // Global work size
-    // Dimension 0 (x) is for feature_idx_j
-    // Max j needed is (logical_dim+1)/2. So global size is (logical_dim+1)/2 + 1 to make j range up to (logical_dim+1)/2.
     size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
-    // Dimension 1 (y) is for timestep_idx
+
     size_t gws1 = (size_t)src0->ne[0];

     size_t global_work_size[] = {gws0, gws1, 1};

-    // Local work size can be NULL to let the runtime decide, or a small fixed size.
-    // For simplicity and correctness first, use NULL.
-    // size_t local_work_size[] = {16, 1, 1}; // Example, can be tuned
-    // Adjust gws0 to be a multiple of lws0 if lws0 is not NULL and non-uniform WS is not robustly supported.
-    // if (local_work_size[0] > 0 && !backend_ctx->non_uniform_workgroups) {
-    //     global_work_size[0] = ((global_work_size[0] + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
-    // }
-
-
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem

     g_profiling_info.emplace_back();
     size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
     size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
-    if (false /* replace with actual LWS if used */) {
-        // profiling_lws[0] = local_work_size[0]; profiling_lws[1] = local_work_size[1]; profiling_lws[2] = local_work_size[2];
+    if (false) {
+
     }
     populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
 #else
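
The reasoning behind the launch size: each work-item j along dimension 0 produces one cos/sin pair of the embedding, so j has to run up to (logical_dim + 1) / 2 inclusive, while dimension 1 indexes the timesteps in src0. A worked sketch of the arithmetic with illustrative values:

    int logical_dim = 5;                                  // embedding size requested by the op
    size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);  // (5 + 1) / 2 + 1 = 4, so j = 0..3
    size_t gws1 = 7;                                      // e.g. 7 timesteps in src0->ne[0]
    size_t global_work_size[] = {gws0, gws1, 1};
    // A NULL local size (as above) leaves the workgroup shape to the runtime,
    // which is why the profiling path records an LWS of {0,0,0}.
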
@@ -5942,10 +5890,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         if (!any_on_device) {
             return false;
         }
-        // ggml_cl_timestep_embedding takes (backend, src0, dst)
-        // Need a small wrapper or direct call logic here if func signature is strict
         ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
-        return true; // Handled directly
+        return true;
     case GGML_OP_RMS_NORM:
         if (!any_on_device) {
             return false;
         }