@@ -300,7 +300,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_norm;
-    cl_program program_group_norm; // Added for group_norm
+    cl_program program_group_norm;
     cl_program program_repeat;
     cl_program program_pad;
     cl_program program_unary;
@@ -328,13 +328,13 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_tanh_f16_nd;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
-    cl_kernel kernel_group_norm; // Added for group_norm
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_repeat;
     cl_kernel kernel_pad;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32_contiguous; // Added for concat
-    cl_kernel kernel_concat_f32_non_contiguous; // Added for concat
+    cl_kernel kernel_concat_f32_contiguous;
+    cl_kernel kernel_concat_f32_non_contiguous;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_rms_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
@@ -854,21 +854,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     // group_norm
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
-        // Assuming group_norm.cl.h will be created similarly if embedding
-        // For now, direct include from norm.cl.h implies group_norm kernel is in norm.cl
         const std::string kernel_src {
             #include "norm.cl.h"
         };
 #else
-        // Assuming group_norm kernel is now part of norm.cl as per previous step
         const std::string kernel_src = read_file("norm.cl");
 #endif
-        // If group_norm is in a separate file, adjust program creation:
-        // backend_ctx->program_group_norm =
-        //     build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        // CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
-        // Since it's added to norm.cl, reuse program_norm
-        // Since it's added to norm.cl, reuse program_norm
         CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_norm, "kernel_group_norm", &err), err));
         GGML_LOG_CONT(".");
     }
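
One thing worth noting in the hunk above: norm.cl is compiled once into program_norm, and both kernel_norm and kernel_group_norm are then created from that single program object. A minimal sketch of the pattern, assuming ctx, dev, norm_src, and compile_opts are already set up, and with build_program_from_source standing in for the usual clCreateProgramWithSource + clBuildProgram sequence:

    // One compile, several kernel handles: clBuildProgram dominates startup
    // cost, while each clCreateKernel call is comparatively cheap.
    cl_int err;
    cl_program program_norm =
        build_program_from_source(ctx, dev, norm_src.c_str(), compile_opts);
    cl_kernel kernel_norm       = clCreateKernel(program_norm, "kernel_norm", &err);
    cl_kernel kernel_group_norm = clCreateKernel(program_norm, "kernel_group_norm", &err);
    // The kernel names must match __kernel functions defined in norm.cl.
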
@@ -951,7 +942,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         backend_ctx->program_upscale =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
         CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
-        // Try to load bilinear kernel from the same program
         if (backend_ctx->program_upscale) {
             cl_int err_bilinear;
             backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
@@ -974,13 +964,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     // concat
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
-        // Assuming concat.cl.h will be created if embedding concat kernels
-        // For now, assuming concat.cl is a separate file or its content is available
         const std::string kernel_src {
-            #include "concat.cl.h" // Placeholder if you create this embedded header
+            #include "concat.cl.h"
         };
 #else
-        // Assuming concat kernels are in concat.cl
+
         const std::string kernel_src = read_file("concat.cl");
 #endif
         if (!kernel_src.empty()) {
@@ -1002,20 +990,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-            #include "tsembd.cl.h" // Assuming tsembd.cl.h if embedding
+            #include "tsembd.cl.h"
         };
 #else
-        // Assuming tsembd kernel is in tsembd.cl (or norm.cl if you added it there)
-        const std::string kernel_src = read_file("tsembd.cl"); // Or "norm.cl"
+
+        const std::string kernel_src = read_file("tsembd.cl");
 #endif
         if (!kernel_src.empty()) {
-            // Check if program_tsembd should reuse program_norm or be a new one
-            // If tsembd.cl is separate:
             backend_ctx->program_tsembd =
                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
             CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
-            // If kernel_timestep_embedding is in norm.cl, then:
-            // CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_norm, "kernel_timestep_embedding", &err), err));
             GGML_LOG_CONT(".");
         } else {
             GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
@@ -2070,7 +2054,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
         case GGML_OP_PAD:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
-                   op->src[0]->ne[3] == 1 && op->ne[3] == 1; // Only 3D tensors for now
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
         case GGML_OP_GROUP_NORM:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         case GGML_OP_UPSCALE:
@@ -3659,7 +3643,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
         kernel = backend_ctx->kernel_tanh_f16_nd;
     } else {
         GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
-        return;
     }
     GGML_ASSERT(kernel != nullptr);

@@ -3700,14 +3683,11 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     global_work_size[1] = (size_t)ne11;
     global_work_size[2] = (size_t)ne12;

-    // Determine appropriate local work size. Max 256 total threads per workgroup.
-    // Try to make it somewhat balanced.
     size_t lws0 = 16, lws1 = 4, lws2 = 1;
     if (ne10 < 16) lws0 = ne10;
     if (ne11 < 4) lws1 = ne11;
-    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1; // Ensure lws2 is at least 1 if ne12 > 0
+    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;

-    // Ensure total local size is not too large
     while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
     while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
     while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
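
With the defaults above ({16, 4, 1}, product 64) the three halving loops never fire; they are purely defensive and only engage if the starting sizes exceed 256 work-items. The same capping logic as a standalone sketch, where the 256 ceiling is an assumed conservative limit rather than a queried CL_DEVICE_MAX_WORK_GROUP_SIZE:

    // Shrink a 3D local work size until its product fits under max_total,
    // halving dimension 0 first, then 1, then 2.
    static void clamp_local_size(size_t lws[3], size_t max_total) {
        while (lws[0] * lws[1] * lws[2] > max_total && lws[0] > 1) lws[0] /= 2;
        while (lws[0] * lws[1] * lws[2] > max_total && lws[1] > 1) lws[1] /= 2;
        while (lws[0] * lws[1] * lws[2] > max_total && lws[2] > 1) lws[2] /= 2;
    }
    // Example: {256, 4, 1} has product 1024; two halvings of lws[0] give
    // {64, 4, 1}, whose product of 256 fits exactly.
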
@@ -3720,7 +3700,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
         if (global_work_size[0] % local_work_size[0] != 0 ||
             global_work_size[1] % local_work_size[1] != 0 ||
             global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL; // Let runtime decide if padding is not perfect
+            local_work_size_ptr = NULL;
         }
     }
     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
@@ -4177,7 +4157,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg

     const int ne00_src = src0->ne[0];
     const int ne01_src = src0->ne[1];
-    // ne02_src, ne03_src are not passed to bilinear kernel directly

     const int ne10_dst = dst->ne[0];
     const int ne11_dst = dst->ne[1];
@@ -4263,8 +4242,6 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

     if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
         GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
-        // Fallback or error handling would be needed here in a real scenario,
-        // for now, it will likely lead to an assertion or error later if not handled.
         return;
     }

@@ -4281,21 +4258,16 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

     if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
         if (dim == 3) {
-            // Handle dim 3 for contiguous with two clEnqueueCopyBuffer calls
-            // (or clEnqueueCopyBufferRect if strides were complex but still block-copyable)
-            // This matches the CUDA logic of using cudaMemcpyAsync for dim 3.
+
             size_t nbytes_src0 = ggml_nbytes(src0);
             size_t nbytes_src1 = ggml_nbytes(src1);

-            // Copy src0
             CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
                                          off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
-            // Copy src1
             CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
                                          off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
         } else {
-            // Use specialized contiguous kernel: kernel_concat_f32_contiguous
-            // This kernel is designed to be called in a loop for the 4th dimension (i3)
+
             cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
             size_t global_work_size[3];

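
The dim == 3 fast path leans on ggml's memory layout: in a contiguous tensor, i3 is the slowest-varying index, so concatenating along it is a plain byte append of src1 after src0. Worked numbers, with illustrative shapes and F32 elements:

    // src0: ne = {4, 3, 2, 2} -> ggml_nbytes(src0) =  48 * 4 = 192 bytes
    // src1: ne = {4, 3, 2, 5} -> ggml_nbytes(src1) = 120 * 4 = 480 bytes
    // dst : ne = {4, 3, 2, 7}
    //
    // dst bytes [off_dst,       off_dst + 192) <- all of src0
    // dst bytes [off_dst + 192, off_dst + 672) <- all of src1
    //
    // Hence the two clEnqueueCopyBuffer calls above; concatenation along any
    // other dim interleaves the sources and goes through the concat kernels.
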
@@ -4304,8 +4276,6 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
                 cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
                 cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);

-                // Kernel expects 3D slice dimensions.
-                // src0->ne[0..2], src1->ne[0..2], dst->ne[0..2]
                 int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
                 int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
                 int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
@@ -4335,15 +4305,11 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
             }
         }
     } else {
-        // Use generic non-contiguous kernel: kernel_concat_f32_non_contiguous
         cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;

         long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
         cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];

-        // src1 dimensions (ne10-ne13) are not explicitly passed to this OpenCL kernel,
-        // as the logic inside the kernel derives necessary src1 indices based on dst indices and src0 dimensions.
-        // Strides for src1 are important.
         cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];

         long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
@@ -4381,16 +4347,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
         CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
         CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));

-        // Global work size is based on dst's dimensions ne[1], ne[2], ne[3].
-        // Local work size for the 0th dimension is handled inside the kernel loop.
-        // A common local size for the first dimension.
         size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
                                          d_ne2 > 0 ? (size_t)d_ne2 : 1,
                                          d_ne3 > 0 ? (size_t)d_ne3 : 1 };

-        // Using NULL for local_work_size lets the OpenCL runtime decide,
-        // or effectively makes the i0 loop in the kernel serial per work-item.
-        // This change is paired with a kernel modification to iterate i0 serially.
         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
     }
 }
@@ -4419,7 +4379,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor

     const int logical_dim = dst->op_params[0];
     const int max_period = dst->op_params[1];
-    const int dst_nb1_bytes = dst->nb[1]; // This is already in bytes
+    const int dst_nb1_bytes = dst->nb[1];

     cl_kernel kernel = backend_ctx->kernel_timestep_embedding;

@@ -4431,33 +4391,21 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));

-    // Global work size
-    // Dimension 0 (x) is for feature_idx_j
-    // Max j needed is (logical_dim+1)/2. So global size is (logical_dim+1)/2 + 1 to make j range up to (logical_dim+1)/2.
     size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
-    // Dimension 1 (y) is for timestep_idx
+
     size_t gws1 = (size_t)src0->ne[0];

     size_t global_work_size[] = {gws0, gws1, 1};

-    // Local work size can be NULL to let the runtime decide, or a small fixed size.
-    // For simplicity and correctness first, use NULL.
-    // size_t local_work_size[] = {16, 1, 1}; // Example, can be tuned
-    // Adjust gws0 to be a multiple of lws0 if lws0 is not NULL and non-uniform WS is not robustly supported.
-    // if (local_work_size[0] > 0 && !backend_ctx->non_uniform_workgroups) {
-    //     global_work_size[0] = ((global_work_size[0] + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
-    // }
-
-
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem

     g_profiling_info.emplace_back();
     size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
     size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
-    if (false /* replace with actual LWS if used */) {
-        // profiling_lws[0] = local_work_size[0]; profiling_lws[1] = local_work_size[1]; profiling_lws[2] = local_work_size[2];
+    if (false) {
+
     }
     populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
 #else
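
The reasoning behind the launch size: each work-item j along dimension 0 produces one cos/sin pair of the embedding, so j has to run up to (logical_dim + 1) / 2 inclusive, while dimension 1 indexes the timesteps in src0. A worked sketch of the arithmetic with illustrative values:

    int logical_dim = 5;                                  // embedding size requested by the op
    size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);  // (5 + 1) / 2 + 1 = 4, so j = 0..3
    size_t gws1 = 7;                                      // e.g. 7 timesteps in src0->ne[0]
    size_t global_work_size[] = {gws0, gws1, 1};
    // A NULL local size (as above) leaves the workgroup shape to the runtime,
    // which is why the profiling path records an LWS of {0,0,0}.
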
@@ -5942,10 +5890,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
         if (!any_on_device) {
             return false;
         }
-        // ggml_cl_timestep_embedding takes (backend, src0, dst)
-        // Need a small wrapper or direct call logic here if func signature is strict
         ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
-        return true; // Handled directly
+        return true;
     case GGML_OP_RMS_NORM:
         if (!any_on_device) {
             return false;
         }