
Commit 344cf20

small refactoring
1 parent 1617cdd commit 344cf20

File tree

4 files changed
+38 -175 lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 19 additions & 73 deletions
@@ -300,7 +300,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_norm;
-    cl_program program_group_norm; // Added for group_norm
+    cl_program program_group_norm;
     cl_program program_repeat;
     cl_program program_pad;
     cl_program program_unary;
@@ -328,13 +328,13 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_tanh_f16_nd;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
-    cl_kernel kernel_group_norm; // Added for group_norm
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_repeat;
     cl_kernel kernel_pad;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32_contiguous; // Added for concat
-    cl_kernel kernel_concat_f32_non_contiguous; // Added for concat
+    cl_kernel kernel_concat_f32_contiguous;
+    cl_kernel kernel_concat_f32_non_contiguous;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_rms_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
@@ -854,21 +854,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     // group_norm
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
-        // Assuming group_norm.cl.h will be created similarly if embedding
-        // For now, direct include from norm.cl.h implies group_norm kernel is in norm.cl
         const std::string kernel_src {
 #include "norm.cl.h"
         };
 #else
-        // Assuming group_norm kernel is now part of norm.cl as per previous step
         const std::string kernel_src = read_file("norm.cl");
 #endif
-        // If group_norm is in a separate file, adjust program creation:
-        // backend_ctx->program_group_norm =
-        //     build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        // CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
-        // Since it's added to norm.cl, reuse program_norm
-        // Since it's added to norm.cl, reuse program_norm
         CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_norm, "kernel_group_norm", &err), err));
         GGML_LOG_CONT(".");
     }
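
Note: the surviving path compiles norm.cl once into program_norm and then instantiates the group_norm kernel from that same program. A minimal standalone sketch of this one-program, many-kernels pattern, assuming a valid context, device, and option string (error-log handling shortened for brevity):

    // Build one cl_program from source, then extract several kernels by name.
    cl_int err;
    const char * src = kernel_src.c_str(); // e.g. the contents of norm.cl
    cl_program program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    clBuildProgram(program, 1, &device, compile_opts, NULL, NULL); // check err / build log in real code

    // Every __kernel function in the source is now available by name:
    cl_kernel k_norm       = clCreateKernel(program, "kernel_norm",       &err);
    cl_kernel k_group_norm = clCreateKernel(program, "kernel_group_norm", &err);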
@@ -951,7 +942,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         backend_ctx->program_upscale =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
         CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
-        // Try to load bilinear kernel from the same program
         if (backend_ctx->program_upscale) {
             cl_int err_bilinear;
             backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
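
Note: the bilinear kernel is looked up in the already-built upscale program with its own error slot, so a missing kernel can degrade gracefully instead of failing initialization. A reduced sketch of that optional-kernel pattern (names illustrative, not the backend's exact code):

    // Optional kernel: absence disables the bilinear path rather than aborting.
    cl_int err_bilinear;
    cl_kernel k = clCreateKernel(program_upscale, "kernel_upscale_bilinear", &err_bilinear);
    if (err_bilinear != CL_SUCCESS) {
        k = nullptr; // dispatch code checks for nullptr before using bilinear upscale
    }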
@@ -974,13 +964,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     // concat
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
-        // Assuming concat.cl.h will be created if embedding concat kernels
-        // For now, assuming concat.cl is a separate file or its content is available
         const std::string kernel_src {
-#include "concat.cl.h" // Placeholder if you create this embedded header
+#include "concat.cl.h"
         };
 #else
-        // Assuming concat kernels are in concat.cl
+
         const std::string kernel_src = read_file("concat.cl");
 #endif
         if (!kernel_src.empty()) {
@@ -1002,20 +990,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
-#include "tsembd.cl.h" // Assuming tsembd.cl.h if embedding
+#include "tsembd.cl.h"
         };
 #else
-        // Assuming tsembd kernel is in tsembd.cl (or norm.cl if you added it there)
-        const std::string kernel_src = read_file("tsembd.cl"); // Or "norm.cl"
+
+        const std::string kernel_src = read_file("tsembd.cl");
 #endif
         if (!kernel_src.empty()) {
-            // Check if program_tsembd should reuse program_norm or be a new one
-            // If tsembd.cl is separate:
             backend_ctx->program_tsembd =
                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
             CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
-            // If kernel_timestep_embedding is in norm.cl, then:
-            // CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_norm, "kernel_timestep_embedding", &err), err));
             GGML_LOG_CONT(".");
         } else {
             GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
@@ -2070,7 +2054,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
         case GGML_OP_PAD:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
-                   op->src[0]->ne[3] == 1 && op->ne[3] == 1; // Only 3D tensors for now
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
         case GGML_OP_GROUP_NORM:
             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         case GGML_OP_UPSCALE:
@@ -3659,7 +3643,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
         kernel = backend_ctx->kernel_tanh_f16_nd;
     } else {
         GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
-        return;
     }
     GGML_ASSERT(kernel != nullptr);

@@ -3700,14 +3683,11 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     global_work_size[1] = (size_t)ne11;
     global_work_size[2] = (size_t)ne12;

-    // Determine appropriate local work size. Max 256 total threads per workgroup.
-    // Try to make it somewhat balanced.
     size_t lws0 = 16, lws1 = 4, lws2 = 1;
     if (ne10 < 16) lws0 = ne10;
     if (ne11 < 4) lws1 = ne11;
-    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1; // Ensure lws2 is at least 1 if ne12 > 0
+    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;

-    // Ensure total local size is not too large
     while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
     while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
     while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
@@ -3720,7 +3700,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
         if (global_work_size[0] % local_work_size[0] != 0 ||
             global_work_size[1] % local_work_size[1] != 0 ||
             global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL; // Let runtime decide if padding is not perfect
+            local_work_size_ptr = NULL;
         }
     }
     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
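
Note: the workgroup-shaping heuristic kept by this commit starts from {16, 4, 1}, clamps each axis to the tensor extent, halves axes until the product fits a 256-work-item budget, and falls back to a NULL local size when the global size is not evenly divisible. As a self-contained sketch (the 256 cap comes from the code above; the function name is illustrative):

    #include <cstddef>

    // Shape a 3D local work size within a 256-work-item budget.
    static void pick_local_size(size_t ne10, size_t ne11, size_t ne12, size_t lws[3]) {
        (void) ne12; // the third axis stays at 1 in this sketch
        lws[0] = ne10 < 16 ? ne10 : 16; // clamp each axis to the tensor extent
        lws[1] = ne11 < 4  ? ne11 : 4;
        lws[2] = 1;
        // Halve axes until the total workgroup size fits the budget.
        while (lws[0] * lws[1] * lws[2] > 256 && lws[0] > 1) lws[0] /= 2;
        while (lws[0] * lws[1] * lws[2] > 256 && lws[1] > 1) lws[1] /= 2;
        while (lws[0] * lws[1] * lws[2] > 256 && lws[2] > 1) lws[2] /= 2;
    }
    // If the global size is not a multiple of the result on every axis,
    // the caller passes NULL and lets the OpenCL runtime choose instead.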
@@ -4177,7 +4157,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg

     const int ne00_src = src0->ne[0];
     const int ne01_src = src0->ne[1];
-    // ne02_src, ne03_src are not passed to bilinear kernel directly

     const int ne10_dst = dst->ne[0];
     const int ne11_dst = dst->ne[1];
@@ -4263,8 +4242,6 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

     if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
         GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
-        // Fallback or error handling would be needed here in a real scenario,
-        // for now, it will likely lead to an assertion or error later if not handled.
         return;
     }

@@ -4281,21 +4258,16 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

     if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
         if (dim == 3) {
-            // Handle dim 3 for contiguous with two clEnqueueCopyBuffer calls
-            // (or clEnqueueCopyBufferRect if strides were complex but still block-copyable)
-            // This matches the CUDA logic of using cudaMemcpyAsync for dim 3.
+
            size_t nbytes_src0 = ggml_nbytes(src0);
            size_t nbytes_src1 = ggml_nbytes(src1);

-            // Copy src0
            CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
                                         off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
-            // Copy src1
            CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
                                         off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
        } else {
-            // Use specialized contiguous kernel: kernel_concat_f32_contiguous
-            // This kernel is designed to be called in a loop for the 4th dimension (i3)
+
            cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
            size_t global_work_size[3];

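Note: for fully contiguous tensors, concatenation along the outermost dimension (dim == 3) needs no kernel at all: dst's memory is src0's bytes followed immediately by src1's bytes, so two linear buffer copies suffice. Reduced to a sketch, with buffer and offset names standing in for the backend's extra*_cl->data_device handles:

    // dim-3 concat of contiguous tensors = two back-to-back buffer copies.
    size_t nbytes_src0 = ggml_nbytes(src0);
    size_t nbytes_src1 = ggml_nbytes(src1);
    clEnqueueCopyBuffer(queue, buf_src0, buf_dst, off_src0, off_dst,
                        nbytes_src0, 0, NULL, NULL);
    clEnqueueCopyBuffer(queue, buf_src1, buf_dst, off_src1, off_dst + nbytes_src0,
                        nbytes_src1, 0, NULL, NULL);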
@@ -4304,8 +4276,6 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
                cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
                cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);

-                // Kernel expects 3D slice dimensions.
-                // src0->ne[0..2], src1->ne[0..2], dst->ne[0..2]
                int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
                int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
                int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
@@ -4335,15 +4305,11 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
            }
        }
    } else {
-        // Use generic non-contiguous kernel: kernel_concat_f32_non_contiguous
        cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;

        long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
        cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];

-        // src1 dimensions (ne10-ne13) are not explicitly passed to this OpenCL kernel,
-        // as the logic inside the kernel derives necessary src1 indices based on dst indices and src0 dimensions.
-        // Strides for src1 are important.
        cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];

        long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
@@ -4381,16 +4347,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));

-        // Global work size is based on dst's dimensions ne[1], ne[2], ne[3].
-        // Local work size for the 0th dimension is handled inside the kernel loop.
-        // A common local size for the first dimension.
        size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
                                         d_ne2 > 0 ? (size_t)d_ne2 : 1,
                                         d_ne3 > 0 ? (size_t)d_ne3 : 1 };

-        // Using NULL for local_work_size lets the OpenCL runtime decide,
-        // or effectively makes the i0 loop in the kernel serial per work-item.
-        // This change is paired with a kernel modification to iterate i0 serially.
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
    }
 }
@@ -4419,7 +4379,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor

    const int logical_dim = dst->op_params[0];
    const int max_period = dst->op_params[1];
-    const int dst_nb1_bytes = dst->nb[1]; // This is already in bytes
+    const int dst_nb1_bytes = dst->nb[1];

    cl_kernel kernel = backend_ctx->kernel_timestep_embedding;

@@ -4431,33 +4391,21 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));

-    // Global work size
-    // Dimension 0 (x) is for feature_idx_j
-    // Max j needed is (logical_dim+1)/2. So global size is (logical_dim+1)/2 + 1 to make j range up to (logical_dim+1)/2.
    size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
-    // Dimension 1 (y) is for timestep_idx
+
    size_t gws1 = (size_t)src0->ne[0];

    size_t global_work_size[] = {gws0, gws1, 1};

-    // Local work size can be NULL to let the runtime decide, or a small fixed size.
-    // For simplicity and correctness first, use NULL.
-    // size_t local_work_size[] = {16, 1, 1}; // Example, can be tuned
-    // Adjust gws0 to be a multiple of lws0 if lws0 is not NULL and non-uniform WS is not robustly supported.
-    // if (local_work_size[0] > 0 && !backend_ctx->non_uniform_workgroups) {
-    //     global_work_size[0] = ((global_work_size[0] + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
-    // }
-
-
 #ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem

    g_profiling_info.emplace_back();
    size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
    size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
-    if (false /* replace with actual LWS if used */) {
-        // profiling_lws[0] = local_work_size[0]; profiling_lws[1] = local_work_size[1]; profiling_lws[2] = local_work_size[2];
+    if (false) {
+
    }
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
 #else
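
Note: the launch grid here is 2D. Axis 0 covers the per-feature index j, whose maximum useful value is (logical_dim + 1) / 2 (hence the trailing + 1 in gws0), and axis 1 runs one work-item per timestep (src0->ne[0]). A quick standalone check of that integer arithmetic:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // gws0 = ((logical_dim + 1) / 2) + 1, with integer division.
        const int dims[] = {63, 64, 128};
        for (int logical_dim : dims) {
            size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
            std::printf("logical_dim=%d -> gws0=%zu (j runs 0..%zu)\n",
                        logical_dim, gws0, gws0 - 1);
        }
        return 0;
    }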
@@ -5942,10 +5890,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            if (!any_on_device) {
                return false;
            }
-            // ggml_cl_timestep_embedding takes (backend, src0, dst)
-            // Need a small wrapper or direct call logic here if func signature is strict
            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
-            return true; // Handled directly
+            return true;
        case GGML_OP_RMS_NORM:
            if (!any_on_device) {
                return false;
