Skip to content

Commit b91a6ac

Browse files
committed
opencl: add mul_mv_id_q4_0_f32_8x_flat
1 parent 71e74a3 commit b91a6ac

File tree

3 files changed

+448
-1
lines changed

3 files changed

+448
-1
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
8080
mul_mv_q4_0_f32_1d_8x_flat
8181
mul_mv_q4_0_f32_1d_16x_flat
8282
mul_mv_q6_k
83+
mul_mv_id_q4_0_f32_8x_flat
8384
mul
8485
norm
8586
relu

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
321321
cl_program program_upscale;
322322
cl_program program_concat;
323323
cl_program program_tsembd;
324+
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
324325

325326
cl_kernel kernel_add, kernel_add_row;
326327
cl_kernel kernel_mul, kernel_mul_row;
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
366367
cl_kernel kernel_concat_f32_contiguous;
367368
cl_kernel kernel_concat_f32_non_contiguous;
368369
cl_kernel kernel_timestep_embedding;
370+
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
369371

370372
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
371373
// Transpose kernels
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
11121114
GGML_LOG_CONT(".");
11131115
}
11141116

1115-
// repeat
1117+
// repeat
11161118
{
11171119
#ifdef GGML_OPENCL_EMBED_KERNELS
11181120
const std::string kernel_src {
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
12561258
}
12571259
}
12581260

1261+
// mul_mv_id_q4_0_f32_8x_flat
1262+
{
1263+
#ifdef GGML_OPENCL_EMBED_KERNELS
1264+
const std::string kernel_src {
1265+
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
1266+
};
1267+
#else
1268+
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
1269+
#endif
1270+
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
1271+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1272+
1273+
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
1274+
GGML_LOG_CONT(".");
1275+
}
1276+
12591277
// Adreno kernels
12601278
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
12611279
// transpose
@@ -2178,6 +2196,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
21782196
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
21792197
}
21802198
return false;
2199+
case GGML_OP_MUL_MAT_ID:
2200+
if (op->src[0]->type == GGML_TYPE_Q4_0) {
2201+
if (op->src[1]->type == GGML_TYPE_F32) {
2202+
GGML_ASSERT(ggml_is_contiguous(op->src[0]));
2203+
GGML_ASSERT(ggml_is_contiguous(op->src[1]));
2204+
return true;
2205+
}
2206+
}
2207+
return false;
21812208
case GGML_OP_RESHAPE:
21822209
case GGML_OP_VIEW:
21832210
case GGML_OP_PERMUTE:
@@ -5536,6 +5563,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
55365563
}
55375564
}
55385565

5566+
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5567+
GGML_ASSERT(src0);
5568+
GGML_ASSERT(src0->extra);
5569+
GGML_ASSERT(src1);
5570+
GGML_ASSERT(src1->extra);
5571+
GGML_ASSERT(dst);
5572+
GGML_ASSERT(dst->extra);
5573+
5574+
const ggml_tensor * src2 = dst->src[2];
5575+
GGML_ASSERT(src2);
5576+
GGML_ASSERT(src2->extra);
5577+
5578+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5579+
cl_command_queue queue = backend_ctx->queue;
5580+
5581+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5582+
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
5583+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5584+
5585+
cl_ulong offset1 = extra1->offset + src1->view_offs;
5586+
cl_ulong offset2 = extra2->offset + src2->view_offs;
5587+
cl_ulong offsetd = extrad->offset + dst->view_offs;
5588+
5589+
#ifdef GGML_OPENCL_SOA_Q
5590+
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
5591+
#endif
5592+
5593+
const int ne00 = src0->ne[0];
5594+
const int ne01 = src0->ne[1];
5595+
const int ne02 = src0->ne[2];
5596+
const int ne03 = src0->ne[3];
5597+
5598+
const cl_ulong nb00 = src0->nb[0];
5599+
const cl_ulong nb02 = src0->nb[2];
5600+
5601+
const int ne10 = src1->ne[0];
5602+
const int ne11 = src1->ne[1];
5603+
const int ne12 = src1->ne[2];
5604+
const int ne13 = src1->ne[3];
5605+
5606+
const cl_ulong nb11 = src1->nb[1];
5607+
const cl_ulong nb12 = src1->nb[2];
5608+
5609+
const int ne20 = src2->ne[0];
5610+
const int ne21 = src2->ne[1];
5611+
5612+
const cl_ulong nb21 = src2->nb[1];
5613+
5614+
const int ne0 = dst->ne[0];
5615+
const int ne1 = dst->ne[1];
5616+
5617+
const int r2 = ne12/ne02;
5618+
const int r3 = ne13/ne03;
5619+
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
5620+
5621+
GGML_ASSERT(ne00 == ne10);
5622+
5623+
int sgs = 32; // subgroup size
5624+
int nsg = 1; // number of subgroups
5625+
int nrows = 1; // number of row in src1
5626+
int ndst = 4; // number of values produced by each subgroup
5627+
5628+
cl_kernel kernel;
5629+
5630+
// subgroup mat vec
5631+
switch (src0->type) {
5632+
case GGML_TYPE_Q4_0: {
5633+
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
5634+
5635+
if (backend_ctx->gpu_family == INTEL) {
5636+
sgs = 16;
5637+
nsg = 1;
5638+
ndst = 8;
5639+
} else if (backend_ctx->gpu_family == ADRENO) {
5640+
sgs = 64;
5641+
nsg = 1;
5642+
ndst = 8;
5643+
} else {
5644+
GGML_ASSERT(false && "TODO: Unknown GPU");
5645+
}
5646+
5647+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
5648+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
5649+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5650+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5651+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
5652+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
5653+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
5654+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
5655+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
5656+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
5657+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
5658+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
5659+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5660+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
5661+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
5662+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
5663+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
5664+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
5665+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
5666+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
5667+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
5668+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
5669+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
5670+
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
5671+
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
5672+
5673+
break;
5674+
}
5675+
default:
5676+
GGML_ASSERT(false && "not implemented");;
5677+
}
5678+
5679+
int _ne1 = 1;
5680+
int ne123 = dst_rows;
5681+
5682+
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5683+
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5684+
5685+
#ifdef GGML_OPENCL_PROFILING
5686+
cl_event evt;
5687+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5688+
5689+
g_profiling_info.emplace_back();
5690+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5691+
#else
5692+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5693+
#endif
5694+
}
5695+
55395696
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
55405697
GGML_ASSERT(src0);
55415698
GGML_ASSERT(src0->extra);
@@ -6444,6 +6601,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
64446601
}
64456602
func = ggml_cl_mul_mat;
64466603
break;
6604+
case GGML_OP_MUL_MAT_ID:
6605+
if (!any_on_device) {
6606+
return false;
6607+
}
6608+
func = ggml_cl_mul_mat_id;
6609+
break;
64476610
case GGML_OP_SCALE:
64486611
if (!any_on_device) {
64496612
return false;

0 commit comments

Comments
 (0)