
Commit e2c1bff

lhez and shawngu-quic authored
opencl: add initial mxfp4 support via mv (#15270)
* opencl: add reference `mul_mv_mxfp4_f32`
* opencl: add reference `mul_mv_id` for mxfp4
* Q4_0 transpose fix for Adreno

Co-authored-by: shawngu-quic <[email protected]>
1 parent 5edf159 commit e2c1bff

File tree

5 files changed, +496 -6 lines changed

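For context, MXFP4 packs weights into 32-element blocks: one shared 8-bit power-of-two (E8M0) scale plus 32 4-bit FP4 (E2M1) values, 17 bytes per block. The sketch below illustrates that layout and a reference dequantization on the host; the names (BlockMXFP4, decode_e2m1, dequant_block_mxfp4) and the exact nibble packing are illustrative assumptions, not code from this commit, which implements the equivalent logic inside the new OpenCL kernels.

    // Illustrative sketch of the MXFP4 block layout targeted by the new kernels.
    // All names here are hypothetical, not ggml's.
    #include <cmath>
    #include <cstdint>

    constexpr int kQK = 32;                  // weights per block

    struct BlockMXFP4 {
        uint8_t e;                           // shared E8M0 scale: 2^(e - 127)
        uint8_t qs[kQK / 2];                 // 32 x 4-bit E2M1 codes, two per byte
    };

    // The 8 non-negative E2M1 magnitudes; bit 3 of a code is the sign.
    constexpr float kE2M1Values[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

    inline float decode_e2m1(uint8_t code) {
        const float v = kE2M1Values[code & 0x7];
        return (code & 0x8) ? -v : v;
    }

    void dequant_block_mxfp4(const BlockMXFP4 & b, float * dst /* [kQK] */) {
        // Shared power-of-two scale; E8M0 stores an exponent with bias 127.
        const float d = std::ldexp(1.0f, int(b.e) - 127);
        for (int i = 0; i < kQK / 2; ++i) {
            // Assumed split: low nibbles hold elements 0..15, high nibbles 16..31,
            // mirroring ggml's other 4-bit block formats.
            dst[i]           = d * decode_e2m1(b.qs[i] & 0x0F);
            dst[i + kQK / 2] = d * decode_e2m1(b.qs[i] >> 4);
        }
    }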

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -82,7 +82,9 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
+    mul_mv_mxfp4_f32
     mul_mv_id_q4_0_f32_8x_flat
+    mul_mv_id_mxfp4_f32
     mul_mm_f32_f32_l4_lm
     mul_mm_f16_f32_l4_lm
     mul

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 141 additions & 6 deletions
@@ -365,6 +365,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
+    cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
     cl_program program_mul_mv_f16_f32_l4;
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
     cl_program program_conv_2d_f16_f32;
     cl_program program_tsembd;
     cl_program program_mul_mv_id_q4_0_f32_8x_flat;
+    cl_program program_mul_mv_id_mxfp4_f32;
     cl_program program_mul_mm_f32_f32_l4_lm;
     cl_program program_mul_mm_f16_f32_l4_lm;
 
@@ -439,6 +441,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
+    cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
@@ -455,6 +458,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+    cl_kernel kernel_mul_mv_id_mxfp4_f32;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
 
@@ -577,6 +581,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_transpose_32;
    cl_kernel kernel_transpose_32_16;
    cl_kernel kernel_transpose_16;
+   cl_kernel kernel_transpose_16_4x1;
 
    cl_mem A_s_d_max; // max scale buffer size for transpose
    cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -971,6 +976,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_f16_f16
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1611,6 +1632,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_id_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_id_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
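Both new program blocks follow the backend's existing embed-or-read pattern: with GGML_OPENCL_EMBED_KERNELS, a generated `*.cl.h` header expands to a string literal holding the kernel source and is `#include`d inside the brace initializer; otherwise the `.cl` file is read from disk at runtime. A minimal standalone sketch of the same idea, with hypothetical file names (not code from this commit):

    // Sketch of the embed-or-read kernel loading pattern (illustrative only).
    #include <fstream>
    #include <sstream>
    #include <string>

    // A generated embed header would reduce to a single string literal, e.g.
    //   R"CL( ...entire kernel_example.cl source... )CL"
    // so that including it inside braces initializes a std::string.

    static std::string read_file(const std::string & path) {  // same role as the backend's read_file()
        std::ifstream ifs(path);
        std::ostringstream ss;
        ss << ifs.rdbuf();
        return ss.str();
    }

    std::string load_kernel_source() {
    #ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "kernel_example.cl.h"                     // hypothetical generated header
        };
    #else
        const std::string kernel_src = read_file("kernel_example.cl");  // hypothetical file name
    #endif
        return kernel_src;
    }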
@@ -1628,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -2552,13 +2590,14 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
                 return op->src[1]->type == GGML_TYPE_F32;
-            } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             }
             return false;
         case GGML_OP_MUL_MAT_ID:
-            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+            if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+                op->src[0]->type == GGML_TYPE_MXFP4) {
                 if (op->src[1]->type == GGML_TYPE_F32) {
                     return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
                 }
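The new reference kernel computes a matrix-vector product where src0 is MXFP4 and src1 is F32, with both operands required to be contiguous. A scalar host-side sketch of the same computation, reusing the illustrative BlockMXFP4 helpers from the earlier sketch (not the kernel's actual code):

    // Reference y = W * x with W stored as rows of MXFP4 blocks (illustrative sketch).
    // Assumes kQK, BlockMXFP4 and dequant_block_mxfp4() from the earlier sketch.
    void mul_mv_mxfp4_f32_ref(const BlockMXFP4 * W,   // ne01 rows, each ne00/32 blocks
                              const float * x,        // ne00 inputs
                              float * y,              // ne01 outputs
                              int ne00, int ne01) {
        const int nblk = ne00 / kQK;                  // blocks per row (ne00 % 32 == 0)
        for (int row = 0; row < ne01; ++row) {
            float sum = 0.0f;
            const BlockMXFP4 * wr = W + row * nblk;
            for (int b = 0; b < nblk; ++b) {
                float tmp[kQK];
                dequant_block_mxfp4(wr[b], tmp);      // dequantize one 32-weight block
                for (int i = 0; i < kQK; ++i) {
                    sum += tmp[i] * x[b * kQK + i];
                }
            }
            y[row] = sum;
        }
    }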
@@ -2944,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
         CL_CHECK(err);
 
-        // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
+        bool K_tile_trans = true;
+        if ((K / 32) % 4 != 0) {
+            K_tile_trans = false;
+        }
         size_t d_size_bytes = M * (K / 32) * 2;
         region.origin = 0;
         region.size = d_size_bytes;
@@ -2985,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
 
-        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        if (K_tile_trans) {
+            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32 / 4;
+        } else {
+            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32;
+        }
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K / 32 / 4;
         img_desc_1d.buffer = extra->d;
         d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
@@ -3024,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         int width_s = K / 32 / 4;
 
         kernel = backend_ctx->kernel_transpose_16;
+        if (!K_tile_trans) {
+            kernel = backend_ctx->kernel_transpose_16_4x1;
+            width_s = K / 32;
+        }
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
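This is the Adreno Q4_0 transpose fix from the commit message: the existing kernel_transpose_16 path stores the scales as CL_RGBA half texels, four scales per texel, which only works when K/32 is a multiple of 4; otherwise the new kernel_transpose_16_4x1 path falls back to CL_R texels holding one scale each. For example, a hypothetical K = 4096 gives K/32 = 128 (tiled path), while a hypothetical K = 2880 gives K/32 = 90 (fallback path). A host-side sketch of the selection, with illustrative names:

    // Sketch of the scale-transpose path selection (illustrative, mirrors the diff above).
    #include <cstdio>

    struct TransposeChoice {
        bool tiled;        // true -> CL_RGBA half4 texels, 4 scales each
        int  width_s;      // scale-row width in texels
    };

    TransposeChoice choose_scale_transpose(int K) {
        const int scales_per_row = K / 32;            // one Q4_0 scale per 32 weights
        if (scales_per_row % 4 == 0) {
            return {true,  scales_per_row / 4};       // kernel_transpose_16 path
        }
        return {false, scales_per_row};               // kernel_transpose_16_4x1 path
    }

    int main() {
        const int Ks[] = {4096, 2880};                // hypothetical row lengths
        for (int K : Ks) {
            TransposeChoice c = choose_scale_transpose(K);
            std::printf("K=%d -> %s, width_s=%d\n", K, c.tiled ? "half4 tiles" : "4x1 scalar", c.width_s);
        }
    }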
@@ -6254,11 +6305,47 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
             break;
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0, nullptr));
+            break;
+        }
         default:
             GGML_ASSERT(false && "not implemented");
     }
 
-    if (src0t == GGML_TYPE_Q4_0 ||
+    if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
         src0t == GGML_TYPE_Q4_1 ||
         src0t == GGML_TYPE_Q8_0 ||
         src0t == GGML_TYPE_Q2_K) {
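Here nth0 and nth1 set the work-group shape, ndst is the number of output rows each work-group produces, and argument 18 passes a null pointer with a size, which in OpenCL allocates nth0 floats of kernel local memory (presumably for partial reductions). The launch-size computation is outside this hunk; the sketch below shows how the NDRange would typically be derived from these values, assuming the convention of the backend's other mul_mv paths (an assumption, not code from this commit):

    // Illustrative derivation of the NDRange from nth0/nth1/ndst.
    #include <cstddef>

    struct Launch {
        size_t global[3];
        size_t local[3];
    };

    Launch mul_mv_launch(int ne01, int ne11, int ne12, int ne13,
                         int nth0, int nth1, int ndst) {
        Launch l;
        // Each nth0 x nth1 work-group covers ndst rows of the output.
        l.global[0] = (size_t)((ne01 + ndst - 1) / ndst) * nth0;
        l.global[1] = (size_t)ne11 * nth1;
        l.global[2] = (size_t)ne12 * ne13;
        l.local[0]  = (size_t)nth0;
        l.local[1]  = (size_t)nth1;
        l.local[2]  = 1;
        return l;
    }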
@@ -6307,10 +6394,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
     cl_ulong offset1 = extra1->offset + src1->view_offs;
     cl_ulong offset2 = extra2->offset + src2->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
@@ -6325,7 +6414,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     const int ne03 = src0->ne[3];
 
     const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
     const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
 
     const int ne10 = src1->ne[0];
     const int ne11 = src1->ne[1];
@@ -6334,6 +6425,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
     const cl_ulong nb11 = src1->nb[1];
     const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
 
     const int ne20 = src2->ne[0];
     const int ne21 = src2->ne[1];
@@ -6401,6 +6493,49 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
             break;
         }
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 2;
+                ndst = 2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 2;
+                ndst = 2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, nullptr));
+
+            break;
+        }
         default:
             GGML_ASSERT(false && "not implemented");;
     }
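For GGML_OP_MUL_MAT_ID, src0 stacks the expert weight matrices, src2 carries the expert ids chosen per token, and the kernel gathers the selected expert before running the same MXFP4 dot product. The sketch below shows that indirection on the host, reusing the illustrative mul_mv_mxfp4_f32_ref from the earlier sketch; the buffer layout and argument names are assumptions, and the real ggml indexing of src1/dst differs in detail:

    // Reference MUL_MAT_ID semantics over MXFP4 experts (illustrative sketch).
    // Builds on kQK, BlockMXFP4 and mul_mv_mxfp4_f32_ref() from the earlier sketches.
    #include <cstdint>

    void mul_mat_id_mxfp4_f32_ref(const BlockMXFP4 * experts,  // n_expert stacked [ne01 x ne00] matrices
                                  const float     * x,         // one ne00-long activation per (token, slot)
                                  const int32_t   * ids,       // selected expert per (token, slot)
                                  float           * y,         // one ne01-long output per (token, slot)
                                  int ne00, int ne01,
                                  int n_tokens, int n_used) {  // n_used = experts used per token
        const int blocks_per_expert = ne01 * (ne00 / kQK);
        for (int t = 0; t < n_tokens; ++t) {
            for (int s = 0; s < n_used; ++s) {
                const int e = ids[t * n_used + s];              // gather the expert id
                const BlockMXFP4 * W = experts + e * blocks_per_expert;
                mul_mv_mxfp4_f32_ref(W,
                                     x + (t * n_used + s) * ne00,
                                     y + (t * n_used + s) * ne01,
                                     ne00, ne01);
            }
        }
    }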
