@@ -365,6 +365,7 @@ struct ggml_backend_opencl_context {
365365 cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
366366 cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
367367 cl_program program_mul_mv_q6_K;
368+ cl_program program_mul_mv_mxfp4_f32;
368369 cl_program program_mul_mv_f16_f16;
369370 cl_program program_mul_mv_f16_f32_1row;
370371 cl_program program_mul_mv_f16_f32_l4;
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
398399 cl_program program_conv_2d_f16_f32;
399400 cl_program program_tsembd;
400401 cl_program program_mul_mv_id_q4_0_f32_8x_flat;
402+ cl_program program_mul_mv_id_mxfp4_f32;
401403 cl_program program_mul_mm_f32_f32_l4_lm;
402404 cl_program program_mul_mm_f16_f32_l4_lm;
403405
@@ -439,6 +441,7 @@ struct ggml_backend_opencl_context {
439441 cl_kernel kernel_convert_block_q4_0_noshuffle;
440442 cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
441443 cl_kernel kernel_mul_mv_q6_K_f32;
444+ cl_kernel kernel_mul_mv_mxfp4_f32;
442445 cl_kernel kernel_im2col_f32, kernel_im2col_f16;
443446 cl_kernel kernel_argsort_f32_i32;
444447 cl_kernel kernel_sum_rows_f32;
@@ -455,6 +458,7 @@ struct ggml_backend_opencl_context {
455458 cl_kernel kernel_conv_2d_f16_f32;
456459 cl_kernel kernel_timestep_embedding;
457460 cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
461+ cl_kernel kernel_mul_mv_id_mxfp4_f32;
458462 cl_kernel kernel_mul_mm_f32_f32_l4_lm;
459463 cl_kernel kernel_mul_mm_f16_f32_l4_lm;
460464
@@ -577,6 +581,7 @@ struct ggml_backend_opencl_context {
577581 cl_kernel kernel_transpose_32;
578582 cl_kernel kernel_transpose_32_16;
579583 cl_kernel kernel_transpose_16;
584+ cl_kernel kernel_transpose_16_4x1;
580585
581586 cl_mem A_s_d_max; // max scale buffer size for transpose
582587 cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -971,6 +976,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
971976 GGML_LOG_CONT (" ." );
972977 }
973978
979+ // mul_mv_mxfp4_f32: obtain the kernel source, build its program, then create kernel_mul_mv_mxfp4_f32
980+ {
981+ #ifdef GGML_OPENCL_EMBED_KERNELS
982+ const std::string kernel_src { // embedded build: the source text comes from the included .cl.h header
983+ #include " mul_mv_mxfp4_f32.cl.h"
984+ };
985+ #else
986+ const std::string kernel_src = read_file (" mul_mv_mxfp4_f32.cl" ); // non-embedded build: the source is obtained through read_file()
987+ #endif
988+ backend_ctx->program_mul_mv_mxfp4_f32 = // program handle is stored on the backend context
989+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
990+
991+ CL_CHECK ((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel (backend_ctx->program_mul_mv_mxfp4_f32 , " kernel_mul_mv_mxfp4_f32" , &err), err)); // comma expression: assign the kernel, then hand err to CL_CHECK
992+ GGML_LOG_CONT (" ." );
993+ }
994+
974995 // mul_mv_f16_f16
975996 {
976997#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1611,6 +1632,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
16111632 GGML_LOG_CONT (" ." );
16121633 }
16131634
1635+ // mul_mv_id_mxfp4_f32: obtain the kernel source, build its program, then create kernel_mul_mv_id_mxfp4_f32
1636+ {
1637+ #ifdef GGML_OPENCL_EMBED_KERNELS
1638+ const std::string kernel_src { // embedded build: the source text comes from the included .cl.h header
1639+ #include " mul_mv_id_mxfp4_f32.cl.h"
1640+ };
1641+ #else
1642+ const std::string kernel_src = read_file (" mul_mv_id_mxfp4_f32.cl" ); // non-embedded build: the source is obtained through read_file()
1643+ #endif
1644+ backend_ctx->program_mul_mv_id_mxfp4_f32 = // program handle is stored on the backend context
1645+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1646+
1647+ CL_CHECK ((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel (backend_ctx->program_mul_mv_id_mxfp4_f32 , " kernel_mul_mv_id_mxfp4_f32" , &err), err)); // comma expression: assign the kernel, then hand err to CL_CHECK
1648+ GGML_LOG_CONT (" ." );
1649+ }
1650+
16141651 // Adreno kernels
16151652#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
16161653 // transpose
@@ -1628,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
16281665 CL_CHECK ((backend_ctx->kernel_transpose_32_16 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_32_16" , &err), err));
16291666 CL_CHECK ((backend_ctx->kernel_transpose_32 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_32" , &err), err));
16301667 CL_CHECK ((backend_ctx->kernel_transpose_16 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_16" , &err), err));
1668+ CL_CHECK ((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel (backend_ctx->program_transpose , " kernel_transpose_16_4x1" , &err), err));
16311669 GGML_LOG_CONT (" ." );
16321670 }
16331671
@@ -2552,13 +2590,14 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
25522590 return true ;
25532591 } else if (op->src [0 ]->type == GGML_TYPE_F32) {
25542592 return op->src [1 ]->type == GGML_TYPE_F32;
2555- } else if (op->src [0 ]->type == GGML_TYPE_Q4_0 ||
2593+ } else if (op->src [0 ]->type == GGML_TYPE_Q4_0 || op-> src [ 0 ]-> type == GGML_TYPE_MXFP4 ||
25562594 op->src [0 ]->type == GGML_TYPE_Q6_K) {
25572595 return op->src [1 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
25582596 }
25592597 return false ;
25602598 case GGML_OP_MUL_MAT_ID:
2561- if (op->src [0 ]->type == GGML_TYPE_Q4_0) {
2599+ if (op->src [0 ]->type == GGML_TYPE_Q4_0 ||
2600+ op->src [0 ]->type == GGML_TYPE_MXFP4) {
25622601 if (op->src [1 ]->type == GGML_TYPE_F32) {
25632602 return ggml_is_contiguous (op->src [0 ]) && ggml_is_contiguous (op->src [1 ]);
25642603 }
@@ -2944,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
29442983 // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
29452984 CL_CHECK (err);
29462985
2947- // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
2986+ bool K_tile_trans = true ; // stays true only when K/32 is a multiple of 4; later selects the CL_RGBA image + kernel_transpose_16 path
2987+ if ((K / 32 ) % 4 != 0 ){
2988+ K_tile_trans =false ; // fallback: the code below then uses a CL_R image of width M*K/32 and kernel_transpose_16_4x1
2989+ }
29482990 size_t d_size_bytes = M * (K / 32 ) * 2 ;
29492991 region.origin = 0 ;
29502992 region.size = d_size_bytes;
@@ -2985,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
29853027 qT_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
29863028 CL_CHECK (err);
29873029
2988- img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
29893030 memset (&img_desc_1d, 0 , sizeof (img_desc_1d));
3031+ if (K_tile_trans) {
3032+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
3033+ img_desc_1d.image_width = M * K / 32 / 4 ;
3034+ } else {
3035+ img_fmt_1d = { CL_R, CL_HALF_FLOAT };
3036+ img_desc_1d.image_width = M * K / 32 ;
3037+ }
29903038 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
2991- img_desc_1d.image_width = M * K / 32 / 4 ;
29923039 img_desc_1d.buffer = extra->d ;
29933040 d_d_image1D = clCreateImage (context, 0 , &img_fmt_1d, &img_desc_1d, NULL , &err);
29943041 CL_CHECK (err);
@@ -3024,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
30243071 int width_s = K / 32 / 4 ;
30253072
30263073 kernel = backend_ctx->kernel_transpose_16 ;
3074+ if (!K_tile_trans) {
3075+ kernel = backend_ctx->kernel_transpose_16_4x1 ;
3076+ width_s = K / 32 ;
3077+ }
30273078 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &d_d_image1D));
30283079 CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_mem), &dT_d_image1D));
30293080 CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (int ), &height_s));
@@ -6254,11 +6305,47 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
62546305 CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (int ), &r2));
62556306 CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (int ), &r3));
62566307 break ;
6308+ case GGML_TYPE_MXFP4: { // MXFP4 case: selects kernel_mul_mv_mxfp4_f32 and binds its arguments 0..18
6309+ kernel = backend_ctx->kernel_mul_mv_mxfp4_f32 ;
6310+
6311+ if (backend_ctx->gpu_family == INTEL) { // launch parameters are chosen per GPU family
6312+ nth0 = 16 ;
6313+ nth1 = 2 ;
6314+ ndst = nth1*2 ;
6315+ } else if (backend_ctx->gpu_family == ADRENO) {
6316+ nth0 = 64 ;
6317+ nth1 = 2 ;
6318+ ndst = nth1*2 ;
6319+ } else {
6320+ GGML_ASSERT (false && " TODO: Unknown GPU" ); // only INTEL and ADRENO are handled here
6321+ }
6322+
6323+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device )); // args 0-5: three (data_device, offset) pairs for extra0, extra1, extrad
6324+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6325+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
6326+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6327+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extrad->data_device ));
6328+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
6329+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne00)); // args 6-17: ne*/nb*/r* values passed as int or cl_ulong scalars
6330+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (cl_ulong), &nb01));
6331+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_ulong), &nb02));
6332+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb03));
6333+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (int ), &ne12));
6334+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (cl_ulong), &nb11));
6335+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_ulong), &nb12));
6336+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_ulong), &nb13));
6337+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (int ), &ne0));
6338+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (int ), &ne1));
6339+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (int ), &r2));
6340+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &r3));
6341+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (float )*nth0,nullptr )); // size-only argument with a null value — NOTE(review): presumably a local-memory scratch of nth0 floats; confirm against the kernel signature
6342+ break ;
6343+ }
62576344 default :
62586345 GGML_ASSERT (false && " not implemented" );
62596346 }
62606347
6261- if (src0t == GGML_TYPE_Q4_0 ||
6348+ if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
62626349 src0t == GGML_TYPE_Q4_1 ||
62636350 src0t == GGML_TYPE_Q8_0 ||
62646351 src0t == GGML_TYPE_Q2_K) {
@@ -6307,10 +6394,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
63076394
63086395 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
63096396
6397+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra ;
63106398 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra ;
63116399 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra ;
63126400 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra ;
63136401
6402+ cl_ulong offset0 = extra0->offset + src0->view_offs ;
63146403 cl_ulong offset1 = extra1->offset + src1->view_offs ;
63156404 cl_ulong offset2 = extra2->offset + src2->view_offs ;
63166405 cl_ulong offsetd = extrad->offset + dst->view_offs ;
@@ -6325,7 +6414,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
63256414 const int ne03 = src0->ne [3 ];
63266415
63276416 const cl_ulong nb00 = src0->nb [0 ];
6417+ const cl_ulong nb01 = src0->nb [1 ];
63286418 const cl_ulong nb02 = src0->nb [2 ];
6419+ const cl_ulong nb03 = src0->nb [3 ];
63296420
63306421 const int ne10 = src1->ne [0 ];
63316422 const int ne11 = src1->ne [1 ];
@@ -6334,6 +6425,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
63346425
63356426 const cl_ulong nb11 = src1->nb [1 ];
63366427 const cl_ulong nb12 = src1->nb [2 ];
6428+ const cl_ulong nb13 = src1->nb [3 ];
63376429
63386430 const int ne20 = src2->ne [0 ];
63396431 const int ne21 = src2->ne [1 ];
@@ -6401,6 +6493,49 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
64016493
64026494 break ;
64036495 }
6496+ case GGML_TYPE_MXFP4: { // MXFP4 case: selects kernel_mul_mv_id_mxfp4_f32 and binds its arguments 0..24
6497+ kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32 ;
6498+
6499+ if (backend_ctx->gpu_family == INTEL) { // sgs/nsg/ndst are chosen per GPU family; only sgs differs between the two
6500+ sgs = 16 ;
6501+ nsg = 2 ;
6502+ ndst = 2 ;
6503+ } else if (backend_ctx->gpu_family == ADRENO) {
6504+ sgs = 64 ;
6505+ nsg = 2 ;
6506+ ndst = 2 ;
6507+ } else {
6508+ GGML_ASSERT (false && " TODO: Unknown GPU" ); // only INTEL and ADRENO are handled here
6509+ }
6510+
6511+ CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device )); // args 0-7: (data_device, offset) pairs for src0, src1, src2 and dst
6512+ CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6513+ CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
6514+ CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6515+ CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extra2->data_device ));
6516+ CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offset2));
6517+ CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (cl_mem), &extrad->data_device ));
6518+ CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (cl_ulong), &offsetd));
6519+ CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (int ), &ne00)); // args 8-11: src0 ne[0] and byte strides nb[1..3]
6520+ CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb01));
6521+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (cl_ulong), &nb02));
6522+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (cl_ulong), &nb03));
6523+ CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (int ), &ne11)); // args 12-16: src1 ne[1], ne12 and byte strides nb[1..3]
6524+ CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (int ), &ne12));
6525+ CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb11));
6526+ CL_CHECK (clSetKernelArg (kernel, 15 , sizeof (cl_ulong), &nb12));
6527+ CL_CHECK (clSetKernelArg (kernel, 16 , sizeof (cl_ulong), &nb13));
6528+ CL_CHECK (clSetKernelArg (kernel, 17 , sizeof (int ), &ne20)); // args 17-19: src2 ne[0], ne[1] and nb21
6529+ CL_CHECK (clSetKernelArg (kernel, 18 , sizeof (int ), &ne21));
6530+ CL_CHECK (clSetKernelArg (kernel, 19 , sizeof (cl_ulong), &nb21));
6531+ CL_CHECK (clSetKernelArg (kernel, 20 , sizeof (int ), &ne0));
6532+ CL_CHECK (clSetKernelArg (kernel, 21 , sizeof (int ), &ne1));
6533+ CL_CHECK (clSetKernelArg (kernel, 22 , sizeof (int ), &r2));
6534+ CL_CHECK (clSetKernelArg (kernel, 23 , sizeof (int ), &r3));
6535+ CL_CHECK (clSetKernelArg (kernel, 24 , sizeof (float )*sgs,nullptr )); // size-only argument with a null value — NOTE(review): presumably a local-memory scratch of sgs floats; confirm against the kernel signature
6536+
6537+ break ;
6538+ }
64046539 default :
64056540 GGML_ASSERT (false && " not implemented" );;
64066541 }
0 commit comments