@@ -365,6 +365,7 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
    cl_program program_mul_mv_q6_K;
+   cl_program program_mul_mv_mxfp4_f32;
    cl_program program_mul_mv_f16_f16;
    cl_program program_mul_mv_f16_f32_1row;
    cl_program program_mul_mv_f16_f32_l4;
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
    cl_program program_conv_2d_f16_f32;
    cl_program program_tsembd;
    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
+   cl_program program_mul_mv_id_mxfp4_f32;
    cl_program program_mul_mm_f32_f32_l4_lm;
    cl_program program_mul_mm_f16_f32_l4_lm;
@@ -439,6 +441,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_convert_block_q4_0_noshuffle;
    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
    cl_kernel kernel_mul_mv_q6_K_f32;
+   cl_kernel kernel_mul_mv_mxfp4_f32;
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
    cl_kernel kernel_sum_rows_f32;
@@ -455,6 +458,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_conv_2d_f16_f32;
    cl_kernel kernel_timestep_embedding;
    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+   cl_kernel kernel_mul_mv_id_mxfp4_f32;
    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
@@ -577,6 +581,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_transpose_32;
    cl_kernel kernel_transpose_32_16;
    cl_kernel kernel_transpose_16;
+   cl_kernel kernel_transpose_16_4x1;

    cl_mem A_s_d_max; // max scale buffer size for transpose
    cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -971,6 +976,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // mul_mv_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
    // mul_mv_f16_f16
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
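
Both new program objects follow the file's existing embed-or-read convention: with GGML_OPENCL_EMBED_KERNELS defined, the OpenCL C source is baked into the binary through a generated `.cl.h` header, otherwise `read_file()` loads the `.cl` file at startup. A minimal sketch of the idea, assuming the generated header simply wraps the kernel source in a raw string literal (illustrative, not the actual generator output):

    // mul_mv_mxfp4_f32.cl.h (illustrative): the build step emits the kernel
    // source as a single string literal, so the #include above initializes
    // kernel_src directly from it.
    R"(
    __kernel void kernel_mul_mv_mxfp4_f32(/* buffers, offsets, extents ... */) {
        // body of mul_mv_mxfp4_f32.cl
    }
    )"
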
@@ -1611,6 +1632,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // mul_mv_id_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_id_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
    // Adreno kernels
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    // transpose
@@ -1628,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
        CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
        CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
        GGML_LOG_CONT(".");
    }

@@ -2552,13 +2590,14 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                return true;
            } else if (op->src[0]->type == GGML_TYPE_F32) {
                return op->src[1]->type == GGML_TYPE_F32;
-            } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
                       op->src[0]->type == GGML_TYPE_Q6_K) {
                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
            }
            return false;
        case GGML_OP_MUL_MAT_ID:
-            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+            if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+                op->src[0]->type == GGML_TYPE_MXFP4) {
                if (op->src[1]->type == GGML_TYPE_F32) {
                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
                }
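
For orientation, MXFP4 packs 32 weights per block: one shared E8M0 scale byte followed by 16 bytes of 4-bit E2M1 values. The sketch below follows ggml's block definition (shown here as a reference; see ggml-common.h for the authoritative layout). Requiring contiguous tensors, as the check above does, lets the kernels index these fixed 17-byte blocks directly.

    #define QK_MXFP4 32
    typedef struct {
        uint8_t e;                 // shared block scale (E8M0 biased exponent)
        uint8_t qs[QK_MXFP4 / 2];  // 32 x 4-bit E2M1 values, two per byte
    } block_mxfp4;                 // 17 bytes per 32 weights
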
@@ -2944,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
        CL_CHECK(err);

-        // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
+        bool K_tile_trans = true;
+        if ((K / 32) % 4 != 0) {
+            K_tile_trans = false;
+        }
        size_t d_size_bytes = M * (K / 32) * 2;
        region.origin = 0;
        region.size = d_size_bytes;
@@ -2985,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
        CL_CHECK(err);

-        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        if (K_tile_trans) {
+            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32 / 4;
+        } else {
+            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32;
+        }
        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K / 32 / 4;
        img_desc_1d.buffer = extra->d;
        d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
        CL_CHECK(err);
@@ -3024,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        int width_s = K / 32 / 4;

        kernel = backend_ctx->kernel_transpose_16;
+        if (!K_tile_trans) {
+            kernel = backend_ctx->kernel_transpose_16_4x1;
+            width_s = K / 32;
+        }
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
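
The new K_tile_trans flag guards the Adreno scale-transpose path: the existing kernel_transpose_16 reads the scales through a CL_RGBA / CL_HALF_FLOAT image, i.e. four fp16 scales per texel, which only works when the number of scales per row (K / 32) is a multiple of 4. When it is not, the buffer is bound as a CL_R image with one scale per texel and the new kernel_transpose_16_4x1 is used instead. Restating the host-side choice with concrete numbers (illustrative values, not taken from the patch):

    int  scales_per_row = K / 32;                  // one fp16 scale per 32-element block
    bool K_tile_trans   = (scales_per_row % 4 == 0);
    // K = 4096 -> 128 scales -> RGBA image, kernel_transpose_16 (4 scales per texel)
    // K = 2880 ->  90 scales -> CL_R image, kernel_transpose_16_4x1 (1 scale per texel)
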
@@ -6254,11 +6305,47 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
            break;
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r3));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0, nullptr));
+            break;
+        }
        default:
            GGML_ASSERT(false && "not implemented");
    }

-    if (src0t == GGML_TYPE_Q4_0 ||
+    if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
        src0t == GGML_TYPE_Q4_1 ||
        src0t == GGML_TYPE_Q8_0 ||
        src0t == GGML_TYPE_Q2_K) {
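
The MXFP4 case mirrors the neighbouring quantized cases: nth0 x nth1 work-items per work-group, ndst = nth1*2 destination rows produced per group, and argument 18 reserves a local scratch buffer of nth0 floats, presumably for partial-sum reduction inside the kernel. Plugging in the Adreno constants gives a feel for the launch shape (a back-of-the-envelope sketch, not copied from the file):

    // Adreno path: nth0 = 64, nth1 = 2
    //   work-group size       : 64 * 2 = 128 work-items
    //   dst rows per group    : ndst = 2 * 2 = 4
    //   local memory (arg 18) : sizeof(float) * 64 = 256 bytes
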
@@ -6307,10 +6394,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

+    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offset2 = extra2->offset + src2->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;
@@ -6325,7 +6414,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];

    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
@@ -6334,6 +6425,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,

    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];

    const int ne20 = src2->ne[0];
    const int ne21 = src2->ne[1];
@@ -6401,6 +6493,49 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,

            break;
        }
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 2;
+                ndst = 2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 2;
+                ndst = 2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, nullptr));
+
+            break;
+        }
        default:
            GGML_ASSERT(false && "not implemented");;
    }
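
For MUL_MAT_ID, src0 holds the stacked expert weights and src2 the selected expert ids, which is why the patch now threads extra0/offset0 and the full src0 strides (nb01/nb02/nb03) through to the kernel. Roughly, the kernel can then locate one expert row as sketched below (host-style pseudocode; the real indexing lives in mul_mv_id_mxfp4_f32.cl):

    // expert   = ids[token];                  // read via src2 (ne20/ne21, nb21)
    // row_base = src0 + offset0
    //          + expert * nb02                // stride between expert matrices
    //          + i01    * nb01;               // stride between rows of one expert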