@@ -231,19 +231,31 @@ struct ggml_backend_opencl_context {
231231 cl_program program_gemv_noshuffle_general;
232232 cl_program program_gemv_noshuffle;
233233 cl_program program_get_rows;
234- cl_program program_im2col;
234+ cl_program program_im2col_f16;
235+ cl_program program_im2col_f32;
235236 cl_program program_mul_mat_Ab_Bi_8x4;
236- cl_program program_mul_mv_q4_0;
237+ cl_program program_mul_mv_q4_0_f32;
238+ cl_program program_mul_mv_q4_0_f32_v;
239+ cl_program program_mul_mv_q4_0_f32_8x_flat;
240+ cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
241+ cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
237242 cl_program program_mul_mv_q6_K;
238- cl_program program_mul_mv;
243+ cl_program program_mul_mv_f16_f16;
244+ cl_program program_mul_mv_f16_f32_1row;
245+ cl_program program_mul_mv_f16_f32_l4;
246+ cl_program program_mul_mv_f16_f32;
247+ cl_program program_mul_mv_f32_f32;
239248 cl_program program_mul;
240249 cl_program program_norm;
241250 cl_program program_relu;
242251 cl_program program_rms_norm;
243252 cl_program program_rope;
244253 cl_program program_scale;
245254 cl_program program_silu;
246- cl_program program_softmax;
255+ cl_program program_softmax_f32;
256+ cl_program program_softmax_f16;
257+ cl_program program_softmax_4_f32;
258+ cl_program program_softmax_4_f16;
247259
248260 cl_kernel kernel_add, kernel_add_row;
249261 cl_kernel kernel_mul, kernel_mul_row;
@@ -268,7 +280,7 @@ struct ggml_backend_opencl_context {
268280 cl_kernel kernel_mul_mat_f16_f32;
269281 cl_kernel kernel_mul_mat_f16_f32_l4;
270282 cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
271- cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat ;
283+ cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
272284 cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
273285 cl_kernel kernel_convert_block_q4_0_noshuffle;
274286 cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
@@ -527,41 +539,115 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
527539 GGML_LOG_CONT (" ." );
528540 }
529541
530- // im2col
542+ // im2col_f32
531543 {
532544#ifdef GGML_OPENCL_EMBED_KERNELS
533545 const std::string kernel_src {
534- #include " im2col .cl.h"
546+ #include " im2col_f32 .cl.h"
535547 };
536548#else
537- const std::string kernel_src = read_file (" im2col .cl" );
549+ const std::string kernel_src = read_file (" im2col_f32 .cl" );
538550#endif
539- backend_ctx->program_im2col =
551+ backend_ctx->program_im2col_f32 =
540552 build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
541553
542- CL_CHECK ((backend_ctx->kernel_im2col_f32 = clCreateKernel (backend_ctx->program_im2col , " kernel_im2col_f32" , &err), err));
543- CL_CHECK ((backend_ctx->kernel_im2col_f16 = clCreateKernel (backend_ctx->program_im2col , " kernel_im2col_f16" , &err), err));
554+ CL_CHECK ((backend_ctx->kernel_im2col_f32 = clCreateKernel (backend_ctx->program_im2col_f32 , " kernel_im2col_f32" , &err), err));
544555 GGML_LOG_CONT (" ." );
545556 }
546557
547- // mul_mv_q4_0
558+ // im2col_f16
548559 {
549560#ifdef GGML_OPENCL_EMBED_KERNELS
550561 const std::string kernel_src {
551- #include " mul_mv_q4_0 .cl.h"
562+ #include " im2col_f16 .cl.h"
552563 };
553564#else
554- const std::string kernel_src = read_file (" mul_mv_q4_0 .cl" );
565+ const std::string kernel_src = read_file (" im2col_f16 .cl" );
555566#endif
556- backend_ctx->program_mul_mv_q4_0 =
567+ backend_ctx->program_im2col_f16 =
557568 build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
558569
559- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32" , &err), err));
560- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_v" , &err), err));
561- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_flat" , &err), err));
562- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_8x_flat" , &err), err));
563- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_1d_8x_flat" , &err), err));
564- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_1d_16x_flat" , &err), err));
570+ CL_CHECK ((backend_ctx->kernel_im2col_f16 = clCreateKernel (backend_ctx->program_im2col_f16 , " kernel_im2col_f16" , &err), err));
571+ GGML_LOG_CONT (" ." );
572+ }
573+
574+ // mul_mv_q4_0_f32
575+ {
576+ #ifdef GGML_OPENCL_EMBED_KERNELS
577+ const std::string kernel_src {
578+ #include " mul_mv_q4_0_f32.cl.h"
579+ };
580+ #else
581+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32.cl" );
582+ #endif
583+ backend_ctx->program_mul_mv_q4_0_f32 =
584+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
585+
586+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32 , " kernel_mul_mat_q4_0_f32" , &err), err));
587+ GGML_LOG_CONT (" ." );
588+ }
589+
590+ // mul_mv_q4_0_f32_v
591+ {
592+ #ifdef GGML_OPENCL_EMBED_KERNELS
593+ const std::string kernel_src {
594+ #include " mul_mv_q4_0_f32_v.cl.h"
595+ };
596+ #else
597+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_v.cl" );
598+ #endif
599+ backend_ctx->program_mul_mv_q4_0_f32_v =
600+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
601+
602+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_v , " kernel_mul_mat_q4_0_f32_v" , &err), err));
603+ GGML_LOG_CONT (" ." );
604+ }
605+
606+ // mul_mv_q4_0_f32_8x_flat
607+ {
608+ #ifdef GGML_OPENCL_EMBED_KERNELS
609+ const std::string kernel_src {
610+ #include " mul_mv_q4_0_f32_8x_flat.cl.h"
611+ };
612+ #else
613+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_8x_flat.cl" );
614+ #endif
615+ backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
616+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
617+
618+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_8x_flat , " kernel_mul_mat_q4_0_f32_8x_flat" , &err), err));
619+ GGML_LOG_CONT (" ." );
620+ }
621+
622+ // mul_mv_q4_0_f32_1d_8x_flat
623+ {
624+ #ifdef GGML_OPENCL_EMBED_KERNELS
625+ const std::string kernel_src {
626+ #include " mul_mv_q4_0_f32_1d_8x_flat.cl.h"
627+ };
628+ #else
629+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_1d_8x_flat.cl" );
630+ #endif
631+ backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
632+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
633+
634+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat , " kernel_mul_mat_q4_0_f32_1d_8x_flat" , &err), err));
635+ GGML_LOG_CONT (" ." );
636+ }
637+
638+ // mul_mv_q4_0_f32_1d_16x_flat
639+ {
640+ #ifdef GGML_OPENCL_EMBED_KERNELS
641+ const std::string kernel_src {
642+ #include " mul_mv_q4_0_f32_1d_16x_flat.cl.h"
643+ };
644+ #else
645+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_1d_16x_flat.cl" );
646+ #endif
647+ backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
648+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
649+
650+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat , " kernel_mul_mat_q4_0_f32_1d_16x_flat" , &err), err));
565651 GGML_LOG_CONT (" ." );
566652 }
567653
@@ -581,23 +667,83 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
581667 GGML_LOG_CONT (" ." );
582668 }
583669
584- // mul_mv
670+ // mul_mv_f16_f16
671+ {
672+ #ifdef GGML_OPENCL_EMBED_KERNELS
673+ const std::string kernel_src {
674+ #include " mul_mv_f16_f16.cl.h"
675+ };
676+ #else
677+ const std::string kernel_src = read_file (" mul_mv_f16_f16.cl" );
678+ #endif
679+ backend_ctx->program_mul_mv_f16_f16 =
680+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
681+
682+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel (backend_ctx->program_mul_mv_f16_f16 , " kernel_mul_mat_f16_f16" , &err), err));
683+ GGML_LOG_CONT (" ." );
684+ }
685+
686+ // mul_mv_f16_f32_1row
585687 {
586688#ifdef GGML_OPENCL_EMBED_KERNELS
587689 const std::string kernel_src {
588- #include " mul_mv .cl.h"
690+ #include " mul_mv_f16_f32_1row .cl.h"
589691 };
590692#else
591- const std::string kernel_src = read_file (" mul_mv .cl" );
693+ const std::string kernel_src = read_file (" mul_mv_f16_f32_1row .cl" );
592694#endif
593- backend_ctx->program_mul_mv =
695+ backend_ctx->program_mul_mv_f16_f32_1row =
594696 build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
595697
596- CL_CHECK ((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f32_f32" , &err), err));
597- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f16" , &err), err));
598- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f32_1row" , &err), err));
599- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f32" , &err), err));
600- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f32_l4" , &err), err));
698+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel (backend_ctx->program_mul_mv_f16_f32_1row , " kernel_mul_mat_f16_f32_1row" , &err), err));
699+ GGML_LOG_CONT (" ." );
700+ }
701+
702+ // mul_mv_f16_f32_l4
703+ {
704+ #ifdef GGML_OPENCL_EMBED_KERNELS
705+ const std::string kernel_src {
706+ #include " mul_mv_f16_f32_l4.cl.h"
707+ };
708+ #else
709+ const std::string kernel_src = read_file (" mul_mv_f16_f32_l4.cl" );
710+ #endif
711+ backend_ctx->program_mul_mv_f16_f32_l4 =
712+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
713+
714+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel (backend_ctx->program_mul_mv_f16_f32_l4 , " kernel_mul_mat_f16_f32_l4" , &err), err));
715+ GGML_LOG_CONT (" ." );
716+ }
717+
718+ // mul_mv_f16_f32
719+ {
720+ #ifdef GGML_OPENCL_EMBED_KERNELS
721+ const std::string kernel_src {
722+ #include " mul_mv_f16_f32.cl.h"
723+ };
724+ #else
725+ const std::string kernel_src = read_file (" mul_mv_f16_f32.cl" );
726+ #endif
727+ backend_ctx->program_mul_mv_f16_f32 =
728+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
729+
730+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel (backend_ctx->program_mul_mv_f16_f32 , " kernel_mul_mat_f16_f32" , &err), err));
731+ GGML_LOG_CONT (" ." );
732+ }
733+
734+ // mul_mv_f32_f32
735+ {
736+ #ifdef GGML_OPENCL_EMBED_KERNELS
737+ const std::string kernel_src {
738+ #include " mul_mv_f32_f32.cl.h"
739+ };
740+ #else
741+ const std::string kernel_src = read_file (" mul_mv_f32_f32.cl" );
742+ #endif
743+ backend_ctx->program_mul_mv_f32_f32 =
744+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
745+
746+ CL_CHECK ((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel (backend_ctx->program_mul_mv_f32_f32 , " kernel_mul_mat_f32_f32" , &err), err));
601747 GGML_LOG_CONT (" ." );
602748 }
603749
@@ -722,22 +868,67 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
722868 GGML_LOG_CONT (" ." );
723869 }
724870
725- // softmax
871+ // softmax_f32
872+ {
873+ #ifdef GGML_OPENCL_EMBED_KERNELS
874+ const std::string kernel_src {
875+ #include " softmax_f32.cl.h"
876+ };
877+ #else
878+ const std::string kernel_src = read_file (" softmax_f32.cl" );
879+ #endif
880+ backend_ctx->program_softmax_f32 =
881+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
882+
883+ CL_CHECK ((backend_ctx->kernel_soft_max = clCreateKernel (backend_ctx->program_softmax_f32 , " kernel_soft_max" , &err), err));
884+ GGML_LOG_CONT (" ." );
885+ }
886+
887+ // softmax_f16
888+ {
889+ #ifdef GGML_OPENCL_EMBED_KERNELS
890+ const std::string kernel_src {
891+ #include " softmax_f16.cl.h"
892+ };
893+ #else
894+ const std::string kernel_src = read_file (" softmax_f16.cl" );
895+ #endif
896+ backend_ctx->program_softmax_f16 =
897+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
898+
899+ CL_CHECK ((backend_ctx->kernel_soft_max_f16 = clCreateKernel (backend_ctx->program_softmax_f16 , " kernel_soft_max_f16" , &err), err));
900+ GGML_LOG_CONT (" ." );
901+ }
902+
903+ // softmax_4_f32
904+ {
905+ #ifdef GGML_OPENCL_EMBED_KERNELS
906+ const std::string kernel_src {
907+ #include " softmax_4_f32.cl.h"
908+ };
909+ #else
910+ const std::string kernel_src = read_file (" softmax_4_f32.cl" );
911+ #endif
912+ backend_ctx->program_softmax_4_f32 =
913+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
914+
915+ CL_CHECK ((backend_ctx->kernel_soft_max_4 = clCreateKernel (backend_ctx->program_softmax_4_f32 , " kernel_soft_max_4" , &err), err));
916+ GGML_LOG_CONT (" ." );
917+ }
918+
919+ // softmax_4_f16
726920 {
727921#ifdef GGML_OPENCL_EMBED_KERNELS
728922 const std::string kernel_src {
729- #include " softmax .cl.h"
923+ #include " softmax_4_f16 .cl.h"
730924 };
731925#else
732- const std::string kernel_src = read_file (" softmax .cl" );
926+ const std::string kernel_src = read_file (" softmax_4_f16 .cl" );
733927#endif
734- backend_ctx->program_softmax =
928+ backend_ctx->program_softmax_4_f16 =
735929 build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
736930
737- CL_CHECK ((backend_ctx->kernel_soft_max = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max" , &err), err));
738- CL_CHECK ((backend_ctx->kernel_soft_max_4 = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max_4" , &err), err));
739- CL_CHECK ((backend_ctx->kernel_soft_max_f16 = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max_f16" , &err), err));
740- CL_CHECK ((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max_4_f16" , &err), err));
931+ CL_CHECK ((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel (backend_ctx->program_softmax_4_f16 , " kernel_soft_max_4_f16" , &err), err));
741932 GGML_LOG_CONT (" ." );
742933 }
743934
0 commit comments