@@ -231,19 +231,31 @@ struct ggml_backend_opencl_context {
231
231
cl_program program_gemv_noshuffle_general;
232
232
cl_program program_gemv_noshuffle;
233
233
cl_program program_get_rows;
234
- cl_program program_im2col;
234
+ cl_program program_im2col_f16;
235
+ cl_program program_im2col_f32;
235
236
cl_program program_mul_mat_Ab_Bi_8x4;
236
- cl_program program_mul_mv_q4_0;
237
+ cl_program program_mul_mv_q4_0_f32;
238
+ cl_program program_mul_mv_q4_0_f32_v;
239
+ cl_program program_mul_mv_q4_0_f32_8x_flat;
240
+ cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
241
+ cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
237
242
cl_program program_mul_mv_q6_K;
238
- cl_program program_mul_mv;
243
+ cl_program program_mul_mv_f16_f16;
244
+ cl_program program_mul_mv_f16_f32_1row;
245
+ cl_program program_mul_mv_f16_f32_l4;
246
+ cl_program program_mul_mv_f16_f32;
247
+ cl_program program_mul_mv_f32_f32;
239
248
cl_program program_mul;
240
249
cl_program program_norm;
241
250
cl_program program_relu;
242
251
cl_program program_rms_norm;
243
252
cl_program program_rope;
244
253
cl_program program_scale;
245
254
cl_program program_silu;
246
- cl_program program_softmax;
255
+ cl_program program_softmax_f32;
256
+ cl_program program_softmax_f16;
257
+ cl_program program_softmax_4_f32;
258
+ cl_program program_softmax_4_f16;
247
259
248
260
cl_kernel kernel_add, kernel_add_row;
249
261
cl_kernel kernel_mul, kernel_mul_row;
@@ -268,7 +280,7 @@ struct ggml_backend_opencl_context {
268
280
cl_kernel kernel_mul_mat_f16_f32;
269
281
cl_kernel kernel_mul_mat_f16_f32_l4;
270
282
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
271
- cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat ;
283
+ cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
272
284
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
273
285
cl_kernel kernel_convert_block_q4_0_noshuffle;
274
286
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
@@ -527,41 +539,115 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
527
539
GGML_LOG_CONT (" ." );
528
540
}
529
541
530
- // im2col
542
+ // im2col_f32
531
543
{
532
544
#ifdef GGML_OPENCL_EMBED_KERNELS
533
545
const std::string kernel_src {
534
- #include " im2col .cl.h"
546
+ #include " im2col_f32 .cl.h"
535
547
};
536
548
#else
537
- const std::string kernel_src = read_file (" im2col .cl" );
549
+ const std::string kernel_src = read_file (" im2col_f32 .cl" );
538
550
#endif
539
- backend_ctx->program_im2col =
551
+ backend_ctx->program_im2col_f32 =
540
552
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
541
553
542
- CL_CHECK ((backend_ctx->kernel_im2col_f32 = clCreateKernel (backend_ctx->program_im2col , " kernel_im2col_f32" , &err), err));
543
- CL_CHECK ((backend_ctx->kernel_im2col_f16 = clCreateKernel (backend_ctx->program_im2col , " kernel_im2col_f16" , &err), err));
554
+ CL_CHECK ((backend_ctx->kernel_im2col_f32 = clCreateKernel (backend_ctx->program_im2col_f32 , " kernel_im2col_f32" , &err), err));
544
555
GGML_LOG_CONT (" ." );
545
556
}
546
557
547
- // mul_mv_q4_0
558
+ // im2col_f16
548
559
{
549
560
#ifdef GGML_OPENCL_EMBED_KERNELS
550
561
const std::string kernel_src {
551
- #include " mul_mv_q4_0 .cl.h"
562
+ #include " im2col_f16 .cl.h"
552
563
};
553
564
#else
554
- const std::string kernel_src = read_file (" mul_mv_q4_0 .cl" );
565
+ const std::string kernel_src = read_file (" im2col_f16 .cl" );
555
566
#endif
556
- backend_ctx->program_mul_mv_q4_0 =
567
+ backend_ctx->program_im2col_f16 =
557
568
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
558
569
559
- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32" , &err), err));
560
- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_v" , &err), err));
561
- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_flat" , &err), err));
562
- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_8x_flat" , &err), err));
563
- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_1d_8x_flat" , &err), err));
564
- CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0 , " kernel_mul_mat_q4_0_f32_1d_16x_flat" , &err), err));
570
+ CL_CHECK ((backend_ctx->kernel_im2col_f16 = clCreateKernel (backend_ctx->program_im2col_f16 , " kernel_im2col_f16" , &err), err));
571
+ GGML_LOG_CONT (" ." );
572
+ }
573
+
574
+ // mul_mv_q4_0_f32
575
+ {
576
+ #ifdef GGML_OPENCL_EMBED_KERNELS
577
+ const std::string kernel_src {
578
+ #include " mul_mv_q4_0_f32.cl.h"
579
+ };
580
+ #else
581
+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32.cl" );
582
+ #endif
583
+ backend_ctx->program_mul_mv_q4_0_f32 =
584
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
585
+
586
+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32 , " kernel_mul_mat_q4_0_f32" , &err), err));
587
+ GGML_LOG_CONT (" ." );
588
+ }
589
+
590
+ // mul_mv_q4_0_f32_v
591
+ {
592
+ #ifdef GGML_OPENCL_EMBED_KERNELS
593
+ const std::string kernel_src {
594
+ #include " mul_mv_q4_0_f32_v.cl.h"
595
+ };
596
+ #else
597
+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_v.cl" );
598
+ #endif
599
+ backend_ctx->program_mul_mv_q4_0_f32_v =
600
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
601
+
602
+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_v , " kernel_mul_mat_q4_0_f32_v" , &err), err));
603
+ GGML_LOG_CONT (" ." );
604
+ }
605
+
606
+ // mul_mv_q4_0_f32_8x_flat
607
+ {
608
+ #ifdef GGML_OPENCL_EMBED_KERNELS
609
+ const std::string kernel_src {
610
+ #include " mul_mv_q4_0_f32_8x_flat.cl.h"
611
+ };
612
+ #else
613
+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_8x_flat.cl" );
614
+ #endif
615
+ backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
616
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
617
+
618
+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_8x_flat , " kernel_mul_mat_q4_0_f32_8x_flat" , &err), err));
619
+ GGML_LOG_CONT (" ." );
620
+ }
621
+
622
+ // mul_mv_q4_0_f32_1d_8x_flat
623
+ {
624
+ #ifdef GGML_OPENCL_EMBED_KERNELS
625
+ const std::string kernel_src {
626
+ #include " mul_mv_q4_0_f32_1d_8x_flat.cl.h"
627
+ };
628
+ #else
629
+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_1d_8x_flat.cl" );
630
+ #endif
631
+ backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
632
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
633
+
634
+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat , " kernel_mul_mat_q4_0_f32_1d_8x_flat" , &err), err));
635
+ GGML_LOG_CONT (" ." );
636
+ }
637
+
638
+ // mul_mv_q4_0_f32_1d_16x_flat
639
+ {
640
+ #ifdef GGML_OPENCL_EMBED_KERNELS
641
+ const std::string kernel_src {
642
+ #include " mul_mv_q4_0_f32_1d_16x_flat.cl.h"
643
+ };
644
+ #else
645
+ const std::string kernel_src = read_file (" mul_mv_q4_0_f32_1d_16x_flat.cl" );
646
+ #endif
647
+ backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
648
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
649
+
650
+ CL_CHECK ((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel (backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat , " kernel_mul_mat_q4_0_f32_1d_16x_flat" , &err), err));
565
651
GGML_LOG_CONT (" ." );
566
652
}
567
653
@@ -581,23 +667,83 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
581
667
GGML_LOG_CONT (" ." );
582
668
}
583
669
584
- // mul_mv
670
+ // mul_mv_f16_f16
671
+ {
672
+ #ifdef GGML_OPENCL_EMBED_KERNELS
673
+ const std::string kernel_src {
674
+ #include " mul_mv_f16_f16.cl.h"
675
+ };
676
+ #else
677
+ const std::string kernel_src = read_file (" mul_mv_f16_f16.cl" );
678
+ #endif
679
+ backend_ctx->program_mul_mv_f16_f16 =
680
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
681
+
682
+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel (backend_ctx->program_mul_mv_f16_f16 , " kernel_mul_mat_f16_f16" , &err), err));
683
+ GGML_LOG_CONT (" ." );
684
+ }
685
+
686
+ // mul_mv_f16_f32_1row
585
687
{
586
688
#ifdef GGML_OPENCL_EMBED_KERNELS
587
689
const std::string kernel_src {
588
- #include " mul_mv .cl.h"
690
+ #include " mul_mv_f16_f32_1row .cl.h"
589
691
};
590
692
#else
591
- const std::string kernel_src = read_file (" mul_mv .cl" );
693
+ const std::string kernel_src = read_file (" mul_mv_f16_f32_1row .cl" );
592
694
#endif
593
- backend_ctx->program_mul_mv =
695
+ backend_ctx->program_mul_mv_f16_f32_1row =
594
696
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
595
697
596
- CL_CHECK ((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f32_f32" , &err), err));
597
- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f16" , &err), err));
598
- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f32_1row" , &err), err));
599
- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f32" , &err), err));
600
- CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel (backend_ctx->program_mul_mv , " kernel_mul_mat_f16_f32_l4" , &err), err));
698
+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel (backend_ctx->program_mul_mv_f16_f32_1row , " kernel_mul_mat_f16_f32_1row" , &err), err));
699
+ GGML_LOG_CONT (" ." );
700
+ }
701
+
702
+ // mul_mv_f16_f32_l4
703
+ {
704
+ #ifdef GGML_OPENCL_EMBED_KERNELS
705
+ const std::string kernel_src {
706
+ #include " mul_mv_f16_f32_l4.cl.h"
707
+ };
708
+ #else
709
+ const std::string kernel_src = read_file (" mul_mv_f16_f32_l4.cl" );
710
+ #endif
711
+ backend_ctx->program_mul_mv_f16_f32_l4 =
712
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
713
+
714
+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel (backend_ctx->program_mul_mv_f16_f32_l4 , " kernel_mul_mat_f16_f32_l4" , &err), err));
715
+ GGML_LOG_CONT (" ." );
716
+ }
717
+
718
+ // mul_mv_f16_f32
719
+ {
720
+ #ifdef GGML_OPENCL_EMBED_KERNELS
721
+ const std::string kernel_src {
722
+ #include " mul_mv_f16_f32.cl.h"
723
+ };
724
+ #else
725
+ const std::string kernel_src = read_file (" mul_mv_f16_f32.cl" );
726
+ #endif
727
+ backend_ctx->program_mul_mv_f16_f32 =
728
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
729
+
730
+ CL_CHECK ((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel (backend_ctx->program_mul_mv_f16_f32 , " kernel_mul_mat_f16_f32" , &err), err));
731
+ GGML_LOG_CONT (" ." );
732
+ }
733
+
734
+ // mul_mv_f32_f32
735
+ {
736
+ #ifdef GGML_OPENCL_EMBED_KERNELS
737
+ const std::string kernel_src {
738
+ #include " mul_mv_f32_f32.cl.h"
739
+ };
740
+ #else
741
+ const std::string kernel_src = read_file (" mul_mv_f32_f32.cl" );
742
+ #endif
743
+ backend_ctx->program_mul_mv_f32_f32 =
744
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
745
+
746
+ CL_CHECK ((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel (backend_ctx->program_mul_mv_f32_f32 , " kernel_mul_mat_f32_f32" , &err), err));
601
747
GGML_LOG_CONT (" ." );
602
748
}
603
749
@@ -722,22 +868,67 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
722
868
GGML_LOG_CONT (" ." );
723
869
}
724
870
725
- // softmax
871
+ // softmax_f32
872
+ {
873
+ #ifdef GGML_OPENCL_EMBED_KERNELS
874
+ const std::string kernel_src {
875
+ #include " softmax_f32.cl.h"
876
+ };
877
+ #else
878
+ const std::string kernel_src = read_file (" softmax_f32.cl" );
879
+ #endif
880
+ backend_ctx->program_softmax_f32 =
881
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
882
+
883
+ CL_CHECK ((backend_ctx->kernel_soft_max = clCreateKernel (backend_ctx->program_softmax_f32 , " kernel_soft_max" , &err), err));
884
+ GGML_LOG_CONT (" ." );
885
+ }
886
+
887
+ // softmax_f16
888
+ {
889
+ #ifdef GGML_OPENCL_EMBED_KERNELS
890
+ const std::string kernel_src {
891
+ #include " softmax_f16.cl.h"
892
+ };
893
+ #else
894
+ const std::string kernel_src = read_file (" softmax_f16.cl" );
895
+ #endif
896
+ backend_ctx->program_softmax_f16 =
897
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
898
+
899
+ CL_CHECK ((backend_ctx->kernel_soft_max_f16 = clCreateKernel (backend_ctx->program_softmax_f16 , " kernel_soft_max_f16" , &err), err));
900
+ GGML_LOG_CONT (" ." );
901
+ }
902
+
903
+ // softmax_4_f32
904
+ {
905
+ #ifdef GGML_OPENCL_EMBED_KERNELS
906
+ const std::string kernel_src {
907
+ #include " softmax_4_f32.cl.h"
908
+ };
909
+ #else
910
+ const std::string kernel_src = read_file (" softmax_4_f32.cl" );
911
+ #endif
912
+ backend_ctx->program_softmax_4_f32 =
913
+ build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
914
+
915
+ CL_CHECK ((backend_ctx->kernel_soft_max_4 = clCreateKernel (backend_ctx->program_softmax_4_f32 , " kernel_soft_max_4" , &err), err));
916
+ GGML_LOG_CONT (" ." );
917
+ }
918
+
919
+ // softmax_4_f16
726
920
{
727
921
#ifdef GGML_OPENCL_EMBED_KERNELS
728
922
const std::string kernel_src {
729
- #include " softmax .cl.h"
923
+ #include " softmax_4_f16 .cl.h"
730
924
};
731
925
#else
732
- const std::string kernel_src = read_file (" softmax .cl" );
926
+ const std::string kernel_src = read_file (" softmax_4_f16 .cl" );
733
927
#endif
734
- backend_ctx->program_softmax =
928
+ backend_ctx->program_softmax_4_f16 =
735
929
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
736
930
737
- CL_CHECK ((backend_ctx->kernel_soft_max = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max" , &err), err));
738
- CL_CHECK ((backend_ctx->kernel_soft_max_4 = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max_4" , &err), err));
739
- CL_CHECK ((backend_ctx->kernel_soft_max_f16 = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max_f16" , &err), err));
740
- CL_CHECK ((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel (backend_ctx->program_softmax , " kernel_soft_max_4_f16" , &err), err));
931
+ CL_CHECK ((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel (backend_ctx->program_softmax_4_f16 , " kernel_soft_max_4_f16" , &err), err));
741
932
GGML_LOG_CONT (" ." );
742
933
}
743
934
0 commit comments