Skip to content

Commit a423f7c

Browse files
committed
opencl: split more kernels into separate files
1 parent 32761b8 commit a423f7c

21 files changed

+2668
-2181
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -63,19 +63,31 @@ set(GGML_OPENCL_KERNELS
6363
gemv_noshuffle_general
6464
gemv_noshuffle
6565
get_rows
66-
im2col
66+
im2col_f32
67+
im2col_f16
6768
mul_mat_Ab_Bi_8x4
68-
mul_mv_q4_0
69+
mul_mv_f16_f16
70+
mul_mv_f16_f32_1row
71+
mul_mv_f16_f32_l4
72+
mul_mv_f16_f32
73+
mul_mv_f32_f32
74+
mul_mv_q4_0_f32
75+
mul_mv_q4_0_f32_v
76+
mul_mv_q4_0_f32_8x_flat
77+
mul_mv_q4_0_f32_1d_8x_flat
78+
mul_mv_q4_0_f32_1d_16x_flat
6979
mul_mv_q6_k
70-
mul_mv
7180
mul
7281
norm
7382
relu
7483
rms_norm
7584
rope
7685
scale
7786
silu
78-
softmax
87+
softmax_4_f32
88+
softmax_4_f16
89+
softmax_f32
90+
softmax_f16
7991
transpose
8092
)
8193

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 229 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -231,19 +231,31 @@ struct ggml_backend_opencl_context {
231231
cl_program program_gemv_noshuffle_general;
232232
cl_program program_gemv_noshuffle;
233233
cl_program program_get_rows;
234-
cl_program program_im2col;
234+
cl_program program_im2col_f16;
235+
cl_program program_im2col_f32;
235236
cl_program program_mul_mat_Ab_Bi_8x4;
236-
cl_program program_mul_mv_q4_0;
237+
cl_program program_mul_mv_q4_0_f32;
238+
cl_program program_mul_mv_q4_0_f32_v;
239+
cl_program program_mul_mv_q4_0_f32_8x_flat;
240+
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
241+
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
237242
cl_program program_mul_mv_q6_K;
238-
cl_program program_mul_mv;
243+
cl_program program_mul_mv_f16_f16;
244+
cl_program program_mul_mv_f16_f32_1row;
245+
cl_program program_mul_mv_f16_f32_l4;
246+
cl_program program_mul_mv_f16_f32;
247+
cl_program program_mul_mv_f32_f32;
239248
cl_program program_mul;
240249
cl_program program_norm;
241250
cl_program program_relu;
242251
cl_program program_rms_norm;
243252
cl_program program_rope;
244253
cl_program program_scale;
245254
cl_program program_silu;
246-
cl_program program_softmax;
255+
cl_program program_softmax_f32;
256+
cl_program program_softmax_f16;
257+
cl_program program_softmax_4_f32;
258+
cl_program program_softmax_4_f16;
247259

248260
cl_kernel kernel_add, kernel_add_row;
249261
cl_kernel kernel_mul, kernel_mul_row;
@@ -268,7 +280,7 @@ struct ggml_backend_opencl_context {
268280
cl_kernel kernel_mul_mat_f16_f32;
269281
cl_kernel kernel_mul_mat_f16_f32_l4;
270282
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
271-
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
283+
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
272284
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
273285
cl_kernel kernel_convert_block_q4_0_noshuffle;
274286
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
@@ -527,41 +539,115 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
527539
GGML_LOG_CONT(".");
528540
}
529541

530-
// im2col
542+
// im2col_f32
531543
{
532544
#ifdef GGML_OPENCL_EMBED_KERNELS
533545
const std::string kernel_src {
534-
#include "im2col.cl.h"
546+
#include "im2col_f32.cl.h"
535547
};
536548
#else
537-
const std::string kernel_src = read_file("im2col.cl");
549+
const std::string kernel_src = read_file("im2col_f32.cl");
538550
#endif
539-
backend_ctx->program_im2col =
551+
backend_ctx->program_im2col_f32 =
540552
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
541553

542-
CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f32", &err), err));
543-
CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f16", &err), err));
554+
CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col_f32, "kernel_im2col_f32", &err), err));
544555
GGML_LOG_CONT(".");
545556
}
546557

547-
// mul_mv_q4_0
558+
// im2col_f16
548559
{
549560
#ifdef GGML_OPENCL_EMBED_KERNELS
550561
const std::string kernel_src {
551-
#include "mul_mv_q4_0.cl.h"
562+
#include "im2col_f16.cl.h"
552563
};
553564
#else
554-
const std::string kernel_src = read_file("mul_mv_q4_0.cl");
565+
const std::string kernel_src = read_file("im2col_f16.cl");
555566
#endif
556-
backend_ctx->program_mul_mv_q4_0 =
567+
backend_ctx->program_im2col_f16 =
557568
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
558569

559-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0, "kernel_mul_mat_q4_0_f32", &err), err));
560-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0, "kernel_mul_mat_q4_0_f32_v", &err), err));
561-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0, "kernel_mul_mat_q4_0_f32_flat", &err), err));
562-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
563-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
564-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
570+
CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col_f16, "kernel_im2col_f16", &err), err));
571+
GGML_LOG_CONT(".");
572+
}
573+
574+
// mul_mv_q4_0_f32
575+
{
576+
#ifdef GGML_OPENCL_EMBED_KERNELS
577+
const std::string kernel_src {
578+
#include "mul_mv_q4_0_f32.cl.h"
579+
};
580+
#else
581+
const std::string kernel_src = read_file("mul_mv_q4_0_f32.cl");
582+
#endif
583+
backend_ctx->program_mul_mv_q4_0_f32 =
584+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
585+
586+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32, "kernel_mul_mat_q4_0_f32", &err), err));
587+
GGML_LOG_CONT(".");
588+
}
589+
590+
// mul_mv_q4_0_f32_v
591+
{
592+
#ifdef GGML_OPENCL_EMBED_KERNELS
593+
const std::string kernel_src {
594+
#include "mul_mv_q4_0_f32_v.cl.h"
595+
};
596+
#else
597+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_v.cl");
598+
#endif
599+
backend_ctx->program_mul_mv_q4_0_f32_v =
600+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
601+
602+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_v, "kernel_mul_mat_q4_0_f32_v", &err), err));
603+
GGML_LOG_CONT(".");
604+
}
605+
606+
// mul_mv_q4_0_f32_8x_flat
607+
{
608+
#ifdef GGML_OPENCL_EMBED_KERNELS
609+
const std::string kernel_src {
610+
#include "mul_mv_q4_0_f32_8x_flat.cl.h"
611+
};
612+
#else
613+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_8x_flat.cl");
614+
#endif
615+
backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
616+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
617+
618+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_8x_flat, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
619+
GGML_LOG_CONT(".");
620+
}
621+
622+
// mul_mv_q4_0_f32_1d_8x_flat
623+
{
624+
#ifdef GGML_OPENCL_EMBED_KERNELS
625+
const std::string kernel_src {
626+
#include "mul_mv_q4_0_f32_1d_8x_flat.cl.h"
627+
};
628+
#else
629+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_8x_flat.cl");
630+
#endif
631+
backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
632+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
633+
634+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
635+
GGML_LOG_CONT(".");
636+
}
637+
638+
// mul_mv_q4_0_f32_1d_16x_flat
639+
{
640+
#ifdef GGML_OPENCL_EMBED_KERNELS
641+
const std::string kernel_src {
642+
#include "mul_mv_q4_0_f32_1d_16x_flat.cl.h"
643+
};
644+
#else
645+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_16x_flat.cl");
646+
#endif
647+
backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
648+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
649+
650+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
565651
GGML_LOG_CONT(".");
566652
}
567653

@@ -581,23 +667,83 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
581667
GGML_LOG_CONT(".");
582668
}
583669

584-
// mul_mv
670+
// mul_mv_f16_f16
671+
{
672+
#ifdef GGML_OPENCL_EMBED_KERNELS
673+
const std::string kernel_src {
674+
#include "mul_mv_f16_f16.cl.h"
675+
};
676+
#else
677+
const std::string kernel_src = read_file("mul_mv_f16_f16.cl");
678+
#endif
679+
backend_ctx->program_mul_mv_f16_f16 =
680+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
681+
682+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv_f16_f16, "kernel_mul_mat_f16_f16", &err), err));
683+
GGML_LOG_CONT(".");
684+
}
685+
686+
// mul_mv_f16_f32_1row
585687
{
586688
#ifdef GGML_OPENCL_EMBED_KERNELS
587689
const std::string kernel_src {
588-
#include "mul_mv.cl.h"
690+
#include "mul_mv_f16_f32_1row.cl.h"
589691
};
590692
#else
591-
const std::string kernel_src = read_file("mul_mv.cl");
693+
const std::string kernel_src = read_file("mul_mv_f16_f32_1row.cl");
592694
#endif
593-
backend_ctx->program_mul_mv =
695+
backend_ctx->program_mul_mv_f16_f32_1row =
594696
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
595697

596-
CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv, "kernel_mul_mat_f32_f32", &err), err));
597-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv, "kernel_mul_mat_f16_f16", &err), err));
598-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv, "kernel_mul_mat_f16_f32_1row", &err), err));
599-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv, "kernel_mul_mat_f16_f32", &err), err));
600-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv, "kernel_mul_mat_f16_f32_l4", &err), err));
698+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_1row, "kernel_mul_mat_f16_f32_1row", &err), err));
699+
GGML_LOG_CONT(".");
700+
}
701+
702+
// mul_mv_f16_f32_l4
703+
{
704+
#ifdef GGML_OPENCL_EMBED_KERNELS
705+
const std::string kernel_src {
706+
#include "mul_mv_f16_f32_l4.cl.h"
707+
};
708+
#else
709+
const std::string kernel_src = read_file("mul_mv_f16_f32_l4.cl");
710+
#endif
711+
backend_ctx->program_mul_mv_f16_f32_l4 =
712+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
713+
714+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
715+
GGML_LOG_CONT(".");
716+
}
717+
718+
// mul_mv_f16_f32
719+
{
720+
#ifdef GGML_OPENCL_EMBED_KERNELS
721+
const std::string kernel_src {
722+
#include "mul_mv_f16_f32.cl.h"
723+
};
724+
#else
725+
const std::string kernel_src = read_file("mul_mv_f16_f32.cl");
726+
#endif
727+
backend_ctx->program_mul_mv_f16_f32 =
728+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
729+
730+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32, "kernel_mul_mat_f16_f32", &err), err));
731+
GGML_LOG_CONT(".");
732+
}
733+
734+
// mul_mv_f32_f32
735+
{
736+
#ifdef GGML_OPENCL_EMBED_KERNELS
737+
const std::string kernel_src {
738+
#include "mul_mv_f32_f32.cl.h"
739+
};
740+
#else
741+
const std::string kernel_src = read_file("mul_mv_f32_f32.cl");
742+
#endif
743+
backend_ctx->program_mul_mv_f32_f32 =
744+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
745+
746+
CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv_f32_f32, "kernel_mul_mat_f32_f32", &err), err));
601747
GGML_LOG_CONT(".");
602748
}
603749

@@ -722,22 +868,67 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
722868
GGML_LOG_CONT(".");
723869
}
724870

725-
// softmax
871+
// softmax_f32
872+
{
873+
#ifdef GGML_OPENCL_EMBED_KERNELS
874+
const std::string kernel_src {
875+
#include "softmax_f32.cl.h"
876+
};
877+
#else
878+
const std::string kernel_src = read_file("softmax_f32.cl");
879+
#endif
880+
backend_ctx->program_softmax_f32 =
881+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
882+
883+
CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax_f32, "kernel_soft_max", &err), err));
884+
GGML_LOG_CONT(".");
885+
}
886+
887+
// softmax_f16
888+
{
889+
#ifdef GGML_OPENCL_EMBED_KERNELS
890+
const std::string kernel_src {
891+
#include "softmax_f16.cl.h"
892+
};
893+
#else
894+
const std::string kernel_src = read_file("softmax_f16.cl");
895+
#endif
896+
backend_ctx->program_softmax_f16 =
897+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
898+
899+
CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax_f16, "kernel_soft_max_f16", &err), err));
900+
GGML_LOG_CONT(".");
901+
}
902+
903+
// softmax_4_f32
904+
{
905+
#ifdef GGML_OPENCL_EMBED_KERNELS
906+
const std::string kernel_src {
907+
#include "softmax_4_f32.cl.h"
908+
};
909+
#else
910+
const std::string kernel_src = read_file("softmax_4_f32.cl");
911+
#endif
912+
backend_ctx->program_softmax_4_f32 =
913+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
914+
915+
CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax_4_f32, "kernel_soft_max_4", &err), err));
916+
GGML_LOG_CONT(".");
917+
}
918+
919+
// softmax_4_f16
726920
{
727921
#ifdef GGML_OPENCL_EMBED_KERNELS
728922
const std::string kernel_src {
729-
#include "softmax.cl.h"
923+
#include "softmax_4_f16.cl.h"
730924
};
731925
#else
732-
const std::string kernel_src = read_file("softmax.cl");
926+
const std::string kernel_src = read_file("softmax_4_f16.cl");
733927
#endif
734-
backend_ctx->program_softmax =
928+
backend_ctx->program_softmax_4_f16 =
735929
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
736930

737-
CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax, "kernel_soft_max", &err), err));
738-
CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax, "kernel_soft_max_4", &err), err));
739-
CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax, "kernel_soft_max_f16", &err), err));
740-
CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax, "kernel_soft_max_4_f16", &err), err));
931+
CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax_4_f16, "kernel_soft_max_4_f16", &err), err));
741932
GGML_LOG_CONT(".");
742933
}
743934

0 commit comments

Comments
 (0)