2222#include "ggml-cuda/fattn.cuh"
2323#include "ggml-cuda/getrows.cuh"
2424#include "ggml-cuda/im2col.cuh"
25+ #include "ggml-cuda/mmf.cuh"
2526#include "ggml-cuda/mmq.cuh"
26- #include "ggml-cuda/mmv.cuh"
27+ #include "ggml-cuda/mmvf.cuh"
2728#include "ggml-cuda/mmvq.cuh"
2829#include "ggml-cuda/norm.cuh"
2930#include "ggml-cuda/opt-step-adamw.cuh"
@@ -2008,7 +2009,9 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
20082009 const bool bad_padding_clear = ggml_backend_buffer_get_usage (src0->buffer ) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
20092010 && ggml_nbytes (src0) != ggml_backend_buffer_get_alloc_size (src0->buffer , src0) && src0->view_src ;
20102011
2011- bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
2012+ bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
2013+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2014+ bool use_mul_mat_f = !ggml_is_quantized (src0->type )
20122015 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
20132016 bool use_mul_mat_vec_q = ggml_is_quantized (src0->type ) && !bad_padding_clear
20142017 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@@ -2028,14 +2031,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
20282031 }
20292032
20302033 const int cc = ggml_cuda_info ().devices [id].cc ;
2034+ const int warp_size = ggml_cuda_info ().devices [id].warp_size ;
20312035 use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq (src0->type , cc, src1->ne [1 ]);
2032- use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv (src0->type , cc, src0->ne , src1->ne [1 ]);
2036+ use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf (src0->type , cc, warp_size, src0->ne , src1->ne [1 ]);
2037+ use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf (src0->type , cc, src0->ne , src1->ne [1 ]);
20332038 any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available (cc);
20342039 }
20352040 } else {
20362041 const int cc = ggml_cuda_info ().devices [ctx.device ].cc ;
2042+ const int warp_size = ggml_cuda_info ().devices [ctx.device ].warp_size ;
20372043 use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq (src0->type , cc, src1->ne [1 ]);
2038- use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv (src0->type , cc, src0->ne , src1->ne [1 ]);
2044+ use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf (src0->type , cc, warp_size, src0->ne , src1->ne [1 ]);
2045+ use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf (src0->type , cc, src0->ne , src1->ne [1 ]);
20392046 any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available (cc);
20402047 }
20412048
@@ -2048,15 +2055,17 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
20482055 // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
20492056
20502057 // TODO update for generic tensor parallelism
2051- const int cc = ggml_cuda_info ().devices [ggml_cuda_get_device ()].cc ;
2058+ const int cc = ggml_cuda_info ().devices [ggml_cuda_get_device ()].cc ;
20522059 bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
20532060 bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available (cc);
20542061 bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
20552062
2056- if (!split && use_mul_mat_vec ) {
2063+ if (!split && use_mul_mat_vec_f ) {
20572064 // the custom F16 vector kernel can be used over batched cuBLAS GEMM
20582065 // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
2059- ggml_cuda_mul_mat_vec (ctx, src0, src1, nullptr , dst);
2066+ ggml_cuda_mul_mat_vec_f (ctx, src0, src1, nullptr , dst);
2067+ } else if (!split && use_mul_mat_f) {
2068+ ggml_cuda_mul_mat_f (ctx, src0, src1, nullptr , dst);
20602069 } else if (!split && use_mul_mat_vec_q) {
20612070 ggml_cuda_mul_mat_vec_q (ctx, src0, src1, nullptr , dst);
20622071 } else if (!split && use_mul_mat_q) {
@@ -2065,8 +2074,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
20652074 && !ggml_is_transposed (src0) && !ggml_is_transposed (src1) && src1->ne [2 ]*src1->ne [3 ] > 1 ) {
20662075 // general KQ + KQV multi-batch without FlashAttention
20672076 ggml_cuda_mul_mat_batched_cublas (ctx, src0, src1, dst);
2068- } else if (use_mul_mat_vec ) {
2069- ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec , nullptr );
2077+ } else if (use_mul_mat_vec_f ) {
2078+ ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f , nullptr );
20702079 } else if (use_mul_mat_vec_q) {
20712080 ggml_cuda_op_mul_mat (ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
20722081 } else if (use_mul_mat_q) {
@@ -2094,7 +2103,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
20942103 if (ggml_is_quantized (src0->type )) {
20952104 ggml_cuda_mul_mat_vec_q (ctx, src0, src1, ids, dst);
20962105 } else {
2097- ggml_cuda_mul_mat_vec (ctx, src0, src1, ids, dst);
2106+ ggml_cuda_mul_mat_vec_f (ctx, src0, src1, ids, dst);
20982107 }
20992108 return ;
21002109 }
@@ -3516,7 +3525,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
35163525#endif // FLASH_ATTN_AVAILABLE
35173526 if (op->src [1 ]->ne [0 ] != op->src [2 ]->ne [0 ]) {
35183527 const int cc = ggml_cuda_info ().devices [dev_ctx->device ].cc ;
3519- if (!new_mma_available (cc)) {
3528+ if (!turing_mma_available (cc)) {
35203529 return false ;
35213530 }
35223531 const int gqa_ratio = op->src [0 ]->ne [2 ] / op->src [1 ]->ne [2 ];
0 commit comments