 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
-#include "ggml-cuda/mmf.cuh"
 #include "ggml-cuda/mmq.cuh"
-#include "ggml-cuda/mmvf.cuh"
+#include "ggml-cuda/mmv.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@@ -2009,9 +2008,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
         && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;

-    bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-    bool use_mul_mat_f = !ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
@@ -2031,18 +2028,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             }

             const int cc            = ggml_cuda_info().devices[id].cc;
-            const int warp_size     = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f           = use_mul_mat_f           && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-            use_mul_mat_vec_f       = use_mul_mat_vec_f       && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc            = ggml_cuda_info().devices[ctx.device].cc;
-        const int warp_size     = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f           = use_mul_mat_f           && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-        use_mul_mat_vec_f       = use_mul_mat_vec_f       && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }

@@ -2055,17 +2048,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

     // TODO update for generic tensor parallelism
-    const int cc            = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     bool use_batched_cublas_f16  = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
     bool use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;

-    if (!split && use_mul_mat_vec_f) {
+    if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec_f(ctx, src0, src1, nullptr, dst);
-    } else if (!split && use_mul_mat_f) {
-        ggml_cuda_mul_mat_f(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
@@ -2074,8 +2065,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-    } else if (use_mul_mat_vec_f) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr);
+    } else if (use_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
@@ -2103,7 +2094,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         if (ggml_is_quantized(src0->type)) {
             ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
         } else {
-            ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
+            ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
         }
         return;
     }
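
For readers following the kernel-selection changes in the hunks above, here is a minimal, self-contained sketch of the dispatch order that ggml_cuda_mul_mat ends up with after this change: the floating-point vector kernel (mmv) for F32/F16/BF16 weights, then the quantized vector kernel (mmvq), then the quantized matrix-matrix kernel (mmq), then batched cuBLAS. The enum, the helper name, and the plain boolean flags are hypothetical stand-ins for ggml's real tensor and device checks, and the multi-GPU split path via ggml_cuda_op_mul_mat is omitted.

// Schematic sketch only; not ggml's real API.
#include <cstdio>

enum class MulMatKernel { MMV, MMVQ, MMQ, BATCHED_CUBLAS, CUBLAS_FALLBACK };

// Mirrors the priority of the if/else chain in ggml_cuda_mul_mat (single-GPU path).
static MulMatKernel choose_mul_mat_kernel(bool use_mul_mat_vec,
                                          bool use_mul_mat_vec_q,
                                          bool use_mul_mat_q,
                                          bool batched_cublas_ok) {
    if (use_mul_mat_vec) {
        // custom F32/F16/BF16 vector kernel; preferred over batched cuBLAS on GPUs
        // without tensor cores or for a thin src0 matrix (e.g. KQV in attention)
        return MulMatKernel::MMV;
    }
    if (use_mul_mat_vec_q) {
        return MulMatKernel::MMVQ;            // quantized weights, vector-shaped src1
    }
    if (use_mul_mat_q) {
        return MulMatKernel::MMQ;             // quantized matrix-matrix kernel
    }
    if (batched_cublas_ok) {
        return MulMatKernel::BATCHED_CUBLAS;  // general KQ + KQV multi-batch
    }
    return MulMatKernel::CUBLAS_FALLBACK;     // generic cuBLAS fallback
}

int main() {
    // Example: F16 weights with a single-column src1 on a GPU without tensor cores.
    const MulMatKernel k = choose_mul_mat_kernel(true, false, false, true);
    std::printf("selected kernel: %d\n", static_cast<int>(k));
    return 0;
}
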
@@ -3525,7 +3516,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 #endif // FLASH_ATTN_AVAILABLE
             if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-                if (!turing_mma_available(cc)) {
+                if (!new_mma_available(cc)) {
                     return false;
                 }
                 const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
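
The supports_op hunk above gates the FlashAttention path with mismatched K/V head sizes (e.g. MLA-style attention) on new_mma_available(cc) before checking the GQA ratio. Below is a rough, standalone sketch of that gate; the Turing compute-capability threshold (7.5, encoded as 750) and the gqa_ratio % 16 constraint are assumptions for illustration, not verbatim copies of ggml's helpers.

// Rough sketch under stated assumptions; not ggml's real implementation.
#include <cstdint>

// Assumed encoding of compute capability as major*100 + minor*10 (7.5 -> 750),
// standing in for new_mma_available(cc).
static bool mma_available_sketch(const int cc) {
    constexpr int CC_TURING = 750;
    return cc >= CC_TURING;
}

// Decide whether FlashAttention with differing K/V head sizes is usable:
// requires mma support and (assumed) a GQA ratio divisible by 16.
static bool supports_mismatched_head_fattn(const int64_t head_dim_k, const int64_t head_dim_v,
                                           const int64_t n_head_q, const int64_t n_head_kv,
                                           const int cc) {
    if (head_dim_k == head_dim_v) {
        return true;  // the equal-head-size path is handled elsewhere
    }
    if (!mma_available_sketch(cc)) {
        return false;
    }
    const int64_t gqa_ratio = n_head_q / n_head_kv;
    return gqa_ratio % 16 == 0;
}

int main() {
    // Example: MLA-style shapes (K head 576, V head 512), 128 query heads over 8 KV heads, CC 8.9.
    return supports_mismatched_head_fattn(576, 512, 128, 8, 890) ? 0 : 1;
}
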