@@ -1772,7 +1772,7 @@ static __global__ void k_compute_batched_ptrs(
     ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
-// Type traits for CUDA types
+// Type traits for mapping ggml types to CUDA/cuBLAS types
 template <ggml_type T>
 struct batched_mul_mat_traits;
 
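The forward declaration above is specialized per source type elsewhere in the patch (the specializations are not part of this excerpt). As a rough sketch of the pattern, here is what an F16 specialization exposing the data_type/get_alpha/get_beta members used further down in the diff might look like; the member values and layout are illustrative assumptions, not taken from the patch:

    #include "ggml.h"        // ggml_type, GGML_TYPE_F16
    #include <cublas_v2.h>   // cudaDataType_t, CUDA_R_16F
    #include <cuda_fp16.h>   // half

    template <ggml_type T>
    struct batched_mul_mat_traits;

    // Hypothetical specialization for GGML_TYPE_F16; the real ones live in ggml-cuda.cu.
    template <>
    struct batched_mul_mat_traits<GGML_TYPE_F16> {
        // cuBLAS element type matching the ggml storage type.
        static constexpr cudaDataType_t data_type = CUDA_R_16F;

        // alpha/beta are returned type-erased so the caller can hand them to
        // cublasGemm*Ex regardless of type; the scalars themselves must have the
        // width the chosen compute type expects (half for an F16 accumulate path).
        static const void * get_alpha() { static const half one  = 1.0f; return &one; }
        static const void * get_beta()  { static const half zero = 0.0f; return &zero; }
    };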
@@ -1826,6 +1826,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     GGML_ASSERT(src0->type == src0_type);
     GGML_ASSERT(ggml_is_contiguous(dst));
 
+    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
+    // As long as dst is contiguous this does not matter though.
+
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
@@ -1877,6 +1880,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     cudaDataType_t cu_data_type_b = traits::data_type;
     const void * alpha = traits::get_alpha();
     const void * beta = traits::get_beta();
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         if constexpr (src0_type == GGML_TYPE_F32) {
@@ -1890,8 +1895,6 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
         dst_t = (char *) dst_ddf;
         cu_compute_type = CUBLAS_COMPUTE_32F;
         cu_data_type = CUDA_R_32F;
-        const float alpha_f32 = 1.0f;
-        const float beta_f32 = 0.0f;
         alpha = &alpha_f32;
         beta = &beta_f32;
     }
@@ -1900,8 +1903,6 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     const int cc = ggml_cuda_info().devices[id].cc;
     if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
-        const float alpha_f32 = 1.0f;
-        const float beta_f32 = 0.0f;
         alpha = &alpha_f32;
         beta = &beta_f32;
     }
@@ -2032,6 +2033,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    bool can_use_batched_cublas_f16  = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
+    bool can_use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
+    bool can_use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;
+
     if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
@@ -2040,8 +2046,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32)
-        && (src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_BF16 || src1->type == GGML_TYPE_F32)
+    } else if (!split && (can_use_batched_cublas_f16 || can_use_batched_cublas_bf16 || can_use_batched_cublas_f32)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
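The branch above dispatches to a path that ultimately issues a strided-batched cuBLAS GEMM with the data/compute types selected earlier. Below is a minimal, self-contained, F32-only sketch of that call shape; the dimensions, buffer setup, and missing error checking are illustrative, and this is not the ggml_cuda_mul_mat_batched_cublas implementation. The detail worth noticing is that the alpha/beta pointers passed to cublasGemmStridedBatchedEx must match the compute type, which is why float scalars accompany the switch to CUBLAS_COMPUTE_32F in the hunks above.

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    int main() {
        const int m = 4, n = 4, k = 4, batch = 2;

        // Host buffers: A and B filled with ones, so every C entry should equal k.
        std::vector<float> ha(m*k*batch, 1.0f), hb(k*n*batch, 1.0f), hc(m*n*batch, 0.0f);

        float *da, *db, *dc;
        cudaMalloc(&da, m*k*batch*sizeof(float));
        cudaMalloc(&db, k*n*batch*sizeof(float));
        cudaMalloc(&dc, m*n*batch*sizeof(float));
        cudaMemcpy(da, ha.data(), m*k*batch*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(db, hb.data(), k*n*batch*sizeof(float), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);

        // With CUBLAS_COMPUTE_32F the scaling factors must be floats.
        const float alpha_f32 = 1.0f;
        const float beta_f32  = 0.0f;

        // One GEMM per batch element, with constant strides between matrices.
        cublasGemmStridedBatchedEx(handle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            m, n, k,
            &alpha_f32,
            da, CUDA_R_32F, m, (long long) m*k,
            db, CUDA_R_32F, k, (long long) k*n,
            &beta_f32,
            dc, CUDA_R_32F, m, (long long) m*n,
            batch,
            CUBLAS_COMPUTE_32F,
            CUBLAS_GEMM_DEFAULT);

        cudaMemcpy(hc.data(), dc, m*n*batch*sizeof(float), cudaMemcpyDeviceToHost);
        std::printf("c[0] = %f (expected %d)\n", hc[0], k);

        cublasDestroy(handle);
        cudaFree(da); cudaFree(db); cudaFree(dc);
        return 0;
    }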