@@ -1195,7 +1195,8 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
 
-    if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if ((GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2) &&
+        src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
         if (src1->type != GGML_TYPE_BF16) {
             const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1940,6 +1941,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
     bool any_gpus_with_slow_fp16 = false;
     bool any_gpus_without_fp16_mma = false;
+    bool any_gpus_without_batched_cublas = false;
 
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1954,12 +1956,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
             any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+            any_gpus_without_batched_cublas = any_gpus_without_batched_cublas || !(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+        any_gpus_without_batched_cublas = any_gpus_without_batched_cublas || !(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
     }
 
     // debug helpers
@@ -1978,7 +1982,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+    } else if (!split && !any_gpus_without_batched_cublas && src0->type == GGML_TYPE_F16 &&
+               (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
         !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
@@ -3016,12 +3021,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                     return false;
                 }
-// #ifdef GGML_USE_MUSA
-//                 if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
-//                     !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
-//                     return false;
-//                 }
-// #endif // GGML_USE_MUSA
+#ifdef GGML_USE_MUSA
+                if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
+                    !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                    return false;
+                }
+#endif // GGML_USE_MUSA
                 switch (a->type) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16: