1 file changed: +5 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -1200,7 +1200,8 @@ static void ggml_cuda_op_mul_mat_cublas(
12001200
12011201 const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized (src0->type )) && ggml_is_contiguous (src0) && row_diff == src0->ne [1 ] && dst->op_params [0 ] == GGML_PREC_DEFAULT;
12021202
1203- if (!(GGML_CUDA_CC_IS_MTHREADS (cc) && cc < GGML_CUDA_CC_QY2) &&
1203+ if ((GGML_CUDA_CC_IS_NVIDIA (cc) || GGML_CUDA_CC_IS_AMD (cc) ||
1204+ (GGML_CUDA_CC_IS_MTHREADS (cc) && cc >= GGML_CUDA_CC_QY2)) &&
12041205 src0->type == GGML_TYPE_BF16 && ggml_is_contiguous (src0) && row_diff == src0->ne [1 ]) {
12051206 ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16 (ctx.pool (id));
12061207 if (src1->type != GGML_TYPE_BF16) {
@@ -1229,7 +1230,9 @@ static void ggml_cuda_op_mul_mat_cublas(
12291230
12301231 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda (GGML_TYPE_BF16);
12311232 to_fp32_cuda (dst_bf16.get (), dst_dd_i, row_diff*src1_ncols, stream);
1232- } else if (((GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD (cc)) && use_fp16) {
1233+ } else if (((GGML_CUDA_CC_IS_NVIDIA (cc) && cc >= GGML_CUDA_CC_VOLTA) ||
1234+ (GGML_CUDA_CC_IS_MTHREADS (cc) && cc >= GGML_CUDA_CC_QY2) ||
1235+ GGML_CUDA_CC_IS_AMD (cc)) && use_fp16) {
12331236 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
12341237 ggml_cuda_pool_alloc<half> src0_as_f16 (ctx.pool (id));
12351238 if (src0->type != GGML_TYPE_F16) {
You can’t perform that action at this time.
0 commit comments