We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 65a098f commit 975ef38 — Copy full SHA for 975ef38
ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2131,6 +2131,11 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
2131
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
2132
dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
2133
2134
+ // fusion is not universally faster on Pascal
2135
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2136
+ if (cc <= GGML_CUDA_CC_PASCAL) {
2137
+ return false;
2138
+ }
2139
//we only support fusion for ncols_dst = 1
2140
if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
2141
return false;
0 commit comments