Commit 487d509

try fix oldpc cuda broken without flash attn since upstream pr14361 between 1.94 and 1.95 (+1 squashed commits)

Squashed commits:
[940f0c6] try fix oldpc cuda broken without flash attn since upstream pr14361 between 1.94 and 1.95
1 parent 4c1faf6 commit 487d509

File tree

2 files changed: +8 -1 lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 7 additions & 0 deletions

@@ -2061,6 +2061,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
     bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
 
+    if(ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING)
+    {
+        //kcpp: https://github.com/ggml-org/llama.cpp/pull/14361 broke oldpc mode without this.
+        use_batched_cublas_bf16 = false;
+        use_batched_cublas_f32 = false;
+    }
+
     if (!split && use_mul_mat_vec_f) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
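
The guard keys off what the binary was compiled for, not just the physical GPU: an "oldpc" build whose newest compiled target is Turing-era reports ggml_cuda_highest_compiled_arch(cc) at or below GGML_CUDA_CC_TURING even on a newer device, so both batched cuBLAS flags get forced off there. Below is a minimal standalone sketch of that gating shape, assuming the helper returns the newest compiled target the device can run; CC_TURING, highest_compiled_arch(), and the example build lists are hypothetical stand-ins, not ggml's actual constants or implementation.

// Minimal sketch of the committed gate. Everything here is a hypothetical
// stand-in: CC_TURING, highest_compiled_arch(), and the example build lists
// are illustrations, not ggml's real GGML_CUDA_CC_* constants or the real
// ggml_cuda_highest_compiled_arch() implementation.
#include <cstdio>
#include <vector>

constexpr int CC_TURING = 750; // assumption: Turing ~ compute capability 7.5

// Stand-in: return the newest architecture compiled into the binary that
// the device (compute capability cc) can actually run.
static int highest_compiled_arch(const std::vector<int> & compiled, int cc) {
    int best = 0;
    for (int arch : compiled) {
        if (arch <= cc && arch > best) {
            best = arch;
        }
    }
    return best;
}

int main() {
    const std::vector<int> oldpc_build = { 610, 700, 750 };           // no target newer than Turing
    const std::vector<int> full_build  = { 610, 700, 750, 800, 890 }; // includes Ampere+ targets
    const int device_cc = 860;                                        // e.g. an Ampere-class GPU

    for (const auto & build : { oldpc_build, full_build }) {
        bool use_batched_cublas = true; // pretend the earlier type checks passed

        // Mirror of the committed guard: if the best code this binary can
        // run on the device is Turing-level or older, force the batched
        // cuBLAS paths off (the commit notes upstream PR 14361 broke
        // "oldpc" mode without this).
        if (highest_compiled_arch(build, device_cc) <= CC_TURING) {
            use_batched_cublas = false;
        }
        printf("build up to sm_%d: batched cuBLAS %s\n",
               build.back() / 10, use_batched_cublas ? "kept" : "disabled");
    }
    return 0;
}

On the same sm_86 device, the oldpc-style build trips the guard (its best runnable target is sm_75) while the full build does not (sm_80), which matches why the committed check depends on the compiled targets rather than on the device generation alone.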

koboldcpp.py

Lines changed: 1 addition & 1 deletion

@@ -63,7 +63,7 @@
 extra_images_max = 4
 
 # global vars
-KcppVersion = "1.97.3"
+KcppVersion = "1.97.4"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}
