Skip to content

Commit ace537d

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	CMakeLists.txt
#	examples/simple-chat/simple-chat.cpp
#	src/llama-quant.cpp
#	tools/run/run.cpp
#	tools/server/README.md
2 parents 8ce56bd + c148cf1 commit ace537d

File tree

17 files changed: 554 additions (+554) and 212 deletions (-212).

ggml/src/ggml-cpu/kcpp-quantmapper.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,4 +21,5 @@
2121
#include "arch/s390/quants.c"
2222
#else
2323
#pragma message("KoboldCpp Cannot Compile Quants! Unknown Architecture!")
24+
#error "Compilation halted due to unknown architecture."
2425
#endif

ggml/src/ggml-cpu/kcpp-repackmapper.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,4 +18,5 @@
1818
#pragma message("KoboldCpp Compiling Repack for S390X")
1919
#else
2020
#pragma message("KoboldCpp Cannot Compile Repack! Unknown Architecture!")
21+
#error "Compilation halted due to unknown architecture."
2122
#endif

ggml/src/ggml-cuda/common.cuh

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -266,6 +266,14 @@ static bool fp16_mma_hardware_available(const int cc) {
266266
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
267267
}
268268

269+
static bool bf16_mma_hardware_available(const int cc) {
270+
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
271+
}
272+
273+
static bool fp32_mma_hardware_available(const int cc) {
274+
return GGML_CUDA_CC_IS_CDNA(cc);
275+
}
276+
269277
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
270278
static bool new_mma_available(const int cc) {
271279
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1944,16 +1944,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
19441944
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
19451945

19461946
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
1947-
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1948-
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
1947+
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
19491948
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
19501949
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
19511950
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
19521951
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
19531952
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
19541953

1955-
bool any_gpus_with_slow_fp16 = false;
1956-
bool any_gpus_without_fp16_mma = false;
1954+
bool any_gpus_with_slow_fp16 = false;
19571955

19581956
if (split) {
19591957
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1964,16 +1962,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
19641962
continue;
19651963
}
19661964

1967-
const int cc = ggml_cuda_info().devices[id].cc;
1968-
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1969-
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1970-
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
1965+
const int cc = ggml_cuda_info().devices[id].cc;
1966+
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1967+
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
1968+
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
19711969
}
19721970
} else {
1973-
const int cc = ggml_cuda_info().devices[ctx.device].cc;
1974-
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1975-
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1976-
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
1971+
const int cc = ggml_cuda_info().devices[ctx.device].cc;
1972+
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1973+
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
1974+
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
19771975
}
19781976

19791977
// debug helpers
@@ -1984,7 +1982,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
19841982
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
19851983
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
19861984

1987-
if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
1985+
if (!split && use_mul_mat_vec) {
19881986
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
19891987
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
19901988
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);

0 commit comments

Comments
 (0)