
Commit fa4c4e7

Use bf16 kv cache when it's faster
1 parent 98eff09 commit fa4c4e7

4 files changed: +13 −6 lines

llama.cpp/common.cpp

Lines changed: 3 additions & 0 deletions

@@ -2159,6 +2159,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "f32") {
         return GGML_TYPE_F32;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
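For context, kv_cache_type_from_str() maps the user-supplied cache_type_k / cache_type_v strings from gpt_params to a ggml_type, and this hunk teaches it to accept "bf16". Below is a minimal standalone sketch of that parsing pattern, not the project's code: CacheType is a hypothetical stand-in for ggml_type so the example compiles on its own, and the error message is illustrative.

// Standalone sketch of the string -> cache-type mapping shown above.
#include <stdexcept>
#include <string>

enum class CacheType { F32, BF16, F16, Q8_0 };

static CacheType cache_type_from_str(const std::string &s) {
    if (s == "f32")  return CacheType::F32;
    if (s == "bf16") return CacheType::BF16;  // accepted after this change
    if (s == "f16")  return CacheType::F16;
    if (s == "q8_0") return CacheType::Q8_0;
    throw std::runtime_error("unsupported KV cache type: " + s);
}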

llama.cpp/common.h

Lines changed: 3 additions & 2 deletions

@@ -22,6 +22,7 @@
 #include <thread>
 #include <unordered_map>
 #include <tuple>
+#include <cosmo.h>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -192,8 +193,8 @@ struct gpt_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
 
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    std::string cache_type_k = X86_HAVE(AVX512_BF16) ? "bf16" : "f16"; // KV cache data type for the K [jart]
+    std::string cache_type_v = X86_HAVE(AVX512_BF16) ? "bf16" : "f16"; // KV cache data type for the V [jart]
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
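The new defaults lean on cosmo.h's X86_HAVE() macro to probe the CPU at runtime, so machines without AVX512_BF16 keep the plain f16 cache. As a hedged sketch only, an equivalent probe without Cosmopolitan could use GCC/Clang's <cpuid.h>; AVX512_BF16 is reported in CPUID leaf 7, subleaf 1, EAX bit 5, and both function names below are illustrative.

// Illustrative runtime probe roughly equivalent to X86_HAVE(AVX512_BF16).
#include <cpuid.h>
#include <string>

static bool cpu_has_avx512_bf16() {
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
        return false;                 // leaf 7 / subleaf 1 not available
    return (eax >> 5) & 1;            // EAX bit 5 = AVX512_BF16
}

static std::string default_kv_cache_type() {
    return cpu_has_avx512_bf16() ? "bf16" : "f16";
}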

llama.cpp/llama.cpp

Lines changed: 4 additions & 1 deletion

@@ -16766,7 +16766,10 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
-    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+    // [jart] allow bf16
+    if (params.type_v != GGML_TYPE_F16 &&
+        params.type_v != GGML_TYPE_BF16 &&
+        !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
     }
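In other words, only a genuinely quantized V cache still requires flash attention; f16 and, after this hunk, bf16 pass the check unchanged. A small restatement of the relaxed guard, factored into a helper whose name is mine rather than the project's, with ggml_type assumed from ggml.h:

// Sketch of the guard above: non-f16/bf16 (i.e. quantized) V caches
// still need flash attention.
#include "ggml.h"

static bool v_cache_requires_flash_attn(ggml_type type_v) {
    return type_v != GGML_TYPE_F16 && type_v != GGML_TYPE_BF16;
}

// Usage mirroring the diff:
//   if (v_cache_requires_flash_attn(params.type_v) && !params.flash_attn) {
//       LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
//       return nullptr;
//   }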

llamafile/tinyblas_cpu_sgemm.inc

Lines changed: 3 additions & 3 deletions

@@ -73,7 +73,7 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const
 
     case GGML_TYPE_BF16: {
 #if defined(__AVX512BF16__)
-        if (Btype == GGML_TYPE_F32 && n < 2) {
+        if (Btype == GGML_TYPE_F32 && n <= 2) {
             tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                 k, (const ggml_bf16_t *)A, lda, (const float *)B, ldb, C, ldc, ith, nth};
             tb.matmul(m, n);
@@ -120,7 +120,7 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const
 
     case GGML_TYPE_F16: {
 #if defined(__AVX512F__)
-        if (Btype == GGML_TYPE_F32 && n < 2) {
+        if (Btype == GGML_TYPE_F32 && n <= 2) {
             tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                 k, (const ggml_fp16_t *)A, lda, (const float *)B, ldb, C, ldc, ith, nth};
             tb.matmul(m, n);
@@ -136,7 +136,7 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const
         return true;
 #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
     if (X86_CHECK(F16C)) {
-        if (Btype == GGML_TYPE_F32 && n < 2) {
+        if (Btype == GGML_TYPE_F32 && n <= 2) {
             tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                 k, (const ggml_fp16_t *)A, lda, (const float *)B, ldb, C, ldc, ith, nth};
             tb.matmul(m, n);
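These three hunks widen tinyBLAS's mixed-precision path (bf16/f16 A with an f32 B operand) from n < 2, i.e. only the matrix-vector case, to n <= 2, so two-column f32 B operands also take these kernels instead of a fallback; this is the skinny shape the bf16 KV cache tends to hit during decoding. A hedged restatement of the changed dispatch condition, where use_skinny_mixed_path() is an illustrative name and the type constants are assumed from ggml.h:

// Illustrative restatement of the condition changed above; not project code.
#include "ggml.h"

static bool use_skinny_mixed_path(long n, ggml_type Atype, ggml_type Btype) {
    const bool mixed_precision =
        (Atype == GGML_TYPE_BF16 || Atype == GGML_TYPE_F16) &&
        Btype == GGML_TYPE_F32;
    return mixed_precision && n <= 2;  // previously n < 2
}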
