@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
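+// GGML data types accepted for the K/V cache (full-precision and quantized).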
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
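+// Map a type name as printed by ggml_type_name() (e.g. "q8_0") back to its ggml_type; throws if the name is not a supported cache type.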
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
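+// Build a comma-separated list of the supported cache type names for --help text.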
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
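+        // comparing addresses identifies the last element, so the trailing ", " is omitted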
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1184,18 +1213,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -2093,35 +2132,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2141,14 +2180,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }