2 files changed: +14 −2 lines changed
@@ -112,6 +112,10 @@ Model::initialize_context(const ModelConfig& model_config)
   llama_context_params ctx_params = llama_context_default_params();
   ctx_params.n_ctx = model_config.n_ctx;
   ctx_params.n_batch = model_config.n_batch;
+  ctx_params.n_threads = model_config.n_threads;
+  ctx_params.n_threads_batch = model_config.n_threads_batch;
+  ctx_params.type_k = model_config.cache_type_k;
+  ctx_params.type_v = model_config.cache_type_v;
 
   ctx_ = llama_init_from_model(weights_->get_model(), ctx_params);
   if (ctx_ == nullptr) {
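For readers less familiar with these llama.cpp knobs: `n_threads` controls decode-time threading, `n_threads_batch` controls prompt-processing threading, and `type_k`/`type_v` choose the element type of the KV cache. A minimal stand-alone sketch against a recent llama.cpp C API follows; the model path is a placeholder, and entry-point names have shifted across llama.cpp versions:

```cpp
#include "llama.h"

int main() {
  llama_model_params mparams = llama_model_default_params();
  // "model.gguf" is a placeholder path, not part of this diff.
  llama_model* model = llama_model_load_from_file("model.gguf", mparams);
  if (model == nullptr) { return 1; }

  llama_context_params cparams = llama_context_default_params();
  cparams.n_ctx = 10240;           // matches ModelConfig's default below
  cparams.n_threads = 7;           // decode threads
  cparams.n_threads_batch = 7;     // prompt-processing threads
  cparams.type_k = GGML_TYPE_F16;  // K-cache element type
  cparams.type_v = GGML_TYPE_F16;  // V-cache element type

  llama_context* ctx = llama_init_from_model(model, cparams);
  if (ctx == nullptr) { return 1; }

  llama_free(ctx);
  llama_model_free(model);
  return 0;
}
```

Dropping `type_k`/`type_v` to a quantized type such as `GGML_TYPE_Q8_0` roughly halves KV-cache memory relative to F16, though llama.cpp builds have generally required flash attention to be enabled before a quantized V cache is accepted.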
@@ -2,10 +2,12 @@
 
 #include "chat.h"
 #include "llama.h"
+#include <algorithm>
 #include <functional>
 #include <memory>
 #include <optional>
 #include <string>
+#include <thread>
 
 namespace agent_cpp {
 
@@ -20,11 +22,17 @@ struct ModelConfig
   int top_k = 0;
   float temp = 0.0F;
   uint32_t seed = LLAMA_DEFAULT_SEED;
-  // Chat format for parsing tool calls. When nullopt (default), the format
-  // is auto-detected from the model's chat template.
+  // When nullopt (default), the format is auto-detected from the model's chat
+  // template.
   std::optional<common_chat_format> chat_format = std::nullopt;
   int n_ctx = 10240;
   int n_batch = -1;
+  int n_threads =
+      static_cast<int>(std::max(1u, std::thread::hardware_concurrency() - 1));
+  int n_threads_batch =
+      static_cast<int>(std::max(1u, std::thread::hardware_concurrency() - 1));
+  ggml_type cache_type_k = GGML_TYPE_F16;
+  ggml_type cache_type_v = GGML_TYPE_F16;
 };
 
 // Forward declaration
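One caveat on the new defaults: the C++ standard allows `std::thread::hardware_concurrency()` to return 0 when the value is not computable, and because the subtraction happens in unsigned arithmetic, `0u - 1` wraps to `UINT_MAX` before `std::max` can clamp it. A wrap-safe variant is sketched below; it is not part of this diff, and `default_thread_count` is a hypothetical helper name:

```cpp
#include <algorithm>
#include <thread>

// Clamp to at least 2 before subtracting, so a 0 return from
// hardware_concurrency() can never wrap around; the result is
// always >= 1 and leaves one core free on multi-core machines.
inline int default_thread_count() {
  unsigned hw = std::max(2u, std::thread::hardware_concurrency());
  return static_cast<int>(hw - 1);
}
```

With such a helper, both `n_threads` and `n_threads_batch` could default to `default_thread_count()`.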