refactor: Make prompt_cache part of the Agent API.

daavoo · daavoo · commit 221404909f84 · 2025-12-19T10:42:13.000+01:00
diff --git a/examples/README.md b/examples/README.md
@@ -12,4 +12,3 @@ The [shared](./shared) directory contains reusable helper components used across
 | `chat_loop.h` | Interactive chat loop that reads user input from stdin and prints agent responses. Handles colored output for TTY terminals. |
 | `error_recovery_callback.h` | Callback that converts tool errors into JSON results, allowing the agent to see errors and retry gracefully instead of crashing. |
 | `logging_callback.h` | Callback that logs tool calls and their results to stderr. Useful for debugging and understanding agent behavior. |
-| `prompt_cache.h` | Utilities for building and caching the agent's system prompt tokens. Speeds up startup by reusing cached KV state. |
diff --git a/examples/context-engineering/context-engineering.cpp b/examples/context-engineering/context-engineering.cpp
@@ -8,7 +8,6 @@
 #include "llama.h"
 #include "logging_callback.h"
 #include "model.h"
-#include "prompt_cache.h"
 #include "tool.h"
 #include <algorithm>
 #include <cstdio>
@@ -210,7 +209,7 @@ main(int argc, char** argv)
     agent_cpp::Agent agent(
       std::move(model), std::move(tools), std::move(callbacks), instructions);
 
-    load_or_create_agent_cache(agent, "context-engineering.cache");
+    agent.load_or_create_cache("context-engineering.cache");
 
     printf("\nContext Engineering Demo ready!\n");
     printf("   Try to ask multiple calculations (i.e. 3+4, then 4 * 6) and");
diff --git a/examples/memory/memory.cpp b/examples/memory/memory.cpp
@@ -7,7 +7,6 @@
 #include "llama.h"
 #include "logging_callback.h"
 #include "model.h"
-#include "prompt_cache.h"
 #include "tool.h"
 #include <cstdio>
 #include <cstring>
@@ -326,7 +325,7 @@ main(int argc, char** argv)
 
     agent_cpp::Agent agent(
       std::move(model), std::move(tools), std::move(callbacks), instructions);
-    load_or_create_agent_cache(agent, "memory.cache");
+    agent.load_or_create_cache("memory.cache");
 
     printf("\nMemory Agent ready!\n");
     printf("   Try telling me your name, preferences, or ask to remember "
diff --git a/examples/multi-agent/multi-agent.cpp b/examples/multi-agent/multi-agent.cpp
@@ -8,7 +8,6 @@
 #include "llama.h"
 #include "logging_callback.h"
 #include "model.h"
-#include "prompt_cache.h"
 #include "tool.h"
 
 #include <cstdio>
@@ -190,7 +189,7 @@ class MainAgent
                                                     std::move(callbacks),
                                                     get_instructions());
 
-        load_or_create_agent_cache(*agent_, cache_path);
+        agent_->load_or_create_cache(cache_path);
     }
 
     agent_cpp::Agent& get() { return *agent_; }
diff --git a/examples/shared/prompt_cache.h b/examples/shared/prompt_cache.h
diff --git a/examples/shell/shell.cpp b/examples/shell/shell.cpp
@@ -6,7 +6,6 @@
 #include "error_recovery_callback.h"
 #include "llama.h"
 #include "model.h"
-#include "prompt_cache.h"
 #include "tool.h"
 
 #include <algorithm>
@@ -223,8 +222,10 @@ main(int argc, char** argv)
 
     printf("Loading model...\n");
     std::shared_ptr<agent_cpp::Model> model;
+    auto model_config = agent_cpp::ModelConfig{};
+    model_config.n_ctx = 16384;
     try {
-        model = agent_cpp::Model::create(model_path);
+        model = agent_cpp::Model::create(model_path, model_config);
     } catch (const agent_cpp::ModelError& e) {
         fprintf(stderr, "error: %s\n", e.what());
         return 1;
@@ -246,7 +247,7 @@ main(int argc, char** argv)
     agent_cpp::Agent agent(
       std::move(model), std::move(tools), std::move(callbacks), instructions);
 
-    load_or_create_agent_cache(agent, "shell.cache");
+    agent.load_or_create_cache("shell.cache");
 
     printf("Shell Agent ready!\n");
     printf("   This agent can execute shell commands and scripts.\n");
diff --git a/examples/tracing/tracing.cpp b/examples/tracing/tracing.cpp
@@ -8,7 +8,6 @@
 #include "llama.h"
 #include "logging_callback.h"
 #include "model.h"
-#include "prompt_cache.h"
 #include "tool.h"
 
 #include <opentelemetry/exporters/otlp/otlp_http_exporter_factory.h>
@@ -284,7 +283,7 @@ main(int argc, char** argv)
     agent_cpp::Agent agent(
       std::move(model), std::move(tools), std::move(callbacks), instructions);
 
-    load_or_create_agent_cache(agent, "tracing.cache");
+    agent.load_or_create_cache("tracing.cache");
 
     printf("\nTracing Agent ready! Try asking me to do some calculations.\n");
     printf("   Type an empty line to quit.\n\n");
diff --git a/src/agent.cpp b/src/agent.cpp
@@ -2,6 +2,7 @@
 #include "error.h"
 #include <algorithm>
 #include <cstdio>
+#include <filesystem>
 
 namespace agent_cpp {
 
@@ -144,4 +145,65 @@ Agent::run_loop(std::vector<common_chat_msg>& messages,
     }
 }
 
+// Tokenizes the static prompt prefix: system message + tool definitions.
+std::vector<llama_token>
+Agent::build_prompt_tokens()
+{
+    // Without a model there is no chat template or tokenizer to use.
+    if (!model) {
+        return {};
+    }
+
+    common_chat_templates_inputs template_inputs;
+
+    // The system message is only emitted when instructions were provided.
+    if (!instructions.empty()) {
+        common_chat_msg instructions_msg;
+        instructions_msg.role = "system";
+        instructions_msg.content = instructions;
+        template_inputs.messages.push_back(instructions_msg);
+    }
+
+    template_inputs.tools = get_tool_definitions();
+    template_inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+    template_inputs.add_generation_prompt = false;
+    template_inputs.enable_thinking = false;
+
+    auto rendered =
+      common_chat_templates_apply(model->get_templates(), template_inputs);
+    return model->tokenize(rendered.prompt);
+}
+
+// Loads the prompt cache at cache_path, or builds and saves a new one.
+bool
+Agent::load_or_create_cache(const std::string& cache_path)
+{
+    if (!model) {
+        return false;
+    }
+
+    // Non-throwing exists(): a path that cannot be probed is a cache miss.
+    std::error_code probe_error;
+    if (std::filesystem::exists(cache_path, probe_error)) {
+        auto cached_tokens = model->load_cache(cache_path);
+        if (!cached_tokens.empty()) {
+            printf("Loaded prompt cache from '%s' (%zu tokens)\n",
+                   cache_path.c_str(),
+                   cached_tokens.size());
+            return true;
+        }
+    }
+
+    // No usable cache on disk: rebuild it from the current prompt.
+    auto prompt_tokens = build_prompt_tokens();
+    if (prompt_tokens.empty()) {
+        return true; // nothing to cache
+    }
+    printf("Creating prompt cache at '%s' (%zu tokens)\n",
+           cache_path.c_str(),
+           prompt_tokens.size());
+    model->generate_from_tokens(prompt_tokens); // warms the KV cache
+    return model->save_cache(cache_path);
+}
+
 } // namespace agent_cpp
diff --git a/src/agent.h b/src/agent.h
@@ -48,6 +48,14 @@ class Agent
 
     // Get the model (for cache operations)
     [[nodiscard]] Model* get_model() const { return model.get(); }
+
+    // Load prompt cache from file, or create it if it doesn't exist
+    // Returns true on success, false on failure
+    bool load_or_create_cache(const std::string& cache_path);
+
+  private:
+    // Build the agent's prompt tokens (system message + tool definitions)
+    std::vector<llama_token> build_prompt_tokens();
 };
 
 } // namespace agent_cpp