 #include <ac/llama/ControlVector.hpp>
 #include <ac/llama/LogitComparer.hpp>
 #include <ac/llama/ResourceCache.hpp>
+#include <ac/llama/ChatFormat.hpp>

 #include <ac/local/Service.hpp>
 #include <ac/local/ServiceFactory.hpp>
 #include "aclp-llama-version.h"
 #include "aclp-llama-interface.hpp"

-// TODO: remove this include
-#include <iostream>
-
 namespace ac::local {

 namespace {
@@ -53,13 +51,12 @@ class ChatSession {
     const llama::Vocab& m_vocab;
     llama::Instance& m_instance;
     IoEndpoint& m_io;
-    std::string m_userPrefix;
-    std::string m_assistantPrefix;

-    std::vector<llama::Token> m_promptTokens;
+    std::string m_userPrefix;
+    std::unique_ptr<llama::ChatFormat> m_chatFormat;
+    std::vector<llama::ChatMsg> m_chatMessages;
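+    // number of entries in m_chatMessages that have already been submitted to the session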
+    size_t m_submittedMessages = 0;

-    bool m_addUserPrefix = true;
-    bool m_addAssistantPrefix = true;
 public:
     using Schema = sc::StateChatInstance;

@@ -69,50 +66,57 @@ class ChatSession {
         , m_instance(instance)
         , m_io(io)
     {
-        m_promptTokens = instance.model().vocab().tokenize(params.setup.value(), true, true);
-        m_session.setInitialPrompt(m_promptTokens);
+        auto& chatTemplate = params.chatTemplate.value();
+        auto modelChatParams = llama::ChatFormat::getChatParams(instance.model());
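+        // use the explicitly provided chat template if there is one; otherwise fall back to the model's built-in template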
+        if (chatTemplate.empty()) {
+            if (modelChatParams.chatTemplate.empty()) {
+                throw_ex{} << "The model does not have a default chat template; please provide one.";
+            }
+
+            m_chatFormat = std::make_unique<llama::ChatFormat>(modelChatParams.chatTemplate);
+        } else {
+            modelChatParams.chatTemplate = chatTemplate;
+            m_chatFormat = std::make_unique<llama::ChatFormat>(std::move(modelChatParams));
+        }
+
+        auto promptTokens = instance.model().vocab().tokenize(params.setup.value(), true, true);
+        m_session.setInitialPrompt(promptTokens);

         m_userPrefix = "\n";
         m_userPrefix += params.roleUser;
         m_userPrefix += ":";
-        m_assistantPrefix = "\n";
-        m_assistantPrefix += params.roleAssistant;
-        m_assistantPrefix += ":";
     }

     ~ChatSession() {
         m_instance.stopSession();
     }

-    xec::coro<void> sendMessages(Schema::OpSendMessages::Params& params) {
+    xec::coro<void> addMessages(Schema::OpAddChatMessages::Params& params) {
         auto& messages = params.messages.value();
-        for (size_t i = 0; i < messages.size(); ++i) {
-            std::cout << messages[i].role.value() << ": " << messages[i].content.value() << "\n";
+        std::vector<llama::Token> tokens;
+
+        for (const auto& message : messages) {
+            m_chatMessages.push_back(llama::ChatMsg{
+                .role = message.role.value(),
+                .text = message.content.value()
+            });
         }

-        co_await m_io.push(Frame_from(schema::SimpleOpReturn<Schema::OpSendMessages>{}, {}));
+        co_await m_io.push(Frame_from(schema::SimpleOpReturn<Schema::OpAddChatMessages>{}, {}));
     }

-    xec::coro<void> pushPrompt(Schema::OpAddChatPrompt::Params& params) {
-        auto& prompt = params.prompt.value();
-
-        // prefix with space as the generated content doesn't include it
-        prompt = ' ' + prompt;
-
-        if (m_addUserPrefix) {
-            // we haven't had an interaction yet, so we need to add the user prefix
-            // subsequent interaction will have it generated
-            prompt = m_userPrefix + prompt;
+    void submitPendingMessages() {
+        auto messagesToSubmit = m_chatMessages.size() - m_submittedMessages;
+        std::string formatted;
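+        // a single pending message is formatted against the previous messages as history;
+        // multiple pending messages are formatted together as a chat fragment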
+        if (messagesToSubmit == 1) {
+            formatted = m_chatFormat->formatMsg(
+                m_chatMessages.back(), {m_chatMessages.begin(), m_chatMessages.end() - 1}, true);
+        } else {
+            formatted = m_chatFormat->formatChat(
+                {m_chatMessages.begin() + m_submittedMessages, m_chatMessages.end()}, true);
         }

-        // prepare for the next generation
-        prompt += m_assistantPrefix;
-
-        m_promptTokens = m_vocab.tokenize(prompt, false, false);
-        m_session.pushPrompt(m_promptTokens);
-        m_addAssistantPrefix = false;
-
-        co_await m_io.push(Frame_from(schema::SimpleOpReturn<Schema::OpAddChatPrompt>{}, {}));
+        m_session.pushPrompt(m_vocab.tokenize(formatted, true, true));
     }

     xec::coro<void> getResponse(Schema::ChatResponseParams params, bool isStreaming) {
@@ -122,18 +126,14 @@ class ChatSession {
             maxTokens = 1000;
         }

-        if (m_addAssistantPrefix) {
-            // generated responses are requested first, but we haven't yet fed the assistant prefix to the model
-            auto prompt = m_assistantPrefix;
-            assert(m_promptTokens.empty()); // nothing should be pending here
-            m_promptTokens = m_vocab.tokenize(prompt, false, false);
-            m_session.pushPrompt(m_promptTokens);
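+        // submit any chat messages that haven't been fed to the model yet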
+        if (m_submittedMessages != m_chatMessages.size()) {
+            submitPendingMessages();
+            m_submittedMessages = m_chatMessages.size();
         }

         ac::llama::AntipromptManager antiprompt;
         antiprompt.addAntiprompt(m_userPrefix);

-        m_addUserPrefix = true;
         Schema::OpGetChatResponse::Return ret;
         auto& result = ret.response.materialize();

@@ -149,14 +149,10 @@ class ChatSession {

             auto matchedAntiPrompt = antiprompt.feedGeneratedText(tokenStr);
             if (!matchedAntiPrompt.empty()) {
-                // user prefix was added by generation, so don't add it again
-                m_addUserPrefix = false;
-
                 // and also hide it from the return value
                 // note that we assume that m_userPrefix is always the final piece of text in the response
                 // TODO: update to better match the cutoff when issue #131 is done
                 result.erase(result.size() - matchedAntiPrompt.size());
-                m_addUserPrefix = false;
                 break;
             }

@@ -447,14 +443,12 @@ struct LocalLlama {
         Frame err;

         try {
-            if (auto iparams = Frame_optTo(schema::OpParams<Schema::OpAddChatPrompt>{}, *f)) {
-                co_await chatSession.pushPrompt(*iparams);
-            } else if (auto iparams = Frame_optTo(schema::OpParams<Schema::OpGetChatResponse>{}, *f)) {
+            if (auto iparams = Frame_optTo(schema::OpParams<Schema::OpGetChatResponse>{}, *f)) {
                 co_await chatSession.getResponse(*iparams, false);
             } else if (auto iparams = Frame_optTo(schema::OpParams<Schema::OpStreamChatResponse>{}, *f)) {
                 co_await chatSession.getResponse(*iparams, true);
-            } else if (auto iparams = Frame_optTo(schema::OpParams<Schema::OpSendMessages>{}, *f)) {
-                co_await chatSession.sendMessages(*iparams);
+            } else if (auto iparams = Frame_optTo(schema::OpParams<Schema::OpAddChatMessages>{}, *f)) {
+                co_await chatSession.addMessages(*iparams);
             } else {
                 err = unknownOpError(*f);
             }
@@ -478,7 +472,6 @@ struct LocalLlama {
         lparams.vocabOnly = lmParams.vocabOnly.valueOr(false);
         lparams.prefixInputsWithBos = lmParams.prefixInputsWithBos.valueOr(false);

-
         auto model = m_resourceCache.getModel({.gguf = gguf, .params = lparams});

         std::vector<llama::ResourceCache::LoraLock> loras;