
Commit cf7bece

Author: ochafik
tool-call: factor chat template away from legacy API
Parent: d7ec84f

File tree

15 files changed: +428 / -396 lines


Makefile

Lines changed: 4 additions & 0 deletions
@@ -934,6 +934,7 @@ OBJ_LLAMA = \
 
 OBJ_COMMON = \
     common/common.o \
+    common/chat-template.o \
     common/arg.o \
     common/log.o \
     common/console.o \
@@ -1170,6 +1171,8 @@ $(LIB_LLAMA_S): \
 common/common.o: \
     common/common.cpp \
     common/common.h \
+    common/chat-template.cpp \
+    common/chat-template.h \
     common/console.h \
     common/sampling.h \
     common/json.hpp \
@@ -1465,6 +1468,7 @@ llama-server: \
     examples/server/prompt-formats.js.hpp \
     examples/server/json-schema-to-grammar.mjs.hpp \
     examples/server/loading.html.hpp \
+    common/chat-template.h \
     common/json.hpp \
     common/stb_image.h \
     $(OBJ_ALL)

common/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
     arg.cpp
     arg.h
     base64.hpp
+    chat-template.cpp
+    chat-template.h
     common.cpp
     common.h
     console.cpp

common/chat-template.cpp

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@ (new file)

#include "chat-template.h"
#include "minja.hpp"
#include "llama.h"

using json = nlohmann::ordered_json;

static std::string _llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
    const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        piece.resize(-n_chars);
        int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        piece.resize(n_chars);
    }

    return piece;
}

static std::string llama_model_meta_val_str(const struct llama_model * model, const char * key) {
    int32_t tlen = llama_model_meta_val_str(model, key, nullptr, 0);
    if (tlen > 0) {
        std::vector<char> curr_tmpl_buf(tlen + 1, 0);
        if (llama_model_meta_val_str(model, key, curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
            return std::string(curr_tmpl_buf.data(), tlen);
        }
    }
    return "";
}

llama_chat_template llama_chat_template::from_model(
    const struct llama_model * model,
    const std::string & chat_template_override)
{
    // TODO: handle "chatml"?
    auto chat_template = chat_template_override.empty()
        ? llama_model_meta_val_str(model, "tokenizer.chat_template")
        : chat_template_override;
    auto bos_token = _llama_token_to_piece(model, llama_token_bos(model), true);
    auto eos_token = _llama_token_to_piece(model, llama_token_eos(model), true);
    return llama_chat_template(chat_template, bos_token, eos_token);
}

std::string llama_chat_template::apply(
    const json & messages,
    const json & tools,
    bool add_generation_prompt) const
{
    auto actual_messages = messages;

    // First, "fix" messages so they have a chance to be rendered correctly by the template

    if (_requires_object_arguments || !_supports_system_role) {
        std::string pending_system;
        auto flush_sys = [&]() {
            if (!pending_system.empty()) {
                actual_messages.push_back({
                    {"role", "user"},
                    {"content", pending_system},
                });
                pending_system.clear();
            }
        };
        for (auto & message : actual_messages) {
            if (!message.contains("role") || !message.contains("content")) {
                throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
            }
            std::string role = message.at("role");
            std::string content = message.at("content");

            if (!_supports_system_role) {
                if (role == "system") {
                    if (!pending_system.empty()) pending_system += "\n";
                    pending_system += content;
                    continue;
                } else {
                    if (role == "user") {
                        if (!pending_system.empty()) {
                            message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
                            pending_system.clear();
                        }
                    } else {
                        flush_sys();
                    }
                }
            }
            if (_requires_object_arguments && message.contains("tool_calls")) {
                for (auto & tool_call : message.at("tool_calls")) {
                    std::string arguments = tool_call.at("arguments");
                    tool_call["arguments"] = json::parse(arguments);
                }
            }
        }
        flush_sys();
    }

    auto context = minja::Context::make(json({
        {"messages", actual_messages},
        {"add_generation_prompt", add_generation_prompt},
        {"bos_token", _bos_token},
        {"eos_token", _eos_token},
    }));

    if (!tools.is_null() && !tools.empty()) {
        auto tools_val = minja::Value(tools);
        context->set("tools", tools_val);
    }

    auto tmpl_root = minja::Parser::parse(_chat_template, {
        /* .trim_blocks = */ true,
        /* .lstrip_blocks = */ true,
        /* .keep_trailing_newline = */ false,
    });
    return tmpl_root->render(context);
}
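
Illustrative usage sketch (not part of the commit): it constructs a llama_chat_template directly from a toy ChatML-style Jinja string instead of from_model(), then renders a short conversation. The template text, tokens, and main() here are hypothetical; real templates come from the model's tokenizer.chat_template metadata, and the snippet assumes it is linked against the common library (minja.hpp, json.hpp).

#include "chat-template.h"

#include <cstdio>

int main() {
    // Toy ChatML-style template (hypothetical), small enough for minja's for/if subset;
    // real models ship their own tokenizer.chat_template.
    std::string tmpl =
        "{% for message in messages %}"
        "<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}";

    llama_chat_template chat_template(tmpl, /* bos_token= */ "<s>", /* eos_token= */ "</s>");

    json messages = json::array({
        {{"role", "system"}, {"content", "You are helpful."}},
        {{"role", "user"},   {"content", "Hello!"}},
    });

    // No tool definitions: pass a null json, as llama_chat_verify_template does below.
    std::string prompt = chat_template.apply(messages, json(), /* add_generation_prompt= */ true);
    printf("%s", prompt.c_str());
    return 0;
}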

common/chat-template.h

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@ (new file)

#pragma once

#include <json.hpp>
#include <string>
#include <vector>

using json = nlohmann::ordered_json;

enum llama_tool_call_style {
    Unknown,
    Llama31,
    FunctionaryV3Llama3,
    FunctionaryV3Llama31,
    Hermes2Pro,
};

class llama_chat_template {
  public:

  private:
    llama_tool_call_style _tool_call_style = Unknown;
    bool _supports_tools = true;
    // Meta-Llama-3.1-8B-Instruct's template expects arguments to be an object.
    // Most other templates (and OpenAI's API) expect the arguments object to be stringified.
    bool _requires_object_arguments = false;
    bool _supports_system_role = true;
    std::string _chat_template;
    std::string _bos_token;
    std::string _eos_token;

  public:
    llama_chat_template(const std::string & chat_template, const std::string & bos_token, const std::string & eos_token)
        : _chat_template(chat_template), _bos_token(bos_token), _eos_token(eos_token) {

        _supports_tools = chat_template.find("tools") != std::string::npos;
        _requires_object_arguments = chat_template.find("tool_call.arguments | items") != std::string::npos;
        _supports_system_role = chat_template.find("System role not supported") == std::string::npos;

        if (chat_template.find("<tool_call>") != std::string::npos) {
            _tool_call_style = Hermes2Pro;
        } else if (chat_template.find(">>>all") != std::string::npos) {
            _tool_call_style = FunctionaryV3Llama3;
        } else if (chat_template.find("<|start_header_id|>") != std::string::npos) {
            if (chat_template.find("<function=") != std::string::npos) {
                _tool_call_style = FunctionaryV3Llama31;
            } else if (chat_template.find("<|python_tag|>") != std::string::npos) {
                _tool_call_style = Llama31;
            }
        }
    }

    static llama_chat_template from_model(
        const struct llama_model * model,
        const std::string & chat_template_override);

    llama_tool_call_style tool_call_style() const { return _tool_call_style; }

    const std::string & chat_template() const { return _chat_template; }
    bool supports_tools() const { return _supports_tools; }

    std::string apply(
        const nlohmann::ordered_json & messages,
        const nlohmann::ordered_json & tools,
        bool add_generation_prompt) const;
};
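
A sketch of how a caller might use the detected style (hypothetical helper, not from this commit; it assumes `model` is an already-loaded llama_model * and that the caller parses tool calls itself):

#include "chat-template.h"
#include "llama.h"

// Hypothetical helper: build a prompt for a loaded model, letting the template
// drive both the rendering and the expected tool-call output format.
static std::string build_prompt(const struct llama_model * model,
                                const json & messages,
                                const json & tools) {
    auto tmpl = llama_chat_template::from_model(model, /* chat_template_override= */ "");

    switch (tmpl.tool_call_style()) {
        case Llama31:              /* expect <|python_tag|>-style calls         */ break;
        case Hermes2Pro:           /* expect <tool_call>...</tool_call> blocks  */ break;
        case FunctionaryV3Llama3:  /* expect >>>function_name sections          */ break;
        case FunctionaryV3Llama31: /* expect <function=name>...</function>      */ break;
        case Unknown:              /* no tool-call parsing                      */ break;
    }

    // Only pass tools if the template actually references them.
    return tmpl.apply(messages, tmpl.supports_tools() ? tools : json(), /* add_generation_prompt= */ true);
}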

common/common.cpp

Lines changed: 23 additions & 20 deletions
@@ -9,6 +9,7 @@
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "chat-template.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -1511,6 +1512,20 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 //
 
 bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+    if (use_jinja) {
+        try {
+            auto chat_template = llama_chat_template(tmpl, "<s>", "</s>");
+            chat_template.apply({{
+                {"role", "user"},
+                {"content", "test"},
+            }}, json(), true);
+            return true;
+        } catch (const std::exception & e) {
+            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+            return false;
+        }
+    }
+
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(
         nullptr,
@@ -1519,22 +1534,14 @@ bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja) {
         1,
         /* add_ass= */ true,
         /* buffer= */ nullptr,
-        /* length= */ 0,
-        use_jinja,
-        /* tools= */ nullptr,
-        "<s>",
-        "</s>");
+        /* length= */ 0);
     return res >= 0;
 }
 
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & msgs,
-        bool add_ass,
-        bool use_jinja,
-        const char * tools,
-        const char * bos_token,
-        const char * eos_token) {
+        bool add_ass) {
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
@@ -1547,7 +1554,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size(), use_jinja, tools, bos_token, eos_token);
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
@@ -1557,7 +1564,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         throw std::runtime_error("this custom template is not supported");
     } else {
         // If the built-in template is not supported, we default to chatml
-        res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size(), use_jinja, tools, bos_token, eos_token);
+        res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
         fallback = true;
     }
 }
@@ -1568,7 +1575,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     res = llama_chat_apply_template(
         fallback ? nullptr : model,
         fallback ? "chatml" : ptr_tmpl,
-        chat.data(), chat.size(), add_ass, buf.data(), buf.size(), use_jinja, tools, bos_token, eos_token);
+        chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 }
 
 std::string formatted_chat(buf.data(), res);
@@ -1579,21 +1586,17 @@ std::string llama_chat_format_single(const struct llama_model * model,
     const std::string & tmpl,
     const std::vector<llama_chat_msg> & past_msg,
     const llama_chat_msg & new_msg,
-    bool add_ass,
-    bool use_jinja,
-    const char * tools,
-    const char * bos_token,
-    const char * eos_token) {
+    bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false, use_jinja, tools, bos_token, eos_token);
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
         ss << "\n";
     };
     // format chat with new_msg
     chat_new.push_back(new_msg);
-    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass, use_jinja, tools, bos_token, eos_token);
+    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
     // get the diff part
     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return ss.str();
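
For reference, a minimal sketch of how a caller might validate a --chat-template argument with the new two-path signature (hypothetical snippet, not part of this commit; the "empty template" convention is assumed here):

#include "common.h"

#include <cstdio>

// Hypothetical validation helper: with use_jinja the template is parsed and
// test-rendered through llama_chat_template/minja; otherwise it goes through
// the legacy llama_chat_apply_template C API.
static bool check_chat_template_arg(const std::string & chat_template, bool use_jinja) {
    if (chat_template.empty()) {
        return true; // assumed: empty means "use the model's built-in template"
    }
    if (!llama_chat_verify_template(chat_template, use_jinja)) {
        fprintf(stderr, "unsupported chat template (use_jinja=%d)\n", use_jinja ? 1 : 0);
        return false;
    }
    return true;
}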

common/common.h

Lines changed: 4 additions & 19 deletions
@@ -471,44 +471,29 @@ std::string llama_detokenize(
 // Chat template utils
 //
 
-struct llama_chat_msg_tool_call {
-    std::string name;
-    std::string arguments;
-};
-
 // same as llama_chat_message, but uses std::string and std::vector
 struct llama_chat_msg {
     std::string role;
     std::string content;
-    std::string tool;
-    std::vector<struct llama_chat_msg_tool_call> tool_calls;
 };
 
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja = false);
+// Check if the template is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl, bool use_jinja);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,
-        bool add_ass,
-        bool use_jinja = false,
-        const char * tools = nullptr,
-        const char * bos_token = nullptr,
-        const char * eos_token = nullptr);
+        bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
 std::string llama_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & past_msg,
         const llama_chat_msg & new_msg,
-        bool add_ass,
-        bool use_jinja = false,
-        const char * tools = nullptr,
-        const char * bos_token = nullptr,
-        const char * eos_token = nullptr);
+        bool add_ass);
 
 // Returns an example of formatted chat
 std::string llama_chat_format_example(const struct llama_model * model,
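
And a sketch of a caller on the slimmed-down legacy path, which now only formats role/content pairs (hypothetical helper; passing an empty tmpl to select the model's built-in template is an assumption, not shown in this diff):

#include "common.h"

// Hypothetical helper: format only the newest user turn, given the prior
// history, using the simplified wrapper signatures from this commit.
static std::string format_user_turn(const struct llama_model * model,
                                    const std::vector<llama_chat_msg> & history,
                                    const std::string & user_text) {
    llama_chat_msg new_msg = { /* role= */ "user", /* content= */ user_text };
    return llama_chat_format_single(model, /* tmpl= */ "", history, new_msg, /* add_ass= */ true);
}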
