Separate tool-call from template application

bandoti · bandoti · commit a9e3404a4c88 · 2025-02-15T16:19:14.000-04:00
diff --git a/common/common.cpp b/common/common.cpp
@@ -1768,42 +1768,18 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }
 
-//
-// Chat template utils
-//
-
-bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
-    if (use_jinja) {
-        try {
-            auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
-            common_chat_inputs inputs;
-            inputs.messages = json::array({{
-                {"role", "user"},
-                {"content", "test"},
-            }});
-            common_chat_params_init(chat_template, inputs);
-            return true;
-        } catch (const std::exception & e) {
-            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
-            return false;
-        }
-    }
-    llama_chat_message chat[] = {{"user", "test"}};
-    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
-    return res >= 0;
-}
-
-static void copy_chat_params(const common_chat_params & src, toolcall::sampling_updater * update_sparams)
+void common_chat_grammar_to_sampler(const common_chat_params * src,
+                                    const llama_vocab * vocab,
+                                    common_params_sampling * sparams)
 {
-    GGML_ASSERT(update_sparams && update_sparams->sparams && update_sparams->vocab);
+    GGML_ASSERT(src && vocab && sparams);
 
-    auto & dst = *update_sparams->sparams;
-    auto vocab = update_sparams->vocab;
+    auto & dst = *sparams;
 
-    dst.grammar      = src.grammar;
-    dst.grammar_lazy = src.grammar_lazy;
+    dst.grammar      = src->grammar;
+    dst.grammar_lazy = src->grammar_lazy;
 
-    for (const auto & trigger : src.grammar_triggers) {
+    for (const auto & trigger : src->grammar_triggers) {
         auto ids = common_tokenize(vocab, trigger.word, false, true);
 
         if (ids.size() == 1) {
@@ -1816,7 +1792,7 @@ static void copy_chat_params(const common_chat_params & src, toolcall::sampling_
         dst.grammar_trigger_words.push_back(trigger);
     }
 
-    for (const auto & preserved : src.preserved_tokens) {
+    for (const auto & preserved : src->preserved_tokens) {
         auto ids = common_tokenize(vocab, preserved, false, true);
         if (ids.size() == 1) {
             LOG_DBG("Preserved token: %d\n", ids[0]);
@@ -1831,19 +1807,45 @@ static void copy_chat_params(const common_chat_params & src, toolcall::sampling_
     }
 }
 
+
+//
+// Chat template utils
+//
+
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { // returns true when tmpl can be applied to a one-message probe chat
+    if (use_jinja) {
+        try {
+            auto chat_template = common_chat_template(tmpl, "<s>", "</s>"); // NOTE(review): "<s>"/"</s>" look like placeholder BOS/EOS strings - confirm
+            common_chat_inputs inputs;
+            inputs.messages = json::array({{ // probe conversation: a single user message
+                {"role", "user"},
+                {"content", "test"},
+            }});
+            common_chat_params_init(chat_template, inputs); // result is discarded; only a thrown exception matters here
+            return true;
+        } catch (const std::exception & e) {
+            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+            return false;
+        }
+    }
+    llama_chat_message chat[] = {{"user", "test"}}; // non-jinja path: same probe message, checked with llama_chat_apply_template
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); // nullptr/0 presumably mean "no output buffer" - only the return code is used
+    return res >= 0; // a negative result is reported as "template not usable"
+}
+
 std::string common_chat_apply_template(
         const common_chat_templates & tmpl,
         const std::vector<common_chat_msg> & msgs,
         bool add_ass,
         bool use_jinja,
-        toolcall::handler::ptr handler,
-        toolcall::sampling_updater * update_sparams)
+        const common_chat_inputs * inputs_,
+        common_chat_params * out_params)
 {
-    bool use_tool_template = (use_jinja && handler != nullptr) && tmpl.template_tool_use;
+    bool use_tool_template = use_jinja && tmpl.template_tool_use;
     const auto & tmpl_selected = use_tool_template ? *tmpl.template_tool_use : *tmpl.template_default;
 
     if (use_jinja) {
-        common_chat_inputs inputs;
+        common_chat_inputs inputs = inputs_ ? *inputs_ : common_chat_inputs();
 
         auto messages = json::array();
         for (const auto & msg : msgs) {
@@ -1852,35 +1854,11 @@ std::string common_chat_apply_template(
         inputs.messages = messages;
         inputs.add_generation_prompt = add_ass;
 
-        if (handler != nullptr) {
-            auto choice = handler->tool_choice();
-            if (std::holds_alternative<std::string>(choice)) {
-                inputs.tool_choice = std::get<std::string>(choice);
-
-            } else {
-                auto choice_ptr = std::get<toolcall::json_ptr>(choice);
-                if (choice_ptr != nullptr) {
-                    inputs.tool_choice = *choice_ptr;
-                }
-            }
-
-            inputs.tools = handler->tool_list();
-        }
-
         auto chat_params = common_chat_params_init(tmpl_selected, inputs);
-        if (update_sparams) {
-            copy_chat_params(chat_params, update_sparams);
-        }
-
-        auto prompt = chat_params.prompt;
-        if (handler != nullptr) {
-            json response;
-            handler->call(prompt, response);
-            return response; // Caller will determine what to do based upon last_action
-
-        } else {
-            return prompt;
+        if (out_params != nullptr) {
+            *out_params = chat_params;
         }
+        return chat_params.prompt;
     }
 
     int alloc_size = 0;
@@ -1918,12 +1896,12 @@ std::string common_chat_format_single(
         const common_chat_msg & new_msg,
         bool add_ass,
         bool use_jinja,
-        toolcall::handler::ptr handler,
-        toolcall::sampling_updater * update_sparams)
+        const common_chat_inputs * inputs,
+        common_chat_params * out_params)
 {
     std::ostringstream ss;
     auto fmt_past_msg = past_msg.empty() ? ""
-        : common_chat_apply_template(tmpl, past_msg, false, use_jinja, handler, update_sparams);
+        : common_chat_apply_template(tmpl, past_msg, false, use_jinja, inputs);
 
     std::vector<common_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
@@ -1932,7 +1910,7 @@ std::string common_chat_format_single(
     };
     // format chat with new_msg
     chat_new.push_back(new_msg);
-    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja, handler, update_sparams);
+    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja, inputs, out_params);
     // get the diff part
     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return ss.str();
diff --git a/common/common.h b/common/common.h
@@ -618,6 +618,13 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
                                   bool   special = true);
 
+struct common_chat_params; // forward declaration; full definition lives elsewhere (presumably chat.hpp - confirm)
+struct common_chat_inputs; // forward declaration; used by pointer in the chat template prototypes of this header
+void common_chat_grammar_to_sampler(const common_chat_params * src, // source of grammar, grammar_lazy, grammar triggers and preserved tokens
+                                    const llama_vocab * vocab, // vocabulary used to tokenize trigger words and preserved tokens
+                                    common_params_sampling * sparams); // destination sampling params; the definition asserts all three pointers are non-null
+
+
 //
 // Chat template utils
 //
@@ -651,13 +658,6 @@ struct common_chat_templates {
     std::unique_ptr<common_chat_template> template_tool_use;
 };
 
-namespace toolcall {
-    struct sampling_updater {
-        common_params_sampling * sparams;
-        const llama_vocab      * vocab;
-    };
-}
-
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
@@ -666,8 +666,8 @@ std::string common_chat_apply_template(
         const std::vector<common_chat_msg> & chat,
         bool add_ass,
         bool use_jinja,
-        toolcall::handler::ptr handler = nullptr,
-        toolcall::sampling_updater * update_sparams = nullptr);
+        const common_chat_inputs * inputs = nullptr,
+        common_chat_params * out_params = nullptr);
 
 // Format single message, while taking into account the position of that message in chat history
 std::string common_chat_format_single(
@@ -676,8 +676,8 @@ std::string common_chat_format_single(
         const common_chat_msg & new_msg,
         bool add_ass,
         bool use_jinja,
-        toolcall::handler::ptr handler = nullptr,
-        toolcall::sampling_updater * update_sparams = nullptr);
+        const common_chat_inputs * inputs = nullptr,
+        common_chat_params * out_params = nullptr);
 
 // Returns an example of formatted chat
 std::string common_chat_format_example(
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -1,4 +1,5 @@
 #include "arg.h"
+#include "chat.hpp"
 #include "common.h"
 #include "console.h"
 #include "log.h"
@@ -273,13 +274,37 @@ int main(int argc, char ** argv) {
 
         common_chat_msg new_msg{role, content, {}};
 
-        toolcall::sampling_updater updater{&sparams, vocab};
+        common_chat_inputs cinputs;
+        if (handler != nullptr) {
+            auto choice = handler->tool_choice();
+            if (std::holds_alternative<std::string>(choice)) {
+                cinputs.tool_choice = std::get<std::string>(choice);
+
+            } else {
+                auto choice_ptr = std::get<toolcall::json_ptr>(choice);
+                if (choice_ptr != nullptr) {
+                    cinputs.tool_choice = *choice_ptr;
+                }
+            }
+            cinputs.tools = handler->tool_list();
+        }
+
+        common_chat_params cparams;
         auto formatted =
             common_chat_format_single(chat_templates, chat_msgs, new_msg, add_ass, g_params->use_jinja,
-                                      handler, &updater);
+                                      &cinputs, &cparams);
 
         chat_msgs.push_back({role, content, {}});
         LOG_DBG("formatted: '%s'\n", formatted.c_str());
+
+        if (g_params->use_jinja) {
+            common_chat_grammar_to_sampler(&cparams, vocab, &sparams);
+            if (handler != nullptr) {
+                json response;
+                handler->call(formatted, response);
+                return std::string(response);
+            }
+        }
         return formatted;
     };