align Command R7B w/ --think / reasoning_content behaviour

Olivier Chafik · Olivier Chafik · commit e6d9b52480ab · 2025-02-05T15:47:37.000Z
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1978,7 +1978,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--think"},
         "*experimental* thinking mode (default: disabled)\n"
-        "returns reasoning_content in messages, forcing model to think unless it supports native <think> tags (DeepSeek R1)\n"
+        "returns reasoning_content in messages, forcing model to think unless it supports native <think> tags (DeepSeek R1, Command R7B)\n"
         "only supported for non-streamed responses",
         [](common_params & params) {
             params.think = true;
diff --git a/common/chat-template.hpp b/common/chat-template.hpp
@@ -316,7 +316,7 @@ class chat_template {
 
         auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
         auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
-        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
+        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples && caps_.supports_tool_calls;
         auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
         auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
         auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
diff --git a/common/chat.cpp b/common/chat.cpp
@@ -12,12 +12,13 @@ std::string common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
         case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
-        case COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK: return "DeepSeek R1 (extract <think>)";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK: return "DeepSeek R1 (extract reasoning_content)";
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_COMMAND_R7B_THINK: return "Command R7B (extract reasoning_content)";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -469,22 +470,49 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
         "<|END_THINKING|>",
         "<|END_ACTION|>",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
-    data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto has_reasoning_content = msg.contains("reasoning_content") && msg["reasoning_content"].is_string();
+        auto has_tool_calls = msg.contains("tool_calls") && msg["tool_calls"].is_array();
+        if (has_reasoning_content && has_tool_calls) {
+            auto adjusted_message = msg;
+            adjusted_message["tool_plan"] = msg["reasoning_content"];
+            adjusted_message.erase("reasoning_content");
+            adjusted_messages.push_back(adjusted_message);
+        } else {
+            adjusted_messages.push_back(msg);
+        }
+    }
+    // } else {
+    //     adjusted_messages = inputs.messages;
+    // }
+    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.format = inputs.think ? COMMON_CHAT_FORMAT_COMMAND_R7B_THINK : COMMON_CHAT_FORMAT_COMMAND_R7B;
     return data;
 }
-static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
-    static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
-    static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
+static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool think) {
+    static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)");
+    static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
+    static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
+
     std::smatch match;
 
     common_chat_msg result;
     result.role = "assistant";
-    if (std::regex_match(input, match, response_regex)) {
-        result.content = match[1].str();
-    } else if (std::regex_match(input, match, thought_action_regex)) {
-        result.tool_plan = match[1].str();
-        auto actions_str = match[2].str();
+
+    std::string rest = input;
+
+    if (std::regex_match(rest, match, thought_regex)) {
+        if (think) {
+            result.reasoning_content = match[2].str();
+        } else if (!match[2].str().empty()) {
+            // Let the unparsed thinking tags through in content only if their insides aren't empty.
+            result.content = match[1].str();
+        }
+        rest = match[3].str();
+    }
+    if (std::regex_match(rest, match, action_regex)) {
+        auto actions_str = match[1].str();
         auto actions = json::parse(actions_str);
         for (const auto & action : actions) {
             result.tool_calls.push_back({
@@ -493,9 +521,12 @@ static common_chat_msg common_chat_parse_command_r7b(const std::string & input)
                 /* .id = */        action["tool_call_id"],
             });
         }
+    } else if (std::regex_match(rest, match, response_regex)) {
+        auto response = match[1].str();
+        result.content += response;
     } else {
         LOG_ERR("Failed to parse command_r output");
-        result.content = input;
+        result.content += rest;
     }
     return result;
 }
@@ -1038,6 +1069,11 @@ common_chat_params common_chat_params_init(const common_chat_template & tmpl, co
         return common_chat_params_init_deepseek_r1(tmpl, inputs);
     }
 
+    // Command R7B: use handler in all cases except json schema (thinking / tools).
+    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && inputs.json_schema.is_null()) {
+        return common_chat_params_init_command_r7b(tmpl, inputs);
+    }
+
     // Use generic handler when forcing thoughts or JSON schema for final output
     // TODO: support thinking mode and/or JSON schema in handlers below this.
     if (inputs.think || (!inputs.tools.is_null() && inputs.json_schema.is_object())) {
@@ -1081,11 +1117,6 @@ common_chat_params common_chat_params_init(const common_chat_template & tmpl, co
         return common_chat_params_init_mistral_nemo(tmpl, inputs);
     }
 
-    // Command R7B (w/ tools)
-    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
-        return common_chat_params_init_command_r7b(tmpl, inputs);
-    }
-
     // Generic fallback
     return common_chat_params_init_generic(tmpl, inputs);
 }
@@ -1123,7 +1154,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
             return common_chat_parse_firefunction_v2(input);
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
-            return common_chat_parse_command_r7b(input);
+            return common_chat_parse_command_r7b(input, /* think= */ false);
+        case COMMON_CHAT_FORMAT_COMMAND_R7B_THINK:
+            return common_chat_parse_command_r7b(input, /* think= */ true);
         default:
             throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
     }
diff --git a/common/chat.hpp b/common/chat.hpp
@@ -35,6 +35,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_THINK,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
diff --git a/common/common.h b/common/common.h
@@ -625,7 +625,6 @@ struct common_chat_msg {
     std::string content;
     std::vector<common_tool_call> tool_calls;
     std::string reasoning_content = "";
-    std::string tool_plan = "";
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
diff --git a/examples/server/README.md b/examples/server/README.md
@@ -127,6 +127,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
+| `--think` | Enable experimental thinking mode: extracts the native thinking tags of DeepSeek R1 & Command R7B and forces any other model to think before responding. The resulting thoughts are returned in the `reasoning_content` output field (requires `--jinja`) |
+
 
 **Example-specific params**
 
@@ -1223,10 +1225,10 @@ curl http://localhost:8080/v1/chat/completions \
 
   # Native support for DeepSeek R1 works best w/ our own template (official template buggy)
 
-  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
+  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L --think \
     --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
-  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
+  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M --think \
     --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
   # Native support requires the right template for these GGUFs:
@@ -1240,7 +1242,7 @@ curl http://localhost:8080/v1/chat/completions \
   llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
     --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
 
-  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L --think \
     --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
 
   # Generic format support
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -748,9 +748,6 @@ struct server_task_result_cmpl_final : server_task_result {
         if (!msg.reasoning_content.empty()) {
             message["reasoning_content"] = msg.reasoning_content;
         }
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
-        }
 
         json choice {
             {"finish_reason", finish_reason},
diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py
@@ -274,43 +274,44 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("hf_repo,template_override", [
-    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
+@pytest.mark.parametrize("think,hf_repo,template_override", [
+    (True,  "bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
 
-    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (False, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
-    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+    (False, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (False, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
 
-    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
-    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+    (False, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    (False, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
 
-    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
+    (False, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (False, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
 
-    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
-    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
+    (False, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (False, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
 
-    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (False, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
-    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
+    (False, "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
+    (False, "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
 
-    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
-    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+    (False, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (False, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
 
-    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (True,  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 
     # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
-    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+    (False, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
 
     # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
 ])
-def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_weather(think: bool, hf_repo: str, template_override: Tuple[str, str | None] | None):
     global server
     n_predict = 512
+    server.think = think
     server.n_slots = 1
     server.jinja = True
     server.n_ctx = 8192
@@ -488,44 +489,45 @@ def test_thoughts(n_predict: int, think: bool, expect_content: str | None, expec
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
-    (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
+@pytest.mark.parametrize("think,expected_arguments_override,hf_repo,template_override", [
+    (True,  None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (True,  None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
 
-    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
-    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+    (False, None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (False, None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
 
-    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
-    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
+    (False, None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
+    (False, None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
 
-    (None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (False, None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (False, '{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    ('{"code":"print("}',  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
-    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
+    (False, '{"code":"print("}',  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (False, None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
 
-    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
-    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+    (False, '{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (False, '{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
 
-    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
-    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+    (False, None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    (False, None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
 
-    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
+    (False, None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (False, None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
 
-    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
-    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
+    (False, None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (False, None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
 
-    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+    (False, None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (False, None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
 
     # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
-    (None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+    (False, None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
 ])
-def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+def test_hello_world(think: bool, expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
     global server
     server.n_slots = 1
     server.jinja = True
+    server.think = think
     server.n_ctx = 8192
     server.n_predict = 512 # High because of DeepSeek R1
     server.model_hf_repo = hf_repo
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp

Original file line number	Diff line number	Diff line change
`@@ -748,9 +748,6 @@ struct server_task_result_cmpl_final : server_task_result {`
`748`	`748`	`if (!msg.reasoning_content.empty()) {`
`749`	`749`	`message["reasoning_content"] = msg.reasoning_content;`
`750`	`750`	`}`
`751`		`- if (!msg.tool_plan.empty()) {`
`752`		`- message["tool_plan"] = msg.tool_plan;`
`753`		`- }`
`754`	`751`
`755`	`752`	`json choice {`
`756`	`753`	`{"finish_reason", finish_reason},`