Skip to content

Fix GPT-OSS Harmony format end token handling crash #15195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into the base branch from the contributor's branch
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions common/chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1313,17 +1313,33 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
data.reasoning_format = COMMON_REASONING_FORMAT_AUTO; // GPT-OSS Harmony format always has reasoning channels

// TODO: support tool calls in GPT-OSS?

return data;
}
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
// TODO @ngxson : this won't work with --special enabled, we should fix that
// TODO: Full Harmony format support with multiple channels (analysis, commentary, final)
// is being implemented in PR #15181. This is a minimal fix to prevent crashes.
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
if (!builder.syntax().parse_tool_calls) {

// Check if we have an <|end|> token at the current position or later
auto end_pos = builder.input().find("<|end|>", builder.pos());

if (end_pos != std::string::npos) {
// Content is everything from current position to <|end|>
auto content = builder.input().substr(builder.pos(), end_pos - builder.pos());
builder.add_content(content);
builder.move_to(end_pos);
// Consume the <|end|> token to prevent "Unexpected content at end of input" errors
// Note: <|end|> marks end of message, not end of generation. Full multi-message
// support requires proper Harmony channel parsing.
builder.consume_literal("<|end|>");
} else {
// No <|end|> token, consume everything remaining
builder.add_content(builder.consume_rest());
return;
}
}

Expand Down
1 change: 1 addition & 0 deletions common/chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ struct common_chat_templates_inputs {

struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // Template-specific reasoning format
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
Expand Down
112 changes: 112 additions & 0 deletions tests/test-chat-parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <string>

#include "chat-parser.h"
#include "chat.h"
#include "common.h"
#include "log.h"
#include "regex-partial.h"
Expand Down Expand Up @@ -341,12 +342,123 @@ static void test_positions() {
}
}

// Regression tests for GPT-OSS Harmony format parsing.
// Exercises the <|end|> token handling fix: parsing must not throw whether or
// not the message is terminated by <|end|>, for both settings of
// parse_tool_calls, for reasoning format AUTO vs NONE, and for partial
// (streamed) input.
static void test_gpt_oss_harmony_format() {
    // Several cases share the same fully-formed Harmony message: an analysis
    // (reasoning) channel, then a final channel, terminated by <|end|>.
    const std::string full_msg =
        "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>";

    // AUTO reasoning, parse_tool_calls = false: reasoning and content are
    // split into their respective fields and <|end|> is consumed silently.
    {
        auto parsed = common_chat_parse(
            full_msg,
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        // Must not throw on the trailing <|end|> token.
        assert_equals("The answer is 42", parsed.content);
        assert_equals("Let me think about this...", parsed.reasoning_content);
    }

    // Same input with parse_tool_calls = true — the configuration that
    // previously crashed on <|end|>.
    {
        auto parsed = common_chat_parse(
            full_msg,
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ true, // This was the failing case!
            }
        );
        // Content is still extracted even when tool-call parsing is enabled.
        assert_equals("The answer is 42", parsed.content);
        assert_equals("Let me think about this...", parsed.reasoning_content);
    }

    // parse_tool_calls = true and NO <|end|> token — mirrors what the server
    // sees mid-generation.
    {
        auto parsed = common_chat_parse(
            "<|channel|>analysis<|message|>Need location info<|start|>assistant<|channel|>final<|message|>Could you specify the location?",
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ true, // Server default with tools
            }
        );
        // Must not throw even though no <|end|> terminator is present.
        assert_equals("Could you specify the location?", parsed.content);
        assert_equals("Need location info", parsed.reasoning_content);
    }

    // Reasoning format NONE: channels are not interpreted, so everything up
    // to <|end|> lands in content — but <|end|> itself must still be handled.
    {
        auto parsed = common_chat_parse(
            full_msg,
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, // Test with NONE - should still work
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        assert_equals("<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42", parsed.content);
        assert_equals("", parsed.reasoning_content);
    }

    // Backward compatibility: complete message without a trailing <|end|>.
    {
        auto parsed = common_chat_parse(
            "<|channel|>analysis<|message|>Thinking...<|start|>assistant<|channel|>final<|message|>Hello world",
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        assert_equals("Hello world", parsed.content);
        assert_equals("Thinking...", parsed.reasoning_content);
    }

    // Streaming: the <|end|> token itself is cut mid-way ("<|e"); a partial
    // parse must not throw and keeps the truncated tail in content.
    {
        auto parsed = common_chat_parse(
            "<|channel|>analysis<|message|>Processing...<|start|>assistant<|channel|>final<|message|>Partial result<|e",
            /* is_partial= */ true,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        assert_equals("Partial result<|e", parsed.content);
        assert_equals("Processing...", parsed.reasoning_content);
    }
}

int main() {
    // Run every parser test suite in sequence. Each suite aborts the process
    // via assert on failure, so reaching the final print means all cases
    // passed.
    test_positions();
    test_json_with_dumped_args_no_args();
    test_json_with_dumped_args();
    test_reasoning();
    test_regex();
    // Regression coverage for the GPT-OSS Harmony <|end|> token crash fix.
    test_gpt_oss_harmony_format();
    std::cout << "All tests passed!\n";
    return 0;
}
13 changes: 11 additions & 2 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,17 @@ struct server_task {
} else {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
}
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);

// Use template's reasoning format if provided, otherwise use CLI default
auto reasoning_it = data.find("reasoning_format");
if (reasoning_it != data.end()) {
params.oaicompat_chat_syntax.reasoning_format = static_cast<common_reasoning_format>(reasoning_it->get<int>());
SRV_INF("Reasoning format (from template): %s\n", common_reasoning_format_name(params.oaicompat_chat_syntax.reasoning_format));
} else {
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
}

params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}
Expand Down
1 change: 1 addition & 0 deletions tools/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,7 @@ static json oaicompat_chat_params_parse(
}

llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["reasoning_format"] = static_cast<int>(chat_params.reasoning_format); // Pass template's reasoning format
llama_params["prompt"] = chat_params.prompt;
if (!chat_params.grammar.empty()) {
llama_params["grammar"] = chat_params.grammar;
Expand Down