diff --git a/common/chat.cpp b/common/chat.cpp
index 316bd24170c9e..73826c2c12013 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1313,6 +1313,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
 
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+    data.reasoning_format = COMMON_REASONING_FORMAT_AUTO; // GPT-OSS Harmony format always has reasoning channels
 
     // TODO: support tool calls in GPT-OSS?
 
@@ -1320,10 +1321,25 @@
 }
 static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
     // TODO @ngxson : this won't work with --special enabled, we should fix that
+    // TODO: Full Harmony format support with multiple channels (analysis, commentary, final)
+    // is being implemented in PR #15181. This is a minimal fix to prevent crashes.
     builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
-    if (!builder.syntax().parse_tool_calls) {
+
+    // Check if we have an <|end|> token at the current position or later
+    auto end_pos = builder.input().find("<|end|>", builder.pos());
+
+    if (end_pos != std::string::npos) {
+        // Content is everything from current position to <|end|>
+        auto content = builder.input().substr(builder.pos(), end_pos - builder.pos());
+        builder.add_content(content);
+        builder.move_to(end_pos);
+        // Consume the <|end|> token to prevent "Unexpected content at end of input" errors
+        // Note: <|end|> marks end of message, not end of generation. Full multi-message
+        // support requires proper Harmony channel parsing.
+        builder.consume_literal("<|end|>");
+    } else {
+        // No <|end|> token, consume everything remaining
         builder.add_content(builder.consume_rest());
-        return;
     }
 }
 
diff --git a/common/chat.h b/common/chat.h
index eb628d8bc275d..ba28f5ba323f9 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -135,6 +135,7 @@ struct common_chat_templates_inputs {
 
 struct common_chat_params {
     common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // Template-specific reasoning format
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp
index 59e44e07d25ed..56b953459038f 100644
--- a/tests/test-chat-parser.cpp
+++ b/tests/test-chat-parser.cpp
@@ -10,6 +10,7 @@
 #include <string>
 
 #include "chat-parser.h"
+#include "chat.h"
 #include "common.h"
 #include "log.h"
 #include "regex-partial.h"
@@ -341,12 +342,123 @@ static void test_positions() {
     }
 }
 
+static void test_gpt_oss_harmony_format() {
+    // Test GPT-OSS Harmony format with <|end|> token and AUTO reasoning (parse_tool_calls=false)
+    {
+        auto msg = common_chat_parse(
+            "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>",
+            /* is_partial= */ false,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+                /* .parse_tool_calls = */ false,
+            }
+        );
+        // Should not throw with <|end|> token
+        assert_equals<std::string>("The answer is 42", msg.content);
+        assert_equals<std::string>("Let me think about this...", msg.reasoning_content);
+    }
+
+    // Test with parse_tool_calls=true (the default case that was failing)
+    {
+        auto msg = common_chat_parse(
+            "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>",
+            /* is_partial= */ false,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+                /* .parse_tool_calls = */ true, // This was the failing case!
+            }
+        );
+        // Should not throw with <|end|> token even when parse_tool_calls is true
+        assert_equals<std::string>("The answer is 42", msg.content); // Content is always added regardless of parse_tool_calls setting
+        assert_equals<std::string>("Let me think about this...", msg.reasoning_content);
+    }
+
+    // Test with parse_tool_calls=true and NO <|end|> token (real server scenario)
+    {
+        auto msg = common_chat_parse(
+            "<|channel|>analysis<|message|>Need location info<|start|>assistant<|channel|>final<|message|>Could you specify the location?",
+            /* is_partial= */ false,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+                /* .parse_tool_calls = */ true, // Server default with tools
+            }
+        );
+        // Should not throw even without <|end|> token when parse_tool_calls is true
+        assert_equals<std::string>("Could you specify the location?", msg.content); // Content is always added regardless of parse_tool_calls setting
+        assert_equals<std::string>("Need location info", msg.reasoning_content);
+    }
+
+    // Test with NONE reasoning format (the actual failing case that was crashing)
+    {
+        auto msg = common_chat_parse(
+            "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>",
+            /* is_partial= */ false,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, // Test with NONE - should still work
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+                /* .parse_tool_calls = */ false,
+            }
+        );
+        // With NONE, reasoning won't be parsed but <|end|> should still be handled
+        assert_equals<std::string>("<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42", msg.content);
+        assert_equals<std::string>("", msg.reasoning_content);
+    }
+
+    // Test without <|end|> token (backward compatibility)
+    {
+        auto msg = common_chat_parse(
+            "<|channel|>analysis<|message|>Thinking...<|start|>assistant<|channel|>final<|message|>Hello world",
+            /* is_partial= */ false,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+                /* .parse_tool_calls = */ false,
+            }
+        );
+        // Should not throw without <|end|> token
+        assert_equals<std::string>("Hello world", msg.content);
+        assert_equals<std::string>("Thinking...", msg.reasoning_content);
+    }
+
+    // Test partial message with <|end|> token
+    {
+        auto msg = common_chat_parse(
+            "<|channel|>analysis<|message|>Processing...<|start|>assistant<|channel|>final<|message|>Partial result<|e",
+            /* is_partial= */ true,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+                /* .parse_tool_calls = */ false,
+            }
+        );
+        // Should not throw for partial message
+        assert_equals<std::string>("Partial result<|e", msg.content);
+        assert_equals<std::string>("Processing...", msg.reasoning_content);
+    }
+}
+
 int main() {
     test_positions();
     test_json_with_dumped_args_no_args();
     test_json_with_dumped_args();
     test_reasoning();
     test_regex();
+    test_gpt_oss_harmony_format();
     std::cout << "All tests passed!\n";
     return 0;
 }
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index a255d481a4d1c..e0438f75ef750 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -383,8 +383,17 @@ struct server_task {
         } else {
             params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
         }
-        params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
-        params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
+
+        // Use template's reasoning format if provided, otherwise use CLI default
+        auto reasoning_it = data.find("reasoning_format");
+        if (reasoning_it != data.end()) {
+            params.oaicompat_chat_syntax.reasoning_format = static_cast<common_reasoning_format>(reasoning_it->get<int>());
+            SRV_INF("Reasoning format (from template): %s\n", common_reasoning_format_name(params.oaicompat_chat_syntax.reasoning_format));
+        } else {
+            params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
+        }
+
+        params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
         params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
         params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
     }
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index f3dfc8225da4d..ad039b2a85358 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -804,6 +804,7 @@ static json oaicompat_chat_params_parse(
     }
 
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["reasoning_format"] = static_cast<int>(chat_params.reasoning_format); // Pass template's reasoning format
     llama_params["prompt"] = chat_params.prompt;
    if (!chat_params.grammar.empty()) {
        llama_params["grammar"] = chat_params.grammar;
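
For reference, a minimal standalone sketch (not part of the patch) of the behavior the change establishes, driven through common_chat_parse() and common_chat_syntax, the same public API the new tests exercise; compiling and linking this against llama.cpp's common library is assumed.

// Sketch only: exercises the fixed GPT-OSS parse path via common_chat_parse(),
// as the tests in this patch do. Build setup against llama.cpp common assumed.
#include <iostream>
#include "chat.h"

int main() {
    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GPT_OSS;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;

    // A Harmony-style generation ending in <|end|>; before this patch the
    // trailing token raised "Unexpected content at end of input".
    auto msg = common_chat_parse(
        "<|channel|>analysis<|message|>Thinking...<|start|>assistant<|channel|>final<|message|>Hi there<|end|>",
        /* is_partial= */ false,
        syntax);

    std::cout << "reasoning: " << msg.reasoning_content << "\n"; // prints "Thinking..."
    std::cout << "content:   " << msg.content << "\n";           // prints "Hi there"
    return 0;
}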