Skip to content

Fix GPT-OSS Harmony format end token handling crash #15195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into the base branch from the contributor's branch
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions common/chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1313,17 +1313,33 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
data.reasoning_format = COMMON_REASONING_FORMAT_AUTO; // GPT-OSS Harmony format always has reasoning channels

// TODO: support tool calls in GPT-OSS?

return data;
}
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
// TODO @ngxson : this won't work with --special enabled, we should fix that
// TODO: Full Harmony format support with multiple channels (analysis, commentary, final)
// is being implemented in PR #15181. This is a minimal fix to prevent crashes.
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
if (!builder.syntax().parse_tool_calls) {

// Check if we have an <|end|> token at the current position or later
auto end_pos = builder.input().find("<|end|>", builder.pos());

if (end_pos != std::string::npos) {
// Content is everything from current position to <|end|>
auto content = builder.input().substr(builder.pos(), end_pos - builder.pos());
builder.add_content(content);
builder.move_to(end_pos);
// Consume the <|end|> token to prevent "Unexpected content at end of input" errors
// Note: <|end|> marks end of message, not end of generation. Full multi-message
// support requires proper Harmony channel parsing.
builder.consume_literal("<|end|>");
} else {
// No <|end|> token, consume everything remaining
builder.add_content(builder.consume_rest());
return;
}
}

Expand Down
1 change: 1 addition & 0 deletions common/chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ struct common_chat_templates_inputs {

struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // Template-specific reasoning format
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
Expand Down
112 changes: 112 additions & 0 deletions tests/test-chat-parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <string>

#include "chat-parser.h"
#include "chat.h"
#include "common.h"
#include "log.h"
#include "regex-partial.h"
Expand Down Expand Up @@ -341,12 +342,123 @@ static void test_positions() {
}
}

// Regression tests for GPT-OSS Harmony format parsing.
// Exercises the <|end|> token handling fix: parsing must not throw whether or
// not the message is terminated by <|end|>, for both settings of
// parse_tool_calls, for reasoning format AUTO vs NONE, and for partial
// (streamed) input.
static void test_gpt_oss_harmony_format() {
    // Several cases share the same fully-formed Harmony message: an analysis
    // (reasoning) channel, then a final channel, terminated by <|end|>.
    const std::string full_msg =
        "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>";

    // AUTO reasoning, parse_tool_calls = false: reasoning and content are
    // split into their respective fields and <|end|> is consumed silently.
    {
        auto parsed = common_chat_parse(
            full_msg,
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        // Must not throw on the trailing <|end|> token.
        assert_equals("The answer is 42", parsed.content);
        assert_equals("Let me think about this...", parsed.reasoning_content);
    }

    // Same input with parse_tool_calls = true — the configuration that
    // previously crashed on <|end|>.
    {
        auto parsed = common_chat_parse(
            full_msg,
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ true, // This was the failing case!
            }
        );
        // Content is still extracted even when tool-call parsing is enabled.
        assert_equals("The answer is 42", parsed.content);
        assert_equals("Let me think about this...", parsed.reasoning_content);
    }

    // parse_tool_calls = true and NO <|end|> token — mirrors what the server
    // sees mid-generation.
    {
        auto parsed = common_chat_parse(
            "<|channel|>analysis<|message|>Need location info<|start|>assistant<|channel|>final<|message|>Could you specify the location?",
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ true, // Server default with tools
            }
        );
        // Must not throw even though no <|end|> terminator is present.
        assert_equals("Could you specify the location?", parsed.content);
        assert_equals("Need location info", parsed.reasoning_content);
    }

    // Reasoning format NONE: channels are not interpreted, so everything up
    // to <|end|> lands in content — but <|end|> itself must still be handled.
    {
        auto parsed = common_chat_parse(
            full_msg,
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, // Test with NONE - should still work
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        assert_equals("<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42", parsed.content);
        assert_equals("", parsed.reasoning_content);
    }

    // Backward compatibility: complete message without a trailing <|end|>.
    {
        auto parsed = common_chat_parse(
            "<|channel|>analysis<|message|>Thinking...<|start|>assistant<|channel|>final<|message|>Hello world",
            /* is_partial= */ false,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        assert_equals("Hello world", parsed.content);
        assert_equals("Thinking...", parsed.reasoning_content);
    }

    // Streaming: the <|end|> token itself is cut mid-way ("<|e"); a partial
    // parse must not throw and keeps the truncated tail in content.
    {
        auto parsed = common_chat_parse(
            "<|channel|>analysis<|message|>Processing...<|start|>assistant<|channel|>final<|message|>Partial result<|e",
            /* is_partial= */ true,
            {
                /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS,
                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                /* .reasoning_in_content = */ false,
                /* .thinking_forced_open = */ false,
                /* .parse_tool_calls = */ false,
            }
        );
        assert_equals("Partial result<|e", parsed.content);
        assert_equals("Processing...", parsed.reasoning_content);
    }
}

int main() {
    // Run every parser test suite in sequence. Each suite aborts the process
    // via assert on failure, so reaching the final print means all cases
    // passed.
    test_positions();
    test_json_with_dumped_args_no_args();
    test_json_with_dumped_args();
    test_reasoning();
    test_regex();
    // Regression coverage for the GPT-OSS Harmony <|end|> token crash fix.
    test_gpt_oss_harmony_format();
    std::cout << "All tests passed!\n";
    return 0;
}
13 changes: 11 additions & 2 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,17 @@ struct server_task {
} else {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
}
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);

// Use template's reasoning format if provided, otherwise use CLI default
auto reasoning_it = data.find("reasoning_format");
if (reasoning_it != data.end()) {
params.oaicompat_chat_syntax.reasoning_format = static_cast<common_reasoning_format>(reasoning_it->get<int>());
SRV_INF("Reasoning format (from template): %s\n", common_reasoning_format_name(params.oaicompat_chat_syntax.reasoning_format));
} else {
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
}

params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}
Expand Down
1 change: 1 addition & 0 deletions tools/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,7 @@ static json oaicompat_chat_params_parse(
}

llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["reasoning_format"] = static_cast<int>(chat_params.reasoning_format); // Pass template's reasoning format
llama_params["prompt"] = chat_params.prompt;
if (!chat_params.grammar.empty()) {
llama_params["grammar"] = chat_params.grammar;
Expand Down