Commit 9cdeebe

Author: ochafik
switch to --reasoning-budget flag
1 parent 8547fcc commit 9cdeebe

File tree

8 files changed: +32 -19 lines changed


common/arg.cpp

Lines changed: 10 additions & 4 deletions
@@ -2851,15 +2851,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-        "- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)\n"
         "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else if (value == "nothink") { params.reasoning_format = COMMON_REASONING_FORMAT_NOTHINK; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2958,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
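
With this split, `--reasoning-format` now only controls how thoughts are parsed out of the response, while the new `--reasoning-budget N` (env: `LLAMA_ARG_THINK_BUDGET`) controls whether thoughts are generated at all. Taking an int rather than a boolean presumably leaves room for real token budgets later; the "currently only one of" wording in the help text points the same way.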

common/chat.cpp

Lines changed: 1 addition & 2 deletions
@@ -596,7 +596,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
-        case COMMON_REASONING_FORMAT_NOTHINK: return "nothink";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -1700,7 +1699,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
-    params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_NOTHINK;
+    params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
     if (!inputs.json_schema.empty()) {
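
With `COMMON_REASONING_FORMAT_NOTHINK` gone, `common_chat_templates_apply_jinja` no longer derives `enable_thinking` from the reasoning format; callers now pass it explicitly via `inputs.enable_thinking` (populated by the server plumbing shown further down).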

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -216,7 +216,6 @@ struct common_params_vocoder {
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
-    COMMON_REASONING_FORMAT_NOTHINK,  // Forcibly disables thinking (causes any thinking tag to be closed, empty thinking tags to be inserted, or template specific variables to be set, depending on the chat format)
 };
 
 struct common_params {
@@ -369,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
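
The new field uses sentinel values rather than a boolean. A minimal sketch of how the two currently accepted values are interpreted; the helper below is illustrative only, not part of this patch, though its gating expression mirrors the one in tools/server/server.cpp:

    // Illustrative helper (not in the patch): maps the --reasoning-budget
    // sentinels onto the boolean consumed by the chat-template layer.
    static bool reasoning_budget_allows_thinking(int reasoning_budget) {
        // -1 = unrestricted thinking budget (the default), 0 = thinking disabled
        return reasoning_budget != 0;
    }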

tools/server/README.md

Lines changed: 2 additions & 1 deletion
@@ -173,7 +173,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>- nothink: prevents generation of thoughts (forcibly closing thoughts tag or setting template-specific variables such as `enable_thinking: false` for Qwen3)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
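
In practice, the old `--reasoning-format nothink` invocation becomes `llama-server --jinja --reasoning-budget 0` (or `LLAMA_ARG_THINK_BUDGET=0`), while the default of `-1` leaves thinking unrestricted.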

tools/server/server.cpp

Lines changed: 1 addition & 0 deletions
@@ -2089,6 +2089,7 @@ struct server_context {
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
+            /* enable_thinking */ params_base.reasoning_budget != 0,
         };
     }
 
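Within this patch, this is the only place the budget is consulted: the server collapses `reasoning_budget` into a boolean once, and everything downstream sees only `enable_thinking`.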

tools/server/tests/unit/test_template.py

Lines changed: 12 additions & 11 deletions
@@ -26,23 +26,24 @@ def create_server():
 
 
 @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
-@pytest.mark.parametrize("template_name,nothink,expected_end", [
-    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "<think>\n"),
-    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True, "<think>\n</think>"),
+@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "<think>\n"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "<think>\n"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "<think>\n</think>"),
 
-    ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n"),
-    ("Qwen-Qwen3-0.6B", True, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
+    ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"),
+    ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
 
-    ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n<think>\n"),
-    ("Qwen-QwQ-32B", True, "<|im_start|>assistant\n<think>\n</think>"),
+    ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n<think>\n"),
+    ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n<think>\n</think>"),
 
-    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", False, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
-    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", True, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
+    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
+    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
 ])
-def test_nothink(template_name: str, nothink: bool, expected_end: str, tools: list[dict]):
+def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]):
     global server
     server.jinja = True
-    server.reasoning_format = 'nothink' if nothink else None
+    server.reasoning_budget = reasoning_budget
     server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
     server.start(timeout_seconds=TIMEOUT_SERVER_START)
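Note the extra DeepSeek-R1 row compared to the old `nothink` pairs: passing `None` leaves `--reasoning-budget` unset, so the server's default is exercised and must yield the same `<think>\n` prompt suffix as an explicit `-1`.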

tools/server/tests/utils.py

Lines changed: 3 additions & 0 deletions
@@ -85,6 +85,7 @@ class ServerProcess:
     no_webui: bool | None = None
     jinja: bool | None = None
     reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
+    reasoning_budget: int | None = None
     chat_template: str | None = None
     chat_template_file: str | None = None
     server_path: str | None = None
@@ -191,6 +192,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.append("--jinja")
         if self.reasoning_format is not None:
             server_args.extend(("--reasoning-format", self.reasoning_format))
+        if self.reasoning_budget is not None:
+            server_args.extend(("--reasoning-budget", self.reasoning_budget))
         if self.chat_template:
             server_args.extend(["--chat-template", self.chat_template])
         if self.chat_template_file:
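
The budget is appended as a plain int, matching how other numeric options in this file are passed before the command line is assembled. Note that the `Literal` type for `reasoning_format` still advertises `'nothink'`, which the parser no longer accepts after this commit.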

tools/server/utils.hpp

Lines changed: 2 additions & 0 deletions
@@ -568,6 +568,7 @@ struct oaicompat_parser_options {
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
+    bool enable_thinking = true;
 };
 
 // used by /chat/completions endpoint
@@ -732,6 +733,7 @@ static json oaicompat_chat_params_parse(
     inputs.use_jinja = opt.use_jinja;
     inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
     inputs.reasoning_format = opt.reasoning_format;
+    inputs.enable_thinking = opt.enable_thinking;
     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
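
Putting the pieces together, the budget flows through three hand-offs before it reaches a Jinja template. A condensed sketch of that flow, where the field names (`reasoning_budget`, `enable_thinking`) come from this commit but the trimmed structs and `main` scaffold are illustrative stand-ins for the real types:

    #include <cstdio>

    // Trimmed stand-ins for the structs this commit touches.
    struct common_params            { int  reasoning_budget = -1; };  // common/common.h
    struct oaicompat_parser_options { bool enable_thinking  = true; }; // tools/server/utils.hpp
    struct chat_template_inputs     { bool enable_thinking  = true; }; // chat-template side

    int main() {
        common_params params_base;
        params_base.reasoning_budget = 0;  // as if --reasoning-budget 0 was parsed

        // tools/server/server.cpp: collapse the budget into a boolean once.
        oaicompat_parser_options opt;
        opt.enable_thinking = params_base.reasoning_budget != 0;

        // tools/server/utils.hpp: forward it into the template inputs.
        chat_template_inputs inputs;
        inputs.enable_thinking = opt.enable_thinking;

        // common/chat.cpp: params.enable_thinking = inputs.enable_thinking,
        // which templates like Qwen3's see as `enable_thinking`.
        std::printf("enable_thinking = %s\n", inputs.enable_thinking ? "true" : "false");
        return 0;
    }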
