Commit c0f972b

Use --reasoning-format, remove forced thinking for now

Authored and committed by ochafik
1 parent cc2c712 commit c0f972b

File tree: 10 files changed, +180 -242 lines changed

common/arg.cpp

Lines changed: 7 additions & 5 deletions
@@ -1976,12 +1976,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
-        {"--think"},
-        "*experimental* thinking mode (default: disabled)\n"
-        "returns reasoning_content in messages, forcing model to think unless it supports native <think> tags (DeepSeek R1, Command R7B)\n"
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
         "only supported for non-streamed responses",
-        [](common_params & params) {
-            params.think = true;
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
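For reference, here is a minimal standalone sketch of the string-to-enum mapping the new `--reasoning-format` option performs. `parse_reasoning_format` is a hypothetical helper written for illustration only (and it throws on an invalid value); it is not code from this commit.

```cpp
// Hypothetical helper, for illustration only: mirrors the value-to-enum
// mapping performed by the --reasoning-format option above.
#include <cstdio>
#include <stdexcept>
#include <string>

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK,
};

static common_reasoning_format parse_reasoning_format(const std::string & value) {
    if (value == "deepseek") { return COMMON_REASONING_FORMAT_DEEPSEEK; }
    if (value == "none")     { return COMMON_REASONING_FORMAT_NONE; }
    throw std::invalid_argument("invalid --reasoning-format value: " + value);
}

int main() {
    common_reasoning_format fmt = parse_reasoning_format("deepseek");
    std::printf("parsed: %d\n", (int) fmt); // prints 1 (COMMON_REASONING_FORMAT_DEEPSEEK)
    return 0;
}
```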

common/chat.cpp

Lines changed: 78 additions & 146 deletions
Large diffs are not rendered by default.

common/chat.hpp

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,7 @@ struct common_chat_inputs {
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
-    bool think = false;
+    bool extract_reasoning = true;
 };
 
 enum common_chat_format {
@@ -29,13 +29,13 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_THINK,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
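The header now pairs a boolean `extract_reasoning` on `common_chat_inputs` with `*_EXTRACT_REASONING` variants of the DeepSeek R1 and Command R7B formats. As an illustration only (assumed glue, not taken from this commit; the chat.cpp diff is not rendered above), the flag could be derived from the new `common_reasoning_format` enum and used to pick a format variant roughly like this:

```cpp
// Illustration only (assumed, not from this commit): derive the chat-level
// extract_reasoning flag from the reasoning_format setting and pick the
// matching DeepSeek R1 format variant.
#include <cstdio>

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK,
};

enum chat_format_demo {
    DEMO_DEEPSEEK_R1,
    DEMO_DEEPSEEK_R1_EXTRACT_REASONING,
};

static chat_format_demo pick_deepseek_format(common_reasoning_format fmt) {
    const bool extract_reasoning = (fmt != COMMON_REASONING_FORMAT_NONE);
    return extract_reasoning ? DEMO_DEEPSEEK_R1_EXTRACT_REASONING : DEMO_DEEPSEEK_R1;
}

int main() {
    // prints 1: the extracting variant is chosen when the format is 'deepseek'
    std::printf("%d\n", pick_deepseek_format(COMMON_REASONING_FORMAT_DEEPSEEK) == DEMO_DEEPSEEK_R1_EXTRACT_REASONING);
    return 0;
}
```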

common/common.h

Lines changed: 6 additions & 1 deletion
@@ -202,6 +202,11 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
@@ -346,7 +351,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    bool think = false; // return reasoning_content, force model to think unless it supports native <think> tags.
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
 
     std::vector<std::string> api_keys;

examples/server/README.md

Lines changed: 4 additions & 4 deletions
@@ -127,7 +127,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
-| `--think` | Enable experimental thinking mode (extracts DeepSeek R1 & Command R7B's native thinking tags and forces any other model to think before responding, resulting thoughts are in the `reasoning_content` output field) (requires `--jinja`) |
+| `--reasoning-format FORMAT` | Controls extraction of model thinking traces and the format / field in which they are returned (default: `deepseek`; allowed values: `deepseek`, `none`; requires `--jinja`). `none` will leave thinking traces inline in `message.content` in a model-specific format, while `deepseek` will return them separately under `message.reasoning_content` |
 
 **Example-specific params**
 
@@ -1224,10 +1224,10 @@ curl http://localhost:8080/v1/chat/completions \
 
 # Native support for DeepSeek R1 works best w/ our own template (official template buggy)
 
-llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L --think \
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
   --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
-llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M --think \
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
   --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
 # Native support requires the right template for these GGUFs:
@@ -1241,7 +1241,7 @@ curl http://localhost:8080/v1/chat/completions \
 llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
   --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
 
-llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L --think \
+llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
   --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
 
 # Generic format support
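To make the two modes described in the README concrete, here is an illustrative sketch (assumed message shapes, not actual server output) of the assistant message under each `--reasoning-format` value, built with nlohmann::json as used elsewhere in the server code; the sample strings echo the expectations in the test changes further below.

```cpp
// Illustration only: assumed assistant-message shapes for the two values of
// --reasoning-format, following the README description above.
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    // --reasoning-format deepseek: thoughts extracted into message.reasoning_content
    json deepseek_style = {
        {"role", "assistant"},
        {"reasoning_content", "I need to calculate the sum of 102 and 7..."},
        {"content", "To find the sum of 102 and 7..."},
    };
    // --reasoning-format none: thoughts left inline in message.content
    json none_style = {
        {"role", "assistant"},
        {"content", "<think>\nI need to calculate the sum of 102 and 7...\n</think>\nTo find the sum of 102 and 7..."},
    };
    std::cout << deepseek_style.dump(2) << "\n" << none_style.dump(2) << "\n";
    return 0;
}
```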

examples/server/server.cpp

Lines changed: 2 additions & 2 deletions
@@ -4055,7 +4055,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.think, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4068,7 +4068,7 @@
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.think, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };

examples/server/tests/unit/test_tool_call.py

Lines changed: 53 additions & 54 deletions
@@ -274,44 +274,44 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("think,hf_repo,template_override", [
-    (True, "bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
+@pytest.mark.parametrize("reasoning_format,hf_repo,template_override", [
+    ('deepseek', "bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
 
-    (False, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    (False, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
-    (False, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
-    (False, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
+    (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    (False, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
 
-    (False, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
-    (False, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
 
-    (False, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    (False, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
 
-    (False, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
-    (False, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
+    (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
+    (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
 
-    (False, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
-    (False, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (True, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    ('deepseek', "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 
     # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
-    (False, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
+    (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
 
     # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
 ])
-def test_weather(think: bool, hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_weather(reasoning_format: Literal['deepseek', 'none'] | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
     global server
     n_predict = 512
-    server.think = think
+    server.reasoning_format = reasoning_format
     server.n_slots = 1
     server.jinja = True
     server.n_ctx = 8192
@@ -440,19 +440,19 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("n_predict,think,expect_content,expect_reasoning_content,hf_repo,template_override", [
-    (1024, True, "^The sum of 102 and 7 is 109.*", "^The user's request is straightforward.*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
-    (128, False, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+@pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [
+    # (1024, 'deepseek', "^The sum of 102 and 7 is 109.*", "^The user's request is straightforward.*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    # (128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
 
-    (1024, True, "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    (1024, False, "<think>\nI need[\\s\\S\\r\\n]*?</think>\nTo find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (1024, 'none', "<think>\nI need[\\s\\S\\r\\n]*?</think>\nTo find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 
-    (1024, True, "To find the sum of.*", "First, I need to add the tens place.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
+    (1024, 'deepseek', "To find the sum of.*", "First, I need to add the tens place.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
 ])
-def test_thoughts(n_predict: int, think: bool, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
     global server
     server.n_slots = 1
-    server.think = think
+    server.reasoning_format = reasoning_format
     server.jinja = True
     server.n_ctx = 8192 * 2
     server.n_predict = n_predict
@@ -489,45 +489,44 @@ def test_thoughts(n_predict: int, think: bool, expect_content: str | None, expec
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("think,expected_arguments_override,hf_repo,template_override", [
-    (True, None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    (True, None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
+@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
+    (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
 
-    (False, None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
-    (False, None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
-    (False, None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
+    (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
+    (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
 
-    (False, None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    (False, '{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, '{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
-    (False, None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
+    ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, '{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
-    (False, '{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
+    ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
-    (False, None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
+    (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    (False, None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    (False, None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
 
-    (False, None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
-    (False, None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
 
-    (False, None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    (False, None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+    (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
 
     # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
-    (False, None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
+    (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
 ])
-def test_hello_world(think: bool, expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+def test_hello_world(reasoning_format: Literal['deepseek', 'none'] | None, expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
     global server
     server.n_slots = 1
     server.jinja = True
-    server.think = think
     server.n_ctx = 8192
     server.n_predict = 512 # High because of DeepSeek R1
     server.model_hf_repo = hf_repo

examples/server/tests/utils.py

Lines changed: 3 additions & 3 deletions
@@ -78,7 +78,7 @@ class ServerProcess:
     draft_max: int | None = None
     no_webui: bool | None = None
     jinja: bool | None = None
-    think: bool | None = None
+    reasoning_format: Literal['deepseek', 'none'] | None = None
     chat_template: str | None = None
     chat_template_file: str | None = None
 
@@ -173,8 +173,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.append("--no-webui")
         if self.jinja:
            server_args.append("--jinja")
-        if self.think:
-            server_args.append("--think")
+        if self.reasoning_format:
+            server_args.append("--reasoning-format")
        if self.chat_template:
             server_args.extend(["--chat-template", self.chat_template])
         if self.chat_template_file:

0 commit comments
