
Commit 594facb

added no-prefill-assistant flag
1 parent 3e0be1c commit 594facb

File tree

4 files changed: +15 −1 lines changed


common/arg.cpp

Lines changed: 10 additions & 0 deletions
```diff
@@ -2880,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
```

common/common.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -368,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, the last server message will be prefilled into the response
 
     std::vector<std::string> api_keys;
```

tools/server/server.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -4348,6 +4348,7 @@ int main(int argc, char ** argv) {
         json data = oaicompat_completion_params_parse(
             body,
             params.use_jinja,
+            params.prefill_assistant,
             params.reasoning_format,
             ctx_server.chat_templates.get(),
             ctx_server.mctx,
@@ -4369,6 +4370,7 @@
         json data = oaicompat_completion_params_parse(
             body,
             params.use_jinja,
+            params.prefill_assistant,
             params.reasoning_format,
             ctx_server.chat_templates.get(),
             ctx_server.mctx,
```

tools/server/utils.hpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -583,6 +583,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
     const struct common_chat_templates * tmpls,
     bool allow_non_text,
@@ -732,7 +733,7 @@ static json oaicompat_completion_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
```
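For context, below is a minimal sketch of what the guarded prefill does, using the same condition as the changed line above. The `msg` struct and the toy `<|role|>` template are simplified stand-ins for `common_chat_msg` and the real chat-template rendering, not the actual implementation:

```cpp
#include <string>
#include <vector>

// simplified stand-in for common_chat_msg
struct msg {
    std::string role;
    std::string content;
};

std::string render_prompt(std::vector<msg> messages, bool prefill_assistant) {
    // same condition as the changed line in this commit
    const bool prefill = !messages.empty()
                      && messages.back().role == "assistant"
                      && prefill_assistant;

    msg last;
    if (prefill) {
        last = messages.back();
        messages.pop_back(); // render only the completed turns below
    }

    std::string prompt;
    for (const auto & m : messages) {
        prompt += "<|" + m.role + "|>" + m.content + "<|end|>\n"; // closed turn
    }
    if (prefill) {
        // re-open the assistant turn with its partial content and no <|end|>,
        // so generation continues this message rather than starting a new reply
        prompt += "<|assistant|>" + last.content;
    }
    return prompt;
}
```

With `--no-prefill-assistant`, `prefill_assistant` is false, so a trailing assistant message is closed like any other turn and the model starts a fresh reply instead of continuing it.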
