
Commit 594facb

added no-prefill-assistant flag
1 parent 3e0be1c commit 594facb

File tree

4 files changed: +15 −1 lines changed


common/arg.cpp

Lines changed: 10 additions & 0 deletions
```diff
@@ -2880,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
```

common/common.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -368,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, the last server message will be prefilled into the response
 
     std::vector<std::string> api_keys;
```

tools/server/server.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -4348,6 +4348,7 @@ int main(int argc, char ** argv) {
         json data = oaicompat_completion_params_parse(
             body,
             params.use_jinja,
+            params.prefill_assistant,
             params.reasoning_format,
             ctx_server.chat_templates.get(),
             ctx_server.mctx,
@@ -4369,6 +4370,7 @@
         json data = oaicompat_completion_params_parse(
             body,
             params.use_jinja,
+            params.prefill_assistant,
             params.reasoning_format,
             ctx_server.chat_templates.get(),
             ctx_server.mctx,
```

tools/server/utils.hpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -583,6 +583,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
     const struct common_chat_templates * tmpls,
     bool allow_non_text,
@@ -732,7 +733,7 @@ static json oaicompat_completion_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
```
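For context, below is a minimal sketch of what the guarded prefill does, using the same condition as the changed line above. The `msg` struct and the toy `<|role|>` template are simplified stand-ins for `common_chat_msg` and the real chat-template rendering, not the actual implementation:

```cpp
#include <string>
#include <vector>

// simplified stand-in for common_chat_msg
struct msg {
    std::string role;
    std::string content;
};

std::string render_prompt(std::vector<msg> messages, bool prefill_assistant) {
    // same condition as the changed line in this commit
    const bool prefill = !messages.empty()
                      && messages.back().role == "assistant"
                      && prefill_assistant;

    msg last;
    if (prefill) {
        last = messages.back();
        messages.pop_back(); // render only the completed turns below
    }

    std::string prompt;
    for (const auto & m : messages) {
        prompt += "<|" + m.role + "|>" + m.content + "<|end|>\n"; // closed turn
    }
    if (prefill) {
        // re-open the assistant turn with its partial content and no <|end|>,
        // so generation continues this message rather than starting a new reply
        prompt += "<|assistant|>" + last.content;
    }
    return prompt;
}
```

With `--no-prefill-assistant`, `prefill_assistant` is false, so a trailing assistant message is closed like any other turn and the model starts a fresh reply instead of continuing it.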
