@@ -173,6 +173,7 @@ struct server_task_result {
     std::vector<llama_token> tokens;
 
     bool stream;
+    bool include_usage;
     std::string prompt;
     // slot_params generation_params;
 
@@ -500,22 +501,22 @@ struct server_task_result {
             {"model", oaicompat_model},
             {"object", "chat.completion.chunk"},
         });
-
-        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
-        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
-        deltas.push_back({
-            {"choices", json::array()},
-            {"created", t},
-            {"id", oaicompat_cmpl_id},
-            {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"},
-            {"usage", json {
-                {"completion_tokens", n_decoded},
-                {"prompt_tokens", n_prompt_tokens},
-                {"total_tokens", n_decoded + n_prompt_tokens},
-            }},
-        });
-
+        if (include_usage) {
+            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
+            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
+            deltas.push_back({
+                {"choices", json::array()},
+                {"created", t},
+                {"id", oaicompat_cmpl_id},
+                {"model", oaicompat_model},
+                {"object", "chat.completion.chunk"},
+                {"usage", json {
+                    {"completion_tokens", n_decoded},
+                    {"prompt_tokens", n_prompt_tokens},
+                    {"total_tokens", n_decoded + n_prompt_tokens},
+                }},
+            });
+        }
         if (timings.prompt_n >= 0) {
             deltas.back().push_back({"timings", timings.to_json()});
         }
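
(Not part of the patch - illustrative only.) A minimal standalone sketch of the usage-only chunk the guarded block above emits when include_usage is true, assuming nlohmann::json and placeholder values for the token counts and chunk metadata:

// Sketch: builds a final chunk with an empty `choices` array plus `usage`,
// as the OpenAI chat.completion.chunk spec referenced above describes.
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    const int n_decoded       = 42; // hypothetical completion token count
    const int n_prompt_tokens = 17; // hypothetical prompt token count

    json usage_chunk = {
        {"choices", json::array()},
        {"created", 1700000000},
        {"id", "chatcmpl-example"},
        {"model", "example-model"},
        {"object", "chat.completion.chunk"},
        {"usage", json {
            {"completion_tokens", n_decoded},
            {"prompt_tokens", n_prompt_tokens},
            {"total_tokens", n_decoded + n_prompt_tokens},
        }},
    };

    std::cout << usage_chunk.dump(2) << std::endl;
}
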
@@ -547,6 +548,7 @@ struct server_task_multi {
 
 struct slot_params {
     bool stream = true;
+    bool include_usage = false;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
 
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -1359,7 +1361,7 @@ struct server_context {
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
-        const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
         // LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);
 
         oai_parser_opt = {
@@ -1514,6 +1516,8 @@ struct server_context {
         }
         slot.params.timings_per_token = json_value(data, "timings_per_token", false);
         slot.params.stream = json_value(data, "stream", false);
+        auto stream_opt = json_value(data, "stream_options", json::object());
+        slot.params.include_usage = json_value(stream_opt, "include_usage", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", true);
         slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
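
(Not part of the patch - illustrative only.) A minimal standalone sketch of how an OpenAI-style streaming request body maps onto the new include_usage flag, assuming nlohmann::json; the server's own json_value() helper is approximated here with contains()/value():

// Sketch: "stream_options" defaults to an empty object when absent, and
// "include_usage" defaults to false, matching the parsing added above.
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    json data = json::parse(R"({
        "stream": true,
        "stream_options": { "include_usage": true },
        "messages": [ { "role": "user", "content": "Hello" } ]
    })");

    json stream_opt = data.contains("stream_options") ? data["stream_options"] : json::object();
    bool include_usage = stream_opt.value("include_usage", false);

    std::cout << std::boolalpha << "include_usage = " << include_usage << std::endl;
}
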
@@ -2206,6 +2210,7 @@ struct server_context {
         res.error = false;
         res.stop = true; // to do: set value
         res.stream = slot.params.stream;
+        res.include_usage = slot.params.include_usage;
         res.content = slot.generated_text;
         res.oaicompat = slot.params.oaicompat;
         res.oaicompat_model = slot.params.oaicompat_model;