@@ -1034,6 +1034,112 @@ struct server_task_result_cmpl_partial : server_task_result {
10341034 }
10351035};
10361036
1037+ struct server_task_result_cmpl_progress : server_task_result {
1038+ int index = 0 ;
1039+
1040+ int32_t n_past;
1041+ int32_t n_prompt_tokens;
1042+ int32_t n_prompt_tokens_processed;
1043+ float progress;
1044+
1045+ // OAI-compat fields
1046+ bool verbose = false ;
1047+ oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
1048+ std::string oaicompat_model;
1049+ std::string oaicompat_cmpl_id;
1050+
1051+ virtual int get_index () override {
1052+ return index;
1053+ }
1054+
1055+ virtual bool is_stop () override {
1056+ return false ; // progress responses are not considered stop
1057+ }
1058+
1059+ virtual json to_json () override {
1060+ switch (oaicompat) {
1061+ case OAICOMPAT_TYPE_NONE:
1062+ return to_json_non_oaicompat ();
1063+ case OAICOMPAT_TYPE_COMPLETION:
1064+ return to_json_oaicompat ();
1065+ case OAICOMPAT_TYPE_CHAT:
1066+ return to_json_oaicompat_chat ();
1067+ default :
1068+ GGML_ASSERT (false && " Invalid oaicompat_type" );
1069+ }
1070+ }
1071+
1072+ json to_json_non_oaicompat () {
1073+ return json {
1074+ {" index" , index},
1075+ {" stop" , false },
1076+ {" id_slot" , id_slot},
1077+ {" prompt_processing" , json {
1078+ {" n_past" , n_past},
1079+ {" n_prompt_tokens" , n_prompt_tokens},
1080+ {" n_prompt_tokens_processed" , n_prompt_tokens_processed},
1081+ {" progress" , progress},
1082+ }},
1083+ };
1084+ }
1085+
1086+ json to_json_oaicompat () {
1087+ std::time_t t = std::time (0 );
1088+ json res = json {
1089+ {" choices" , json::array ({
1090+ json{
1091+ {" text" , " " },
1092+ {" index" , index},
1093+ {" logprobs" , nullptr },
1094+ {" finish_reason" , nullptr },
1095+ }
1096+ })},
1097+ {" created" , t},
1098+ {" model" , oaicompat_model},
1099+ {" system_fingerprint" , build_info},
1100+ {" object" , " text_completion" },
1101+ {" id" , oaicompat_cmpl_id},
1102+ {" prompt_processing" , json {
1103+ {" n_past" , n_past},
1104+ {" n_prompt_tokens" , n_prompt_tokens},
1105+ {" n_prompt_tokens_processed" , n_prompt_tokens_processed},
1106+ {" progress" , progress},
1107+ }},
1108+ };
1109+
1110+ // extra fields for debugging purposes
1111+ if (verbose) {
1112+ res[" __verbose" ] = to_json_non_oaicompat ();
1113+ }
1114+
1115+ return res;
1116+ }
1117+
1118+ json to_json_oaicompat_chat () {
1119+ std::time_t t = std::time (0 );
1120+ return json {
1121+ {" choices" , json::array ({
1122+ json {
1123+ {" finish_reason" , nullptr },
1124+ {" index" , 0 },
1125+ {" delta" , json::object ()},
1126+ },
1127+ })},
1128+ {" created" , t},
1129+ {" id" , oaicompat_cmpl_id},
1130+ {" model" , oaicompat_model},
1131+ {" system_fingerprint" , build_info},
1132+ {" object" , " chat.completion.chunk" },
1133+ {" prompt_processing" , json {
1134+ {" n_past" , n_past},
1135+ {" n_prompt_tokens" , n_prompt_tokens},
1136+ {" n_prompt_tokens_processed" , n_prompt_tokens_processed},
1137+ {" progress" , progress},
1138+ }},
1139+ };
1140+ }
1141+ };
1142+
10371143struct server_task_result_embd : server_task_result {
10381144 int index = 0 ;
10391145 std::vector<std::vector<float >> embedding;
@@ -2515,6 +2621,31 @@ struct server_context {
25152621 queue_results.send (std::move (res));
25162622 }
25172623
2624+ void send_progress_response (server_slot & slot) {
2625+ // Only send progress updates for streaming requests
2626+ if (!slot.params .stream ) {
2627+ return ;
2628+ }
2629+
2630+ auto res = std::make_unique<server_task_result_cmpl_progress>();
2631+
2632+ res->id = slot.id_task ;
2633+ res->id_slot = slot.id ;
2634+ res->index = slot.index ;
2635+
2636+ res->n_past = slot.n_past ;
2637+ res->n_prompt_tokens = slot.n_prompt_tokens ;
2638+ res->n_prompt_tokens_processed = slot.n_prompt_tokens_processed ;
2639+ res->progress = (float ) slot.n_prompt_tokens_processed / slot.n_prompt_tokens ;
2640+
2641+ res->verbose = slot.params .verbose ;
2642+ res->oaicompat = slot.params .oaicompat ;
2643+ res->oaicompat_model = slot.params .oaicompat_model ;
2644+ res->oaicompat_cmpl_id = slot.params .oaicompat_cmpl_id ;
2645+
2646+ queue_results.send (std::move (res));
2647+ }
2648+
25182649 void send_final_response (server_slot & slot) {
25192650 auto res = std::make_unique<server_task_result_cmpl_final>();
25202651 res->id = slot.id_task ;
@@ -2725,6 +2856,7 @@ struct server_context {
27252856 GGML_ASSERT (
27262857 dynamic_cast <server_task_result_cmpl_partial*>(result.get ()) != nullptr
27272858 || dynamic_cast <server_task_result_cmpl_final*>(result.get ()) != nullptr
2859+ || dynamic_cast <server_task_result_cmpl_progress*>(result.get ()) != nullptr
27282860 );
27292861 if (!result_handler (result)) {
27302862 cancel_tasks (id_tasks);
@@ -3340,6 +3472,9 @@ struct server_context {
33403472
33413473 SLT_INF (slot, " prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n " , slot.n_past , batch.n_tokens , (float ) slot.n_prompt_tokens_processed / slot.n_prompt_tokens );
33423474
3475+ // send progress update to client
3476+ send_progress_response (slot);
3477+
33433478 // entire prompt has been processed
33443479 if (slot.n_past == slot.n_prompt_tokens ) {
33453480 slot.state = SLOT_STATE_DONE_PROMPT;
0 commit comments