Commit 63cab13

server: add prompt processing progress streaming for /completion endpoint
- Add server_task_result_cmpl_progress struct for streaming progress updates
- Implement send_progress_response() function for real-time progress reporting
- Send progress info during the prompt processing phase, before token generation
- Support all compatibility modes (non-OAI, OAI completion, OAI chat)
- Include detailed progress data: n_past, n_prompt_tokens, n_prompt_tokens_processed, progress fraction
- Only send progress updates in streaming mode (stream: true)
- Maintain backward compatibility with existing clients

Closes #14685
1 parent 21c0217 commit 63cab13
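For a sense of what clients receive, here is a sketch of one streamed progress chunk in the default (non-OAI) mode; the field names come from to_json_non_oaicompat() in the diff below, while the values are purely illustrative:

    {
        "index": 0,
        "stop": false,
        "id_slot": 0,
        "prompt_processing": {
            "n_past": 1024,
            "n_prompt_tokens": 4096,
            "n_prompt_tokens_processed": 1024,
            "progress": 0.25
        }
    }

Note that progress is reported as a fraction in [0, 1] (n_prompt_tokens_processed / n_prompt_tokens), not a percentage.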

File tree

1 file changed: +135 -0 lines changed

tools/server/server.cpp

Lines changed: 135 additions & 0 deletions
@@ -1034,6 +1034,112 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 };
 
+struct server_task_result_cmpl_progress : server_task_result {
+    int index = 0;
+
+    int32_t n_past;
+    int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_processed;
+    float   progress;
+
+    // OAI-compat fields
+    bool           verbose   = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string    oaicompat_model;
+    std::string    oaicompat_cmpl_id;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual bool is_stop() override {
+        return false; // progress responses are not considered stop
+    }
+
+    virtual json to_json() override {
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
+    }
+
+    json to_json_non_oaicompat() {
+        return json {
+            {"index",   index},
+            {"stop",    false},
+            {"id_slot", id_slot},
+            {"prompt_processing", json {
+                {"n_past",                    n_past},
+                {"n_prompt_tokens",           n_prompt_tokens},
+                {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+                {"progress",                  progress},
+            }},
+        };
+    }
+
+    json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json res = json {
+            {"choices", json::array({
+                json{
+                    {"text",          ""},
+                    {"index",         index},
+                    {"logprobs",      nullptr},
+                    {"finish_reason", nullptr},
+                }
+            })},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "text_completion"},
+            {"id",                 oaicompat_cmpl_id},
+            {"prompt_processing", json {
+                {"n_past",                    n_past},
+                {"n_prompt_tokens",           n_prompt_tokens},
+                {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+                {"progress",                  progress},
+            }},
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+
+        return res;
+    }
+
+    json to_json_oaicompat_chat() {
+        std::time_t t = std::time(0);
+        return json {
+            {"choices", json::array({
+                json {
+                    {"finish_reason", nullptr},
+                    {"index",         0},
+                    {"delta",         json::object()},
+                },
+            })},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"},
+            {"prompt_processing", json {
+                {"n_past",                    n_past},
+                {"n_prompt_tokens",           n_prompt_tokens},
+                {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+                {"progress",                  progress},
+            }},
+        };
+    }
+};
+
 struct server_task_result_embd : server_task_result {
     int index = 0;
     std::vector<std::vector<float>> embedding;
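In OAI chat mode, the same counters ride along on an otherwise empty chat.completion.chunk, so standard SSE parsers that only inspect choices[].delta can safely ignore it. A sketch of one such chunk, with illustrative values for created, id, model, and system_fingerprint:

    {
        "choices": [
            {
                "finish_reason": null,
                "index": 0,
                "delta": {}
            }
        ],
        "created": 1721000000,
        "id": "chatcmpl-abc123",
        "model": "llama-3-8b-instruct",
        "system_fingerprint": "b3456",
        "object": "chat.completion.chunk",
        "prompt_processing": {
            "n_past": 2048,
            "n_prompt_tokens": 4096,
            "n_prompt_tokens_processed": 2048,
            "progress": 0.5
        }
    }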
@@ -2515,6 +2621,31 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    void send_progress_response(server_slot & slot) {
+        // Only send progress updates for streaming requests
+        if (!slot.params.stream) {
+            return;
+        }
+
+        auto res = std::make_unique<server_task_result_cmpl_progress>();
+
+        res->id      = slot.id_task;
+        res->id_slot = slot.id;
+        res->index   = slot.index;
+
+        res->n_past                    = slot.n_past;
+        res->n_prompt_tokens           = slot.n_prompt_tokens;
+        res->n_prompt_tokens_processed = slot.n_prompt_tokens_processed;
+        res->progress                  = (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens;
+
+        res->verbose           = slot.params.verbose;
+        res->oaicompat         = slot.params.oaicompat;
+        res->oaicompat_model   = slot.params.oaicompat_model;
+        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
+
+        queue_results.send(std::move(res));
+    }
+
     void send_final_response(server_slot & slot) {
         auto res = std::make_unique<server_task_result_cmpl_final>();
         res->id = slot.id_task;
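Since send_progress_response() returns early for non-streaming requests, the new chunks can be observed with any streaming client. For example, assuming a server already running on the default localhost:8080 and a prompt long enough to span several batches:

    curl -N http://localhost:8080/completion \
        -H "Content-Type: application/json" \
        -d '{"prompt": "<a few thousand tokens of text>", "n_predict": 32, "stream": true}'

With stream: true set, each processed prompt batch should yield one progress chunk before the first generated-token chunk arrives (see the call site in the last hunk below).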
@@ -2725,6 +2856,7 @@ struct server_context {
             GGML_ASSERT(
                    dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
                 || dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
+                || dynamic_cast<server_task_result_cmpl_progress*>(result.get()) != nullptr
             );
             if (!result_handler(result)) {
                 cancel_tasks(id_tasks);
@@ -3340,6 +3472,9 @@ struct server_context {
 
                 SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
+                // send progress update to client
+                send_progress_response(slot);
+
                 // entire prompt has been processed
                 if (slot.n_past == slot.n_prompt_tokens) {
                     slot.state = SLOT_STATE_DONE_PROMPT;
