Skip to content

Commit 44f5474

Browse files
committed
server: report prompt/generation tokens-per-second (adds n_prompt_second / n_gen_second fields) — original commit message "add backend" does not describe this change
1 parent 2c96bd2 commit 44f5474

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

examples/server/server.cpp

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1303,16 +1303,19 @@ struct server_context {
13031303
}
13041304

13051305
void send_partial_response(server_slot & slot, completion_token_output tkn) {
1306+
const double t_token_generation = (ggml_time_us() - slot.t_start_generation) / 1e3;
1307+
const double n_gen_second = 1e3 / t_token_generation * slot.n_decoded;
13061308
server_task_result res;
13071309
res.id = slot.id_task;
13081310
res.error = false;
13091311
res.stop = false;
13101312
res.data = json {
1311-
{"content", tkn.text_to_send},
1312-
{"stop", false},
1313-
{"id_slot", slot.id},
1314-
{"multimodal", false},
1315-
{"index", slot.index},
1313+
{"content", tkn.text_to_send},
1314+
{"stop", false},
1315+
{"id_slot", slot.id},
1316+
{"multimodal", false},
1317+
{"index", slot.index},
1318+
{"n_gen_second", n_gen_second},
13161319
};
13171320

13181321
if (slot.params.sampling.n_probs > 0) {
@@ -1340,6 +1343,8 @@ struct server_context {
13401343
}
13411344

13421345
void send_final_response(const server_slot & slot) {
1346+
const double n_prompt_second = 1e3 / slot.t_prompt_processing * slot.n_prompt_tokens_processed;
1347+
const double n_gen_second = 1e3 / slot.t_token_generation * slot.n_decoded;
13431348
server_task_result res;
13441349
res.id = slot.id_task;
13451350
res.error = false;
@@ -1351,6 +1356,8 @@ struct server_context {
13511356
{"model", params_base.model_alias},
13521357
{"tokens_predicted", slot.n_decoded},
13531358
{"tokens_evaluated", slot.n_prompt_tokens},
1359+
{"n_prompt_second", n_prompt_second},
1360+
{"n_gen_second", n_gen_second},
13541361
{"generation_settings", get_formated_generation(slot)},
13551362
{"prompt", common_detokenize(ctx, slot.prompt_tokens)},
13561363
{"has_new_line", slot.has_new_line},

examples/server/utils.hpp

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -740,13 +740,24 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
740740
{"model", modelname},
741741
{"object", "chat.completion.chunk"}
742742
};
743+
744+
double n_gen_second = json_value(result, "n_gen_second", 0.0);
745+
743746
if (!finish_reason.empty()) {
744747
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
745748
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
749+
double n_prompt_second = json_value(result, "n_prompt_second", 0.0);
750+
746751
ret.push_back({"usage", json {
747752
{"completion_tokens", num_tokens_predicted},
748753
{"prompt_tokens", num_prompt_tokens},
749-
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
754+
{"total_tokens", num_tokens_predicted + num_prompt_tokens},
755+
{"gen_second", n_gen_second},
756+
{"prompt_second", n_prompt_second}
757+
}});
758+
} else {
759+
ret.push_back({"usage", json {
760+
{"gen_second", n_gen_second}
750761
}});
751762
}
752763

0 commit comments

Comments (0)