add backend support for tokens-per-second stats (n_gen_second, n_prompt_second) in server responses

lhpqaq · lhpqaq · commit 44f5474062f0 · 2024-11-28T10:38:02.000+08:00
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1303,16 +1303,19 @@ struct server_context {
     }
 
     void send_partial_response(server_slot & slot, completion_token_output tkn) {
+        const double t_token_generation = (ggml_time_us() - slot.t_start_generation) / 1e3;
+        const double n_gen_second = 1e3 / t_token_generation * slot.n_decoded;
         server_task_result res;
         res.id       = slot.id_task;
         res.error    = false;
         res.stop     = false;
         res.data     = json {
-            {"content",    tkn.text_to_send},
-            {"stop",       false},
-            {"id_slot",    slot.id},
-            {"multimodal", false},
-            {"index",      slot.index},
+            {"content",      tkn.text_to_send},
+            {"stop",         false},
+            {"id_slot",      slot.id},
+            {"multimodal",   false},
+            {"index",        slot.index},
+            {"n_gen_second", n_gen_second},
         };
 
         if (slot.params.sampling.n_probs > 0) {
@@ -1340,6 +1343,8 @@ struct server_context {
     }
 
     void send_final_response(const server_slot & slot) {
+        const double n_prompt_second = 1e3 / slot.t_prompt_processing * slot.n_prompt_tokens_processed;
+        const double n_gen_second = 1e3 / slot.t_token_generation * slot.n_decoded;
         server_task_result res;
         res.id       = slot.id_task;
         res.error    = false;
@@ -1351,6 +1356,8 @@ struct server_context {
             {"model",               params_base.model_alias},
             {"tokens_predicted",    slot.n_decoded},
             {"tokens_evaluated",    slot.n_prompt_tokens},
+            {"n_prompt_second",     n_prompt_second},
+            {"n_gen_second",        n_gen_second},
             {"generation_settings", get_formated_generation(slot)},
             {"prompt",              common_detokenize(ctx, slot.prompt_tokens)},
             {"has_new_line",        slot.has_new_line},
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
@@ -740,13 +740,24 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
         {"model",   modelname},
         {"object",  "chat.completion.chunk"}
     };
+    
+    double n_gen_second      = json_value(result, "n_gen_second", 0.0);
+
     if (!finish_reason.empty()) {
         int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
         int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+        double n_prompt_second   = json_value(result, "n_prompt_second", 0.0);
+
         ret.push_back({"usage", json {
             {"completion_tokens", num_tokens_predicted},
             {"prompt_tokens",     num_prompt_tokens},
-            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+            {"total_tokens",      num_tokens_predicted + num_prompt_tokens},
+            {"gen_second",        n_gen_second},
+            {"prompt_second",     n_prompt_second}
+        }});
+    } else {
+        ret.push_back({"usage", json {
+            {"gen_second",        n_gen_second}
         }});
     }