server: implement "skip-queue" feature

stduhpf · stduhpf · commit efa35acedfd2 · 2024-07-18T11:59:25.000+02:00
diff --git a/common/common.cpp b/common/common.cpp
@@ -1062,6 +1062,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--spm-infill") {
         params.spm_infill = true;
         return true;
+    }
+    if (arg == "--skip-queue") { // plain switch: this branch reads no further argv entry
+        params.skip_queue = true;
+        return true;
     }
     if (arg == "--grammar") {
         CHECK_ARG
@@ -1452,6 +1456,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", "       --in-suffix STRING",     "string to suffix after user inputs with (default: empty)" });
     options.push_back({ "server infill",
                                        "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
+    options.push_back({ "server infill", // the default shown must come from skip_queue, not spm_infill
+                                        "       --skip-queue",           "Always discard queue and eval only the last task. (default: %s)", params.skip_queue ? "enabled" : "disabled" });
 
     options.push_back({ "sampling" });
     options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
diff --git a/common/common.h b/common/common.h
@@ -255,6 +255,9 @@ struct gpt_params {
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    bool skip_queue = false; // server: drop all pending tasks except the last queued one (for autocomplete)
+
 };
 
 void gpt_params_handle_hf_token(gpt_params & params);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -379,6 +379,7 @@ struct server_metrics {
 struct server_queue {
     int id = 0;
     bool running;
+    bool skip_queue = false; // when set, pending tasks other than the last queued one are discarded before the next task is taken
 
     // queues
     std::vector<server_task> queue_tasks;
@@ -471,6 +472,14 @@ struct server_queue {
                     lock.unlock();
                     break;
                 }
+
+                // skip-queue mode: when more than one task is pending, keep only the last entry of queue_tasks and drop the rest
+                if (skip_queue && queue_tasks.size() > 1 ) {
+                    LOG_INFO("Skipping queued tasks", {{"n_skipped_tasks", queue_tasks_deferred.size() + queue_tasks.size() - 1}});
+                    queue_tasks.erase(queue_tasks.begin(), queue_tasks.end() - 1); // leaves exactly one element, which front() picks up below
+                    queue_tasks_deferred.clear(); // NOTE(review): only cleared on this path, and dropped tasks are not answered here - confirm callers do not wait on them
+                }
+
                 server_task task = queue_tasks.front();
                 queue_tasks.erase(queue_tasks.begin());
                 lock.unlock();
@@ -2503,6 +2512,9 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;
 
+    ctx_server.queue_tasks.skip_queue = params.skip_queue; // forward the --skip-queue option to the task queue
+
+
     if (!params.system_prompt.empty()) {
         ctx_server.system_prompt_set(params.system_prompt);
     }