Skip to content

Commit efa35ac

Browse files
committed
server: implement "skip-queue" feature
1 parent d52e768 commit efa35ac

File tree

3 files changed

+21
-0
lines changed

3 files changed

+21
-0
lines changed

common/common.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
10621062
if (arg == "--spm-infill") {
10631063
params.spm_infill = true;
10641064
return true;
1065+
}
1066+
if (arg == "--skip-queue") {
1067+
params.skip_queue = true;
1068+
return true;
10651069
}
10661070
if (arg == "--grammar") {
10671071
CHECK_ARG
@@ -1452,6 +1456,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
14521456
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
14531457
options.push_back({ "server infill",
14541458
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
1459+
// usage entry for --skip-queue; the printed default must reflect params.skip_queue (not spm_infill)
options.push_back({ "server infill",
1460+
" --skip-queue", "Always discard queue and eval only the last task. (default: %s)", params.skip_queue ? "enabled" : "disabled" });
14551461

14561462
options.push_back({ "sampling" });
14571463
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"

common/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,9 @@ struct gpt_params {
255255
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
256256

257257
bool spm_infill = false; // suffix/prefix/middle pattern for infill
258+
259+
bool skip_queue = false; // always skip server queue (for autocomplete)
260+
258261
};
259262

260263
void gpt_params_handle_hf_token(gpt_params & params);

examples/server/server.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ struct server_metrics {
379379
struct server_queue {
380380
int id = 0;
381381
bool running;
382+
bool skip_queue = false;
382383

383384
// queues
384385
std::vector<server_task> queue_tasks;
@@ -471,6 +472,14 @@ struct server_queue {
471472
lock.unlock();
472473
break;
473474
}
475+
476+
// skip-queue mode: when more than one task is pending, log how many are dropped, keep only the last entry of queue_tasks and clear queue_tasks_deferred (TODO: find a cleaner way to do this)
// NOTE(review): the erased tasks are removed without any notification visible here - confirm their requesters are handled elsewhere
477+
if (skip_queue && queue_tasks.size() > 1 ) {
478+
LOG_INFO("Skipping queued tasks", {{"n_skipped_tasks", queue_tasks_deferred.size() + queue_tasks.size() - 1}});
479+
queue_tasks.erase(queue_tasks.begin(), queue_tasks.end() - 1);
480+
queue_tasks_deferred.clear();
481+
}
482+
474483
server_task task = queue_tasks.front();
475484
queue_tasks.erase(queue_tasks.begin());
476485
lock.unlock();
@@ -2503,6 +2512,9 @@ int main(int argc, char ** argv) {
25032512
// struct that contains llama context and inference
25042513
server_context ctx_server;
25052514

2515+
ctx_server.queue_tasks.skip_queue = params.skip_queue;
2516+
2517+
25062518
if (!params.system_prompt.empty()) {
25072519
ctx_server.system_prompt_set(params.system_prompt);
25082520
}

0 commit comments

Comments
 (0)