diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 852352383bdbe..dbec1eceea35f 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3317,7 +3317,13 @@ struct server_context {
                 }
 
                 // add prompt tokens for processing in the current batch
-                while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                // limit tokens per slot to ensure fairness across multiple slots
+                // this prevents large prompts from blocking other slots (issue #6607)
+                const int max_tokens_per_slot = std::max(1, n_batch / 4);
+                int tokens_added_this_slot = 0;
+                while (slot.n_past < slot.n_prompt_tokens &&
+                       batch.n_tokens < n_batch &&
+                       tokens_added_this_slot < max_tokens_per_slot) {
                     // get next token to process
                     llama_token cur_tok = slot.prompt_tokens[slot.n_past];
                     if (cur_tok == LLAMA_TOKEN_NULL) {
@@ -3332,6 +3338,7 @@
 
                     slot.n_prompt_tokens_processed++;
                     slot.n_past++;
+                    tokens_added_this_slot++;
                 }
 
                 // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
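To illustrate the fairness effect of the per-slot cap, here is a minimal standalone sketch (not part of the PR) that simulates two slots sharing one batch. The `Slot` struct, the loop structure, and the prompt sizes are illustrative assumptions; only the `n_batch / 4` cap mirrors the diff above. Without the cap, the large prompt would fill the entire first batch and the small prompt would wait; with it, both slots make progress every iteration.

```cpp
// Standalone simulation of per-slot token capping during prompt batching.
// All names and values here are hypothetical; they are not server.cpp APIs.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Slot {
    int id;
    int n_prompt_tokens; // total prompt length for this slot
    int n_past = 0;      // prompt tokens already scheduled into batches
};

int main() {
    const int n_batch             = 512;
    const int max_tokens_per_slot = std::max(1, n_batch / 4); // same cap as the diff

    std::vector<Slot> slots = {
        {0, 4000}, // large prompt that would otherwise monopolize the batch
        {1,  100}, // small prompt that would otherwise have to wait
    };

    int  iter    = 0;
    bool pending = true;
    while (pending) {
        pending = false;
        int batch_tokens = 0;
        for (auto & slot : slots) {
            int added = 0;
            // take at most max_tokens_per_slot tokens from this slot per batch
            while (slot.n_past < slot.n_prompt_tokens &&
                   batch_tokens < n_batch &&
                   added < max_tokens_per_slot) {
                slot.n_past++;
                batch_tokens++;
                added++;
            }
            if (slot.n_past < slot.n_prompt_tokens) {
                pending = true;
            }
            std::printf("iter %d: slot %d took %3d tokens (%d/%d done)\n",
                        iter, slot.id, added, slot.n_past, slot.n_prompt_tokens);
        }
        iter++;
    }
    return 0;
}
```

In this toy run the small prompt finishes in the very first batch while the large one keeps consuming its 128-token share per iteration, which is the scheduling behavior the change is aiming for.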