@@ -2558,12 +2558,22 @@ struct server_context {
25582558 // start populating the batch for this iteration
25592559 common_batch_clear (batch);
25602560
2561+ // track if given slot can be batched with slots already in the batch
2562+ server_slot * slot_batched = nullptr ;
2563+
25622565 // first, add sampled tokens from any ongoing sequences
25622565 for (auto & slot : slots) {
25632566 if (slot.state != SLOT_STATE_GENERATING) {
25642567 continue ;
25652568 }
25662569
2570+ // check if we can batch this slot with the previous one
2571+ if (!slot_batched) {
2572+ slot_batched = &slot;
2573+ } else if (slot_batched && !slot_batched->can_batch_with (slot)) {
2574+ continue ;
2575+ }
2576+
25672577 slot.i_batch = batch.n_tokens ;
25682578
25692579 common_batch_add (batch, slot.sampled , slot.n_past , { slot.id }, true );
@@ -2582,17 +2592,16 @@ struct server_context {
25822592 int32_t n_batch = llama_n_batch (ctx);
25832593 int32_t n_ubatch = llama_n_ubatch (ctx);
25842594
2585- // track if given slot can be batched with slots already in the batch
2586- server_slot * slot_batched = nullptr ;
2587-
25882595 // next, batch any pending prompts without exceeding n_batch
25892596 if (params_base.cont_batching || batch.n_tokens == 0 ) {
25902597 for (auto & slot : slots) {
25912598 // check if we can batch this slot with the previous one
2592- if (!slot_batched) {
2593- slot_batched = &slot;
2594- } else if (slot_batched && !slot_batched->can_batch_with (slot)) {
2595- continue ;
2599+ if (slot.is_processing ()) {
2600+ if (!slot_batched) {
2601+ slot_batched = &slot;
2602+ } else if (slot_batched && !slot_batched->can_batch_with (slot)) {
2603+ continue ;
2604+ }
25962605 }
25972606
25982607 // this slot still has a prompt to be processed
0 commit comments