2 files changed, +8 -9 lines changed
@@ -2588,6 +2588,13 @@ struct server_context {
         // next, batch any pending prompts without exceeding n_batch
         if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
+                // check if we can batch this slot with the previous one
+                if (!slot_batched) {
+                    slot_batched = &slot;
+                } else if (slot_batched && !slot_batched->can_batch_with(slot)) {
+                    continue;
+                }
+
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                     auto & prompt_tokens = slot.prompt_tokens;
@@ -2748,13 +2755,6 @@ struct server_context {
                     }
                 }

-                // check if we can batch this slot with the previous one
-                if (!slot_batched) {
-                    slot_batched = &slot;
-                } else if (slot_batched && !slot_batched->can_batch_with(slot)) {
-                    continue;
-                }
-
                 // keep only the common part
                 if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                     // could not partially delete (likely using a non-Transformer model)
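In the hunks above, the `can_batch_with` compatibility check moves from late in the slot-processing path (old lines 2751-2757) to the very top of the per-slot loop, so a slot that cannot share a batch with the current anchor slot is skipped before any prompt handling runs on it. Below is a minimal, self-contained sketch of the resulting control flow; the `Slot` type and its `lora_id` field are hypothetical stand-ins for whatever per-slot state the real `can_batch_with()` compares:

#include <cstdio>
#include <vector>

// Hypothetical stand-in for the server's slot state; in the real code,
// can_batch_with() compares whatever per-slot settings must match for two
// slots to share a batch (e.g. the active LoRA configuration)
struct Slot {
    int id;
    int lora_id;
    bool can_batch_with(const Slot & other) const {
        return lora_id == other.lora_id;
    }
};

int main() {
    std::vector<Slot> slots = { {0, 1}, {1, 2}, {2, 1} };
    const Slot * slot_batched = nullptr;

    for (const auto & slot : slots) {
        // the check runs first, mirroring the moved block in the diff:
        // the first slot becomes the batch anchor, and any slot that cannot
        // batch with it is skipped before any per-slot work is done
        if (!slot_batched) {
            slot_batched = &slot;
        } else if (!slot_batched->can_batch_with(slot)) {
            continue;
        }
        std::printf("batching slot %d\n", slot.id);
    }
    return 0;
}

With these inputs the sketch batches slots 0 and 2 and skips slot 1. Under the old ordering visible in the second hunk, that skip decision appears to have come only after the prompt-processing branch had already started work on the slot.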
@@ -68,10 +68,9 @@ def test_lora_per_request():
         "temperature": 0.0,
         "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
     })
-    ) for lora, re_test in lora_config]
+    ) for lora, _ in lora_config]
     results = parallel_function_calls(tasks)

-    print(results)
     assert all([res.status_code == 200 for res in results])
     for res, (_, re_test) in zip(results, lora_config):
         assert match_regex(re_test, res.body["content"])