
Commit f8bc466

refactor logic adding tokens to batch

1 parent: a6a3653

2 files changed: +44 additions, -39 deletions

examples/server/server.cpp

Lines changed: 36 additions & 34 deletions
@@ -2859,7 +2859,7 @@ struct server_context {
                res->id = task.id;
                queue_results.send(std::move(res));
            } break;
-
+
        }
    }

@@ -3159,49 +3159,51 @@ struct server_context {
            // remove the non-common part from the cache
            slot.cache_tokens.keep_until(slot.n_past);

-           // add prompt tokens for processing in the current batch
-           while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) {
-               // without pooling, we want to output the embeddings for all the tokens in the batch
-               const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
+           auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past);

-               auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past);
-               if (curr_chunk.tok_image) {
-                   // if there are already TEXT tokens in the batch, we need to process them first
-                   if (batch.batch.n_tokens > 0) {
-                       break;
-                   }
-                   // encode the image
-                   server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id);
-                   GGML_ASSERT(batch.has_embd());
-                   SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens());
+           // check if we should process the image
+           if (curr_chunk.tok_image) {
+               if (batch.has_text()) {
+                   continue; // we cannot have both text batch and image batch
+               }

-                   if (slot.params.cache_prompt) {
-                       slot.cache_tokens.add_image_tokens(curr_chunk.tok_image);
-                   }
+               // encode the image
+               server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id);
+               GGML_ASSERT(batch.has_embd());
+               SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens());

-                   slot.n_past += batch.n_tokens();
-                   slot.n_prompt_tokens_processed += batch.n_tokens();
-                   break; // we cannot have both text batch and image batch
+               if (slot.params.cache_prompt) {
+                   slot.cache_tokens.add_image_tokens(curr_chunk.tok_image);
+               }

-               } else {
-                   GGML_ASSERT(!batch.has_embd());
-                   common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd);
-                   if (slot.params.cache_prompt) {
-                       slot.cache_tokens.add_text_token(curr_chunk.tok_text);
-                   }
+               slot.n_past += batch.n_tokens();
+               slot.n_prompt_tokens_processed += batch.n_tokens();

-                   slot.n_prompt_tokens_processed++;
-                   slot.n_past++;
-               }
+               break; // currently, we can only process one image at a time, so we skip ALL other slots
            }

-           SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+           // add prompt tokens for processing in the current batch
+           while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) {
+               GGML_ASSERT(!batch.has_embd());
+               auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past);
+               if (curr_chunk.tok_text == LLAMA_TOKEN_NULL) {
+                   break; // end of text chunk
+               }

-           if (batch.has_embd()) {
-               // currently, we can only process one image at a time, so we skip other slots
-               break;
+               // without pooling, we want to output the embeddings for all the tokens in the batch
+               const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
+
+               common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd);
+               if (slot.params.cache_prompt) {
+                   slot.cache_tokens.add_text_token(curr_chunk.tok_text);
+               }
+
+               slot.n_prompt_tokens_processed++;
+               slot.n_past++;
            }

+           SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+
            SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);

            // entire prompt has been processed
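Taken together, the refactor replaces one mixed loop with two phases: an image chunk, if present, is handled first and owns the whole batch; only then are plain text tokens packed until the batch is full or the next image chunk is reached. Below is a minimal sketch of that control flow; chunk, batch_state, and add_prompt_chunks are hypothetical stand-ins for illustration, not the server's real API.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-ins for the server types, just to illustrate the flow.
    struct chunk {
        int32_t      tok_text  = -1;      // -1 plays the role of LLAMA_TOKEN_NULL
        const float *tok_image = nullptr; // non-null => this chunk is an image
    };

    struct batch_state {
        std::vector<int32_t> text_tokens;
        bool                 holds_embd = false;

        bool has_text() const { return !text_tokens.empty(); }
        bool has_embd() const { return holds_embd; }
    };

    // Mirrors the refactored per-slot logic: returns true if the slot must be
    // retried later because text is already pending in the batch.
    bool add_prompt_chunks(batch_state & batch, const std::vector<chunk> & prompt,
                           size_t & n_past, size_t n_batch) {
        const chunk & cur = prompt[n_past]; // caller guarantees n_past < prompt.size()

        // image chunk: only allowed to start an otherwise empty batch
        if (cur.tok_image) {
            if (batch.has_text()) {
                return true;          // text must be flushed first ("continue" in the diff)
            }
            batch.holds_embd = true;  // the real code calls server_encode_image() here
            n_past += 1;              // the real code advances by batch.n_tokens()
            return false;             // one image per batch ("break" in the diff)
        }

        // text chunks: pack single-token chunks until the batch is full
        // or the next image chunk is reached (tok_text == -1)
        while (n_past < prompt.size() && batch.text_tokens.size() < n_batch) {
            const chunk & c = prompt[n_past];
            if (c.tok_text == -1) {
                break;
            }
            batch.text_tokens.push_back(c.tok_text);
            n_past += 1;
        }
        return false;
    }

The invariant is the one the GGML_ASSERT calls in the diff encode: a batch holds either one image's embeddings or text tokens, never both.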

examples/server/utils.hpp

Lines changed: 8 additions & 5 deletions
@@ -668,7 +668,7 @@ static json oaicompat_completion_params_parse(
                    p["text"] = "<__image__>";
                    p.erase("image_url");
                }
-           }
+           }
        }

    common_chat_templates_inputs inputs;
@@ -979,9 +979,9 @@ struct server_inp_chunk {

/**
 * server_inputs is a helper to manage the input tokens and image for the server.
- *
+ *
 * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens.
- *
+ *
 * it is made this way to simplify the logic of KV cache management.
 */
struct server_inputs {
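The comment above records the design choice the new server.cpp logic relies on: with one text token per chunk, a chunk index is also a KV cache position, so trimming the cache to the common prefix is a plain truncation. A hypothetical sketch of that layout (the field types are stand-ins; only the tok_text / tok_image shape mirrors the diff):

    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <vector>

    // Hypothetical mirror of the one-token-per-chunk layout: index i in the
    // vector is also KV cache position i, whether the chunk is text or image.
    struct inp_chunk {
        int32_t              tok_text = -1; // one token, or -1 (LLAMA_TOKEN_NULL) for image chunks
        std::shared_ptr<int> tok_image;     // shared across the image's positions (stand-in type)
    };

    struct inputs_sketch {
        std::vector<inp_chunk> chunks;

        // trimming the cache to a common prefix is a plain truncation,
        // in the spirit of cache_tokens.keep_until(slot.n_past)
        void keep_until(size_t n) {
            if (n < chunks.size()) {
                chunks.resize(n);
            }
        }
    };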
@@ -1184,7 +1184,6 @@ struct server_batch {

    void reserve_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
        GGML_ASSERT(n_tokens <= (int32_t)pos.size());
-       seq_ids[n_tokens] = nullptr;
        batch.n_tokens = n_tokens;
        batch.embd = embd;
        batch.token = nullptr;
@@ -1207,7 +1206,11 @@ struct server_batch {
    }

    bool has_embd() const {
-       return batch.embd != nullptr;
+       return batch.embd != nullptr && batch.n_tokens > 0;
+   }
+
+   bool has_text() const {
+       return batch.token != nullptr && batch.n_tokens > 0;
    }
};
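The n_tokens > 0 guard changes what the predicate answers: not "is this batch configured for embeddings" but "does it currently hold embedding tokens", and has_text() mirrors that for text, which is what the new if (batch.has_text()) check in server.cpp needs. A small sketch of the distinction, using a stand-in for llama_batch (not the real struct):

    #include <cassert>
    #include <cstdint>

    // Stand-in for llama_batch, only to show the predicate semantics.
    struct fake_batch {
        int32_t  n_tokens = 0;
        int32_t *token    = nullptr;
        float   *embd     = nullptr;
    };

    static bool has_embd_old(const fake_batch & b) { return b.embd != nullptr; }
    static bool has_embd_new(const fake_batch & b) { return b.embd != nullptr && b.n_tokens > 0; }

    int main() {
        float embd_buf[16] = {};
        fake_batch b;
        b.embd = embd_buf;       // configured for embeddings, but nothing added yet

        assert(has_embd_old(b)); // old predicate: merely "configured" already counts
        assert(!has_embd_new(b));// new predicate: an empty batch does not count

        b.n_tokens = 4;          // embeddings actually added
        assert(has_embd_new(b));
        return 0;
    }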