Commit 8afa952

allow decoding image embedding to be split into batches
1 parent cd11585 commit 8afa952

2 files changed: +121 additions, -69 deletions

examples/server/server.cpp

Lines changed: 40 additions & 48 deletions
@@ -1860,7 +1860,8 @@ struct server_context {
 
     llama_context_params cparams_dft;
 
-    server_batch batch;
+    llama_batch batch;
+    server_batch_embd batch_embd;
 
     bool clean_kv_cache = true;
     bool add_bos_token = true;
@@ -1898,6 +1899,8 @@ struct server_context {
 
             llama_batch_free(slot.batch_spec);
         }
+
+        llama_batch_free(batch);
     }
 
     bool load_model(const common_params & params) {
@@ -2034,7 +2037,8 @@ struct server_context {
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
             const int32_t n_batch = llama_n_batch(ctx);
-            batch = server_batch(std::max(n_batch, params_base.n_parallel));
+            batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
+            batch_embd = server_batch_embd(std::max(n_batch, params_base.n_parallel));
         }
 
         metrics.init();
@@ -2931,7 +2935,7 @@ struct server_context {
         }*/
 
         // start populating the batch for this iteration
-        batch.clear();
+        common_batch_clear(batch);
 
         // track if given slot can be batched with slots already in the batch
         server_slot * slot_batched = nullptr;
@@ -2953,9 +2957,9 @@ struct server_context {
                 continue;
             }
 
-            slot.i_batch = batch.n_tokens();
+            slot.i_batch = batch.n_tokens;
 
-            common_batch_add(batch.batch, slot.sampled, slot.n_past, { slot.id }, true);
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
             slot.n_past += 1;
 
@@ -2972,7 +2976,7 @@ struct server_context {
         int32_t n_ubatch = llama_n_ubatch(ctx);
 
         // next, batch any pending prompts without exceeding n_batch
-        if (params_base.cont_batching || batch.n_tokens() == 0) {
+        if (params_base.cont_batching || batch.n_tokens == 0) {
            for (auto & slot : slots) {
                // check if we can batch this slot with the previous one
                if (slot.is_processing()) {
@@ -3140,7 +3144,7 @@ struct server_context {
                 // non-causal tasks require to fit the entire prompt in the physical batch
                 if (slot.is_non_causal()) {
                     // cannot fit the prompt in the current batch - will try next iter
-                    if (batch.n_tokens() + slot.n_prompt_tokens > n_batch) {
+                    if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                         continue;
                     }
                 }
@@ -3163,28 +3167,26 @@ struct server_context {
 
                     // check if we should process the image
                     if (curr_chunk.tok_image) {
-                        if (batch.has_text()) {
-                            continue; // we cannot have both text batch and image batch
+                        // process the image
+                        int32_t res = server_img_process(ctx, mctx, curr_chunk, batch_embd, slot.n_past, slot.id);
+                        if (res != 0) {
+                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+                            slot.release();
+                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+                            continue;
                         }
 
-                        // encode the image
-                        server_encode_image(slot.mctx, batch, curr_chunk, slot.n_past, slot.id);
-                        GGML_ASSERT(batch.has_embd());
-                        SLT_INF(slot, "image encoded, n_past = %d, n_embd_tokens = %d\n", slot.n_past, batch.n_tokens());
-
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.add_image_tokens(curr_chunk.tok_image);
                         }
 
-                        slot.n_past += batch.n_tokens();
-                        slot.n_prompt_tokens_processed += batch.n_tokens();
-
-                        break; // currently, we can only process one image at a time, so we skip ALL other slots
+                        slot.n_past += curr_chunk.n_tokens;
+                        slot.n_prompt_tokens_processed += curr_chunk.n_tokens;
                     }
 
                     // add prompt tokens for processing in the current batch
-                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens() < n_batch) {
-                        GGML_ASSERT(!batch.has_embd());
+                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                        // get next token to process
                         auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past);
                         if (curr_chunk.tok_text == LLAMA_TOKEN_NULL) {
                             break; // end of text chunk
@@ -3193,7 +3195,7 @@ struct server_context {
                         // without pooling, we want to output the embeddings for all the tokens in the batch
                         const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 
-                        common_batch_add(batch.batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd);
+                        common_batch_add(batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd);
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.add_text_token(curr_chunk.tok_text);
                         }
@@ -3204,47 +3206,47 @@ struct server_context {
 
                     // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
 
-                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
                     // entire prompt has been processed
                     if (slot.n_past == slot.n_prompt_tokens) {
                         slot.state = SLOT_STATE_DONE_PROMPT;
 
-                        GGML_ASSERT(batch.n_tokens() > 0);
+                        GGML_ASSERT(batch.n_tokens > 0);
 
                         common_sampler_reset(slot.smpl);
 
                         // Process all prompt tokens through sampler system
                         for (size_t i = 0; i < slot.cache_tokens.n_tokens(); ++i) {
-                            auto & curr_chunk = slot.cache_tokens.get_chunk(i);
+                            auto & curr_chunk = slot.prompt_tokens.get_chunk(i);
                             if (curr_chunk.tok_text != LLAMA_TOKEN_NULL) {
                                 common_sampler_accept(slot.smpl, curr_chunk.tok_text, false);
                             }
                         }
 
                         // extract the logits only for the last token
-                        batch.logits[batch.n_tokens() - 1] = true;
+                        batch.logits[batch.n_tokens - 1] = true;
 
                         slot.n_decoded = 0;
-                        slot.i_batch = batch.n_tokens() - 1;
+                        slot.i_batch = batch.n_tokens - 1;
 
-                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens());
+                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
                     }
                 }
 
-                if (batch.n_tokens() >= n_batch) {
+                if (batch.n_tokens >= n_batch) {
                     break;
                 }
             }
         }
 
-        if (batch.n_tokens() == 0) {
+        if (batch.n_tokens == 0) {
            SRV_WRN("%s", "no tokens to decode\n");
            return;
        }
 
        // debug
-        SRV_DBG("decoding %s batch, n_tokens = %d\n", batch.has_embd() ? "embd" : "text", batch.n_tokens());
+        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
 
        if (slot_batched) {
            // make sure we're in the right embedding mode
@@ -3254,32 +3256,22 @@ struct server_context {
         }
 
         // process the created batch of tokens
-        for (int32_t i = 0; i < batch.n_tokens(); i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, batch.n_tokens() - i);
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
-            // TODO @ngxson : hacky here, we don't want to split the embd batch
-            llama_batch batch_view = batch.has_embd() ? batch.batch : llama_batch{
+            llama_batch batch_view = llama_batch{
                 n_tokens,
-                batch.batch.token + i,
+                batch.token + i,
                 nullptr,
-                batch.batch.pos + i,
-                batch.batch.n_seq_id + i,
-                batch.batch.seq_id + i,
-                batch.batch.logits + i,
+                batch.pos + i,
+                batch.n_seq_id + i,
+                batch.seq_id + i,
+                batch.logits + i,
             };
 
-            // TODO @ngxson : maybe move this to llama_batch_ext
-            if (batch.has_embd() && mtmd_decode_use_non_causal(mctx)) {
-                llama_set_causal_attn(ctx, false);
-            }
-
             const int ret = llama_decode(ctx, batch_view);
             metrics.on_decoded(slots);
 
-            if (batch.has_embd() && mtmd_decode_use_non_causal(mctx)) {
-                llama_set_causal_attn(ctx, true);
-            }
-
             if (ret != 0) {
                 if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
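Note on the decode loop at the end of the diff above: since batch is now a plain llama_batch, each slice is just a view over the parent arrays built with pointer offsets, so nothing is copied per decode call. The struct below is a pared-down stand-in, not the real llama_batch layout, used only to illustrate the aliasing:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// pared-down stand-in for llama_batch: just the fields the view trick relies on
struct toy_batch {
    int32_t   n_tokens;
    int32_t * token;
    int32_t * pos;
    int8_t  * logits;
};

int main() {
    const int32_t n_total = 10, n_batch = 4;

    std::vector<int32_t> token(n_total), pos(n_total);
    std::vector<int8_t>  logits(n_total, 0);
    for (int32_t i = 0; i < n_total; ++i) { token[i] = 100 + i; pos[i] = i; }

    toy_batch batch { n_total, token.data(), pos.data(), logits.data() };

    // same slicing idea as the server decode loop: each view aliases a window of the parent arrays
    for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
        toy_batch view { n_tokens, batch.token + i, batch.pos + i, batch.logits + i };
        printf("view at offset %d holds %d tokens, first pos = %d\n", i, view.n_tokens, view.pos[0]);
    }
    return 0;
}

The same idea carries over to the token, pos, n_seq_id, seq_id, and logits pointers in the real batch_view.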

examples/server/utils.hpp

Lines changed: 81 additions & 21 deletions
@@ -963,6 +963,8 @@ static std::vector<common_adapter_lora_info> parse_lora_request(
 // (may need to refactor in near future)
 //
 
+// each chunk can contain either one SINGLE text token or an image (multiple token embeddings)
+// this is to simplify the logic of KV cache management
 struct server_inp_chunk {
     size_t n_tokens = 1; // always 1 in case of text
     llama_token tok_text;
@@ -981,6 +983,15 @@
  * server_inputs is a helper to manage the input tokens and image for the server.
  *
  * the difference between server_inputs and mtmd_input_chunks is that each chunk of server_inputs only contains a single text token, but text chunk of mtmd_input_chunks can contain multiple tokens.
+ *
+ * for example, server_inputs may contain 5 text tokens followed by 1 image chunk:
+ *     1 41 2635 325 463 <image of 15 tokens>
+ *
+ * in this example:
+ * - n_tokens() returns 5+15 = 20 total tokens
+ * - get_chunk(1) returns chunk containing token ID 41
+ * - get_chunk(5) returns image chunk (15 tokens)
+ * - get_chunk(7) returns same image chunk
  *
  * it is made this way to simplify the logic of KV cache management.
  */
@@ -1079,6 +1090,7 @@ struct server_inputs {
         return ret;
     }
 
+    // make sure all text tokens are within the vocab range
     bool validate(llama_token max_vocab_id) const {
         for (const auto & chunk : chunks) {
             if (!chunk.tok_image) {
@@ -1090,24 +1102,26 @@
         return true;
     }
 
+    // pos is also referred as logical index
     server_inp_chunk & get_chunk(size_t pos) {
-        return chunks[get_chunk_idx(pos)];
+        size_t physical_idx = get_chunk_physical_idx(pos);
+        return chunks[physical_idx];
     }
 
-    size_t get_chunk_idx(size_t pos) const {
+    // returns physical_index
+    size_t get_chunk_physical_idx(size_t logical_idx) const {
         size_t current_pos = 0;
         for (size_t i = 0; i < chunks.size(); ++i) {
             const auto & chunk = chunks[i];
             size_t chunk_end_pos = current_pos + chunk.n_tokens;
-            if (pos < chunk_end_pos) {
+            if (logical_idx < chunk_end_pos) {
                 // The target position 'pos' falls within this chunk
                 return i;
             }
-
             current_pos = chunk_end_pos;
         }
         // If the loop finishes, 'pos' is >= the total number of logical positions
-        return chunks.size();
+        throw std::out_of_range("Position out of range");
     }
 
     // same idea with std::vector<llama_token> resize()
@@ -1164,7 +1178,7 @@ struct server_inputs {
 
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
-struct server_batch {
+struct server_batch_embd {
     std::vector<llama_pos> pos;
     std::vector<llama_token> token;
     std::vector<int32_t> n_seq_id;
@@ -1174,8 +1188,8 @@ struct server_batch {
 
     llama_batch batch;
 
-    server_batch() : server_batch(1) {}
-    server_batch(int32_t n_tokens) {
+    server_batch_embd() : server_batch_embd(1) {}
+    server_batch_embd(int32_t n_tokens) {
         token .resize(n_tokens);
         pos .resize(n_tokens);
         n_seq_id.resize(n_tokens);
@@ -1233,23 +1247,69 @@ struct server_batch {
 };
 
 // TODO @ngxson : quite hacky for now, but just to see if it works
-static int32_t server_encode_image(mtmd_context * mctx, server_batch & batch_out, server_inp_chunk & chunk, llama_pos n_past, llama_seq_id seq_id) {
+static int32_t server_img_process(
+        llama_context * ctx,
+        mtmd_context * mctx,
+        server_inp_chunk & chunk,
+        server_batch_embd & batch,
+        llama_pos n_past,
+        int slot_id) {
     GGML_ASSERT(chunk.tok_image);
-    batch_out.clear();
-
-    int64_t t0 = ggml_time_ms();
-    LOG_INF("encoding image...\n");
-    int32_t ret = mtmd_encode(mctx, chunk.tok_image.get());
-    if (ret != 0) {
-        LOG_ERR("failed to encode image\n");
-        return ret;
+    int32_t ret;
+
+    // encode the image
+    {
+        int64_t t0 = ggml_time_ms();
+        SRV_INF("encoding image (%d tokens)...\n", (int)chunk.n_tokens);
+        ret = mtmd_encode(mctx, chunk.tok_image.get());
+        if (ret != 0) {
+            SRV_ERR("failed to encode image, status = %d\n", ret);
+            return ret;
+        }
+        SRV_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
     }
-    LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
 
-    int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tok_image.get());
     float * embd = mtmd_get_output_embd(mctx);
-    batch_out.reserve_embd_batch(embd, n_tokens, n_past, seq_id);
-    return ret;
+    // decode the embeddings
+    int64_t t1 = ggml_time_ms();
+    int32_t n_embd = llama_model_n_embd(llama_get_model(ctx));
+    int32_t n_tokens = chunk.n_tokens;
+    int32_t n_batch = batch.pos.size();
+    int32_t i_batch = 0;
+    int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
+    // split into batches
+    while (i_batch < n_img_batches) {
+        int32_t pos_offset = i_batch*n_batch;
+        int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+        float * embd_batch = embd + pos_offset*n_embd;
+        batch.clear();
+        batch.reserve_embd_batch(embd_batch, n_tokens_batch, n_past, slot_id);
+
+        SRV_INF("decoding embd batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+
+        // TODO @ngxson : maybe move this to llama_batch_ext
+        if (mtmd_decode_use_non_causal(mctx)) {
+            llama_set_causal_attn(ctx, false);
+        }
+
+        ret = llama_decode(ctx, batch.batch);
+        if (ret != 0) {
+            LOG_ERR("failed to decode image\n");
+            llama_set_causal_attn(ctx, true); // restore causal attn
+            return ret;
+        }
+
+        if (mtmd_decode_use_non_causal(mctx)) {
+            llama_set_causal_attn(ctx, true);
+        }
+
+        i_batch++;
+        n_past += n_tokens_batch;
+    }
+    SRV_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+
+    batch.clear();
+    return 0;
 }
 
 // hacky, support text-only for now
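The logical-to-physical index walk introduced as get_chunk_physical_idx() is easy to check in isolation. The sketch below mirrors that walk with a toy chunk type (toy_chunk and chunk_physical_idx are illustrative names, not part of the tree) and reproduces the doc-comment example of 5 text tokens followed by a 15-token image:

#include <cstdio>
#include <stdexcept>
#include <vector>

// illustrative stand-in for server_inp_chunk: only the token count matters here
struct toy_chunk {
    size_t n_tokens; // 1 for a text token, >1 for an image chunk
};

// mirrors the walk in server_inputs::get_chunk_physical_idx(): map a logical token
// position onto the index of the chunk that covers it
static size_t chunk_physical_idx(const std::vector<toy_chunk> & chunks, size_t logical_idx) {
    size_t current_pos = 0;
    for (size_t i = 0; i < chunks.size(); ++i) {
        const size_t chunk_end_pos = current_pos + chunks[i].n_tokens;
        if (logical_idx < chunk_end_pos) {
            return i;
        }
        current_pos = chunk_end_pos;
    }
    throw std::out_of_range("Position out of range");
}

int main() {
    // 5 text tokens followed by one image chunk of 15 tokens, as in the doc comment
    std::vector<toy_chunk> chunks(5, toy_chunk{1});
    chunks.push_back(toy_chunk{15});

    printf("get_chunk(1) -> chunk %zu\n", chunk_physical_idx(chunks, 1)); // 1 (text token)
    printf("get_chunk(5) -> chunk %zu\n", chunk_physical_idx(chunks, 5)); // 5 (image chunk)
    printf("get_chunk(7) -> chunk %zu\n", chunk_physical_idx(chunks, 7)); // 5 (same image chunk)
    return 0;
}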
