
Commit 80e3672

server : remove pos state from server_tokens
1 parent: 610cab4

File tree

2 files changed: +21 −40 lines


tools/server/server.cpp

Lines changed: 5 additions & 5 deletions
@@ -3623,7 +3623,7 @@ struct server_context {
 
             slot.i_batch = batch.n_tokens;
 
-            common_batch_add(batch, slot.sampled, slot.prompt.n_tokens(), { slot.id }, true);
+            common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
 
             slot.prompt.tokens.push_back(slot.sampled);
 
@@ -3927,7 +3927,7 @@ struct server_context {
             if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
                 // process the image
                 size_t n_tokens_out = 0;
-                int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.id, n_tokens_out);
+                int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
                 if (res != 0) {
                     SLT_ERR(slot, "failed to process image, res = %d\n", res);
                     send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -3994,7 +3994,7 @@ struct server_context {
                 // embedding requires all tokens in the batch to be output
                 common_batch_add(batch,
                     cur_tok,
-                    input_tokens.get_pos(slot.prompt.n_tokens()),
+                    slot.prompt.tokens.pos_next(),
                     { slot.id },
                     slot.need_embd());
                 slot.prompt.tokens.push_back(cur_tok);
@@ -4291,10 +4291,10 @@ struct server_context {
 
             // construct the speculation batch
             common_batch_clear(slot.batch_spec);
-            common_batch_add (slot.batch_spec, id, slot.prompt.n_tokens(), { slot.id }, true);
+            common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true);
 
             for (size_t i = 0; i < draft.size(); ++i) {
-                common_batch_add(slot.batch_spec, draft[i], slot.prompt.n_tokens() + 1 + i, { slot.id }, true);
+                common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true);
             }
 
             SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
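Every server.cpp call site makes the same substitution: the next position now comes from the token list itself via pos_next(), rather than from the token count or the removed per-token pos array. The two values coincide for text-only prompts but diverge once media chunks occupy a different number of positions than tokens (M-RoPE). Below is a toy illustration, not part of the commit, of how the speculation batch above lays out positions; the pos_next value (9, taken from the worked example further down) and the draft token ids are made up:

#include <cstdio>
#include <vector>

int main() {
    const int pos_next = 9;                       // stands in for slot.prompt.tokens.pos_next()
    const std::vector<int> draft = {42, 43, 44};  // hypothetical draft tokens

    // mirrors: common_batch_add(slot.batch_spec, id, pos_next, ...)
    printf("target token at pos %d\n", pos_next);
    for (size_t i = 0; i < draft.size(); ++i) {
        // mirrors: common_batch_add(slot.batch_spec, draft[i], pos_next + 1 + i, ...)
        printf("draft[%zu] at pos %d\n", i, pos_next + 1 + (int) i);
    }
    return 0;
}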

tools/server/utils.hpp

Lines changed: 16 additions & 35 deletions
@@ -1081,7 +1081,7 @@ struct server_tokens {
 private: // disallow accessing these members directly, risking out-of-sync
 
     // map a **start** index in tokens to the image chunk
-    // note: the order need to be in-sync with tokens and pos
+    // note: the order need to be in-sync with tokens
     std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
 
     // list of tokens
@@ -1090,10 +1090,6 @@ struct server_tokens {
     // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
     llama_tokens tokens;
 
-    // the position per-token (llama_pos) in the overall input
-    // useful for M-RoPE, where the position is different from the index in tokens
-    std::vector<llama_pos> pos;
-
     // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
     //     [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
     // idx  0   1   2   3   4   5      6      7      8      9      10
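With the pos vector gone, nothing stores these positions explicitly; pos_next() recomputes the next position from the token count plus a per-media correction of n_pos − n_tokens. A worked check of the layout above, with the values assumed from the comment (5 text tokens, 2 images at 3 tokens / 2 positions each):

#include <cassert>

int main() {
    const int n_tokens   = 5 + 2 * 3;    // 11 entries in the token list
    const int correction = 2 * (2 - 3);  // per image: n_pos - n_tokens = -1
    const int pos_next   = n_tokens + correction;
    assert(pos_next == 9);               // text at pos 0..4, img0 at 5..6, img1 at 7..8
    return 0;
}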
@@ -1124,28 +1120,21 @@ struct server_tokens {
     }
 
     server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
-        for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
-            pos.push_back(i);
-        }
     }
 
-    llama_pos next_pos() const {
-        if (tokens.empty()) {
-            return 0;
-        } else if (tokens.back() != LLAMA_TOKEN_NULL) {
-            return pos.back() + 1;
-        } else {
-            // find the last media chunk
-            GGML_ASSERT(has_mtmd);
-            GGML_ASSERT(!map_idx_to_media.empty());
-            const auto & chunk = map_idx_to_media.rbegin()->second;
-            return pos.back() + mtmd_input_chunk_get_n_pos(chunk.get());
+    llama_pos pos_next() const {
+        if (!has_mtmd) {
+            return tokens.size();
         }
-    }
 
-    llama_pos get_pos(size_t idx) const {
-        GGML_ASSERT(idx < pos.size());
-        return pos[idx];
+        llama_pos res = tokens.size();
+
+        for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
+            const auto & chunk = it->second;
+            res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
+        }
+
+        return res;
     }
 
     // for debugging
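For readers without the surrounding class, here is a minimal, self-contained sketch of the new recomputation. The toy_tokens type and the plain-int media metadata are simplifications for illustration, not the commit's types (the real code uses llama_tokens and mtmd::input_chunk_ptr with mtmd_input_chunk_get_n_tokens / mtmd_input_chunk_get_n_pos):

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct toy_tokens {
    std::vector<int> tokens;                      // -1 stands in for LLAMA_TOKEN_NULL
    std::map<size_t, std::pair<int, int>> media;  // start idx -> {n_tokens, n_pos}
    bool has_mtmd = true;

    int pos_next() const {
        if (!has_mtmd) {
            return (int) tokens.size();           // text-only: position == index
        }
        int res = (int) tokens.size();
        for (const auto & [idx, chunk] : media) {
            res += chunk.second - chunk.first;    // n_pos - n_tokens per media chunk
        }
        return res;
    }
};

int main() {
    toy_tokens t;
    t.tokens = {10, 11, 12, 13, 14};              // 5 text tokens
    for (int img = 0; img < 2; ++img) {           // 2 images: 3 tokens, 2 positions each
        t.media[t.tokens.size()] = {3, 2};
        t.tokens.insert(t.tokens.end(), 3, -1);
    }
    printf("pos_next = %d\n", t.pos_next());      // prints 9 (not 11)
    return 0;
}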
@@ -1154,12 +1143,11 @@ struct server_tokens {
         oss << "tokens: ";
         for (size_t idx = 0; idx < tokens.size(); ++idx) {
             llama_token t = tokens[idx];
-            llama_pos p = pos[idx];
             oss << "idx:" << idx << " ";
             if (t == LLAMA_TOKEN_NULL) {
-                oss << "<embd>(" << p << ")\n";
+                oss << "<embd> ";
             } else {
-                oss << t << "(" << p << ")\n";
+                oss << t << " ";
             }
         }
         oss << "\n";
@@ -1182,7 +1170,6 @@ struct server_tokens {
         if (tok == LLAMA_TOKEN_NULL) {
             throw std::runtime_error("Invalid token");
         }
-        pos.emplace_back(next_pos());
         tokens.emplace_back(tok);
     }
 
@@ -1192,10 +1179,8 @@ struct server_tokens {
         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
             GGML_ASSERT(has_mtmd);
             const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
-            const llama_pos cur_pos = next_pos();
             size_t start_idx = tokens.size();
             for (size_t i = 0; i < n_tokens; ++i) {
-                pos.emplace_back(cur_pos);
                 tokens.emplace_back(LLAMA_TOKEN_NULL);
             }
             mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
@@ -1233,11 +1218,6 @@ struct server_tokens {
     void insert(const llama_tokens & inp_tokens) {
         GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
         tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
-        // rebuild the pos vector
-        pos.clear();
-        for (llama_pos i = 0; i < (llama_pos)tokens.size(); ++i) {
-            pos.emplace_back(i);
-        }
     }
 
     // for compatibility with speculative decoding, ctx shift, slot save/load
@@ -1386,6 +1366,7 @@ struct server_tokens {
             llama_context * ctx,
             mtmd_context * mctx,
             size_t idx,
+            llama_pos n_past,
             int32_t seq_id,
             size_t & n_tokens_out) const {
         const auto & chunk = find_chunk(idx);
@@ -1397,7 +1378,7 @@ struct server_tokens {
         llama_pos new_n_past; // unused for now
         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
                 chunk.get(),
-                pos[idx], // position
+                n_past,
                 seq_id,
                 n_batch,
                 true, // logits last
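The key distinction in the new process_chunk signature is that idx indexes into the token list while n_past is a position; the caller computes the latter via pos_next() instead of process_chunk reading the removed pos[idx]. A small illustration using the example layout from the comments above (values derived from that layout, not from the commit: img1 starts at token index 8 but at position 7):

#include <cstdio>

int main() {
    const size_t idx    = 5 + 3;  // start index of img1: 5 text tokens + 3 img0 token slots
    const int    n_past = 5 + 2;  // its position: 5 text positions + 2 img0 positions
    printf("chunk at idx %zu is evaluated at pos %d\n", idx, n_past);
    return 0;
}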
