Commit edc53a9

improve it
1 parent: ef1af68

7 files changed (+89, -35 lines)

include/llama.h

Lines changed: 2 additions & 1 deletion
@@ -218,6 +218,7 @@ extern "C" {
     //  - token  : the token ids of the input (used when embd is NULL)
     //  - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     //  - pos    : the positions of the respective token in the sequence
+    //             (for M-RoPE, first `n_tokens` are linearly increasing, followed by `n_pos_per_embd * n_tokens` positions for RoPE)
     //             (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     //  - seq_id : the sequence to which the respective token belongs
     //             (if set to NULL, the sequence ID will be assumed to be 0)
@@ -232,7 +233,7 @@ extern "C" {

         llama_token  *  token;
         float        *  embd;
-        llama_pos    *  pos; // first `n_tokens` elements are always linearly increasing position for traditional llm
+        llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       *  logits; // TODO: rename this to "output"
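
A minimal sketch of the new `pos` contract (hypothetical, not part of this commit): filling the array for an M-RoPE embedding batch. The helper name and the choice of n_pos_per_embd = 4 are assumptions for illustration, and the M-RoPE block itself is taken as already computed elsewhere.

    #include "llama.h"

    // hypothetical illustration of the layout described in the comment above
    static void fill_pos_for_mrope_embd_batch(
                  llama_pos * pos,       // destination: (1 + 4) * n_tokens entries
            const llama_pos * mrope_pos, // precomputed M-RoPE positions: 4 * n_tokens entries
            int32_t n_tokens, llama_pos pos_0) {
        const int32_t n_pos_per_embd = 4;
        // first n_tokens entries: plain linearly increasing positions,
        // still used for causal attention masking
        for (int32_t i = 0; i < n_tokens; i++) {
            pos[i] = pos_0 + i;
        }
        // remaining n_pos_per_embd * n_tokens entries: the model-specific M-RoPE positions
        for (int64_t j = 0; j < (int64_t) n_pos_per_embd * n_tokens; j++) {
            pos[n_tokens + j] = mrope_pos[j];
        }
    }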

src/llama-batch.cpp

Lines changed: 4 additions & 1 deletion
@@ -639,7 +639,10 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u

     auto udata = std::make_shared<llama_ubatch::data_t>();

-    const int32_t n_pos_cur = batch.embd ? (n_pos_per_embd + 1) : 1;
+    const int32_t n_pos_per_embd_inp = n_pos_per_embd > 1
+        ? (n_pos_per_embd + 1) // include the extra linearly increasing positions for M-RoPE
+        : 1;                   // standard RoPE
+    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd_inp : 1;

     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
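
The resulting buffer size can be sanity-checked with a small standalone sketch (values assumed, not from the commit): a text batch keeps one position per token, while an M-RoPE embedding batch with n_pos_per_embd = 4 now allocates five.

    #include <cassert>
    #include <cstdint>

    // illustrative check of the sizing logic above
    int main() {
        const int32_t n_tokens       = 8;
        const int32_t n_pos_per_embd = 4;    // M-RoPE model (e.g. Qwen2-VL)
        const bool    has_embd       = true; // embedding (image) batch

        const int32_t n_pos_per_embd_inp = n_pos_per_embd > 1
            ? n_pos_per_embd + 1  // extra linearly increasing positions for M-RoPE
            : 1;                  // standard RoPE
        const int32_t n_pos_cur = has_embd ? n_pos_per_embd_inp : 1;

        const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
        assert(n_pos_all == 40); // 8 linear positions + 4*8 M-RoPE positions
        return 0;
    }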

src/llama-graph.cpp

Lines changed: 4 additions & 7 deletions
@@ -54,13 +54,10 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
             }
             ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
         } else {
-            llama_pos * pos_ptr = ubatch->pos;
-            // Normally, ubatch->pos stores linearly increasing position
-            // However, some multi-modal models requires special position embedding (e.g. M-Rope in qwen2vl and qwen2.5vl)
-            // But linearly increasing position is still needed for proper causal attention masking
-            // So we store both of them: the first n_tokens elements are not changed, while model-specific positions are appended after that.
-            if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions
-            ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));
+            const bool has_mrope = ubatch->embd && n_pos_per_embd > 1;
+            ggml_backend_tensor_set(pos,
+                    ubatch->pos + (has_mrope ? n_tokens : 0), // skip the first n_tokens positions for M-RoPE
+                    0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
         }
     }
 }
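
On the consumer side, only the n_pos_per_embd * n_tokens M-RoPE block is uploaded to the graph's `pos` tensor and the linear prefix is skipped; the standalone sketch below (assumed values, not from the commit) spells out the offset and byte count involved.

    #include <cstdint>
    #include <cstdio>

    // illustrative arithmetic for the tensor copy above
    int main() {
        const int64_t n_tokens       = 8;
        const int64_t n_pos_per_embd = 4;               // M-RoPE model
        const bool    has_mrope      = true;            // embedding batch on an M-RoPE model
        const size_t  elt_size       = sizeof(int32_t); // llama_pos is a 32-bit integer

        const int64_t src_offset = has_mrope ? n_tokens : 0;                    // skip the linear prefix
        const size_t  n_bytes    = (size_t) (n_tokens*n_pos_per_embd*elt_size); // 8*4*4 = 128 bytes

        printf("copy %zu bytes starting at ubatch->pos + %lld\n", n_bytes, (long long) src_offset);
        return 0;
    }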

tools/mtmd/mtmd-cli.cpp

Lines changed: 2 additions & 1 deletion
@@ -191,7 +191,8 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {

         // eval the token
         common_batch_clear(ctx.batch);
-        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
+        int max_pos = llama_memory_seq_pos_max(llama_get_memory(ctx.lctx), 0);
+        common_batch_add(ctx.batch, token_id, max_pos+1, {0}, true);
         if (llama_decode(ctx.lctx, ctx.batch)) {
             LOG_ERR("failed to decode token\n");
             return 1;
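
The same pattern can be isolated into a small hypothetical fragment (not from the commit): instead of a manually tracked ctx.n_past, the next position is derived from the memory module, which stays correct after M-RoPE image chunks whose positions do not advance one-per-token.

    #include "common.h"
    #include "llama.h"

    // decode one sampled token on sequence 0 without tracking n_past by hand
    // (assumes an initialized llama_context and llama_batch from the same llama.cpp tree)
    static int decode_one_token(llama_context * lctx, llama_batch & batch, llama_token token_id) {
        common_batch_clear(batch);
        // highest position already stored for seq 0 in the memory (KV cache);
        // -1 when the sequence is empty, so max_pos + 1 is always the next free position
        const llama_pos max_pos = llama_memory_seq_pos_max(llama_get_memory(lctx), 0);
        common_batch_add(batch, token_id, max_pos + 1, {0}, /*logits =*/ true);
        return llama_decode(lctx, batch);
    }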

tools/mtmd/mtmd-helper.cpp

Lines changed: 62 additions & 24 deletions
@@ -55,15 +55,11 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {

 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
-// notes2: Normally, batch's `pos` stores linearly increasing position
-// However, some multi-modal models requires special position embedding (e.g. M-Rope in qwen2vl and qwen2.5vl)
-// But linearly increasing position is still needed for proper causal attention masking
-// So we store both of them: the first n_tokens elements are not changed, while model-specific positions are appended after that.
-// So `pos` has `n_tokens * (n_pos_per_embd + 1)` elements
 struct decode_embd_batch {
     int n_pos_per_embd;
     int n_mmproj_embd;
-    std::vector<llama_pos>      pos;
+    std::vector<llama_pos>      pos; // for M-RoPE, this will have (1+n_pos_per_embd)*n_tokens elements
+                                     // the extra n_tokens are for linearly increasing positions
     std::vector<llama_pos>      pos_view; // used by mrope
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id>   seq_id_0;
@@ -171,6 +167,59 @@ struct decode_embd_batch {
     }
 };

+// helper struct to make working with embd batch easier
+struct decode_text_batch {
+    std::vector<llama_token>    tokens;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_seq_id seq_id;
+    llama_batch  batch;
+    decode_text_batch(int32_t n_tokens, llama_seq_id seq_id) : seq_id(seq_id) {
+        tokens  .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_ids[n_tokens] = nullptr;
+        for (int32_t i = 0; i < n_tokens; i++) {
+            n_seq_id[i] = 1;
+            seq_ids [i] = &this->seq_id;
+        }
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens   =*/ tokens.data(),
+            /*embd     =*/ nullptr,
+            /*pos      =*/ nullptr, // position is tracked automatically
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id   =*/ seq_ids.data(),
+            /*logits   =*/ logits.data(),
+        };
+    }
+
+    void clear() {
+        batch.n_tokens = 0;
+    }
+
+    bool is_full() const {
+        return batch.n_tokens >= (int32_t) tokens.size();
+    }
+
+    void add_token(llama_token tok, bool output) {
+        GGML_ASSERT(!is_full());
+        int32_t j = batch.n_tokens;
+        batch.token [j] = tok;
+        batch.logits[j] = output;
+        batch.n_tokens++;
+    }
+
+    void set_logits_last() {
+        if (batch.n_tokens > 0) {
+            batch.logits[batch.n_tokens - 1] = true;
+        }
+    }
+};
+
 // Helper function for decoding an image whose embeddings have already been calculated
 int32_t mtmd_helper_decode_image_chunk(
         mtmd_context * ctx,
@@ -259,7 +308,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         bool logits_last,
         llama_pos * new_n_past) {
     int32_t ret;
-    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+    decode_text_batch text_batch(n_batch, seq_id);
     auto chunk_type = mtmd_input_chunk_get_type(chunk);

     if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
@@ -268,28 +317,20 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         // LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
         size_t i = 0;
         while (i < n_tokens) { // split into batches
-            text_batch.n_tokens = 0; // clear the batch
-            for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
-                int32_t j = text_batch.n_tokens;
-                text_batch.token   [j]    = tokens[i];
-                text_batch.pos     [j]    = n_past++;
-                text_batch.n_seq_id[j]    = 1;
-                text_batch.seq_id  [j][0] = seq_id;
-                text_batch.logits  [j]    = false;
-
-                text_batch.n_tokens++;
+            text_batch.clear();
+            for (; i < n_tokens && !text_batch.is_full(); i++) {
+                text_batch.add_token(tokens[i], false);
             }
             bool is_last_token = (i == n_tokens);
             if (logits_last && is_last_token) {
-                text_batch.logits[text_batch.n_tokens - 1] = true;
+                text_batch.set_logits_last();
             }
-            ret = llama_decode(lctx, text_batch);
+            ret = llama_decode(lctx, text_batch.batch);
             if (ret != 0) {
                 LOG_ERR("failed to decode text\n");
-                llama_batch_free(text_batch);
                 return ret;
             }
-            *new_n_past += text_batch.n_tokens;
+            *new_n_past += text_batch.batch.n_tokens;
         }

     } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
@@ -301,7 +342,6 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         ret = mtmd_encode_chunk(ctx, chunk);
         if (ret != 0) {
             LOG_ERR("failed to encode %s slice\n", name);
-            llama_batch_free(text_batch);
             return ret;
         }

@@ -311,14 +351,12 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
         if (ret != 0) {
             LOG_ERR("failed to decode %s\n", name);
-            llama_batch_free(text_batch);
             return ret;
         }
     } else {
         GGML_ABORT("chunk type not supported");
     }

-    llama_batch_free(text_batch);
     return 0;
 }
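
A hypothetical usage sketch of decode_text_batch, mirroring the loop above: tokens are split into batches of at most n_batch, with logits requested only for the final token. Because the storage is owned by the struct's std::vectors, the early-return path no longer needs llama_batch_free().

    #include "llama.h"
    #include <vector>

    // assumes decode_text_batch as defined above and an initialized llama_context
    static int32_t decode_tokens(llama_context * lctx,
                                 const std::vector<llama_token> & tokens,
                                 int32_t n_batch, llama_seq_id seq_id) {
        decode_text_batch text_batch(n_batch, seq_id);
        size_t i = 0;
        while (i < tokens.size()) {           // split into batches of at most n_batch tokens
            text_batch.clear();
            for (; i < tokens.size() && !text_batch.is_full(); i++) {
                text_batch.add_token(tokens[i], false);
            }
            if (i == tokens.size()) {
                text_batch.set_logits_last(); // only the last token of the last batch outputs logits
            }
            int32_t ret = llama_decode(lctx, text_batch.batch);
            if (ret != 0) {
                return ret;                   // vectors clean up on scope exit
            }
        }
        return 0;
    }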

tools/mtmd/mtmd.cpp

Lines changed: 14 additions & 0 deletions
@@ -5,6 +5,15 @@

 #include "llama.h"

+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include <algorithm>
 #include <cerrno>
 #include <cstdio>
@@ -1030,6 +1039,11 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
 }

 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+    if (image_tokens->use_mrope_pos) {
+        // for M-RoPE, temporal dimension = max(t,h,w)
+        // t is omitted as we don't support video input
+        return std::max(image_tokens->nx, image_tokens->ny);
+    }
     return image_tokens->n_tokens();
 }
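
The practical effect of the new rule can be seen with a worked example (grid size assumed, not from the commit): an image encoded as a 16x24 grid of patch embeddings contributes 384 tokens to the batch, but under M-RoPE it advances the sequence position by only max(16, 24) = 24.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    // standalone illustration of the position accounting above
    int main() {
        const size_t nx = 16, ny = 24;   // image patch grid (tokens per row / column)
        const size_t n_tokens = nx * ny; // 384 embeddings go into the batch

        const bool use_mrope_pos = true; // e.g. Qwen2-VL / Qwen2.5-VL
        const size_t n_pos = use_mrope_pos ? std::max(nx, ny) // temporal extent only
                                           : n_tokens;        // classic RoPE: one position per token

        assert(n_tokens == 384);
        assert(n_pos    == 24);
        return 0;
    }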

tools/mtmd/mtmd.h

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * i
 MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
 MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
 MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens); // TODO: deprecate
-// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
 MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate

 // tokenize an input text prompt and a list of bitmaps (images/audio)
