Merged

Changes from 2 commits
47 changes: 20 additions & 27 deletions src/llama-batch.cpp
@@ -251,46 +251,39 @@ bool llama_batch_allocr::init(
// consistency checks
//

-    for (uint32_t s = 0; s < n_seq_max; ++s) {
-        if (seq_pos[s].empty()) {
-            continue;
-        }
+    // TODO @ngxson : we currently can't check M-RoPE positions, as the position is increased based on image size
+    if (n_pos_per_embd == 1) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }

-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;

-        if (p0 >= 0) {
-            bool ok = true;
+            if (p0 >= 0) {
+                bool ok = true;

-            if (batch.token) {
                 if (seq_pos_min(s) != p0 + 1) {
                     ok = false;
                 }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }

-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
-                return false;
-            }
-        }
+                if (!ok) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            }

-        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-            LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-            return false;
-        }
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
+                return false;
+            }
+        }
     }

if (memory) {
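For intuition, here is a small standalone sketch (hypothetical values, not PR code) of why the Y = X + 1 check above cannot hold once an image chunk is involved: with this PR, an image advances the temporal position by max(nx, ny) rather than by its token count, so the text batch that follows it does not start at the cache's last position + 1.

// Standalone sketch, hypothetical values (not part of the PR).
#include <algorithm>
#include <cassert>

int main() {
    // a plain text batch: the cache ends at position 9, the batch starts at 10 -> Y = X + 1 holds
    int p0 = 9;
    int batch_first = 10;
    assert(batch_first == p0 + 1);

    // an image chunk: its patches all share temporal position 10, and (per the mtmd.cpp
    // change in this PR) the chunk advances the sequence by max(nx, ny) positions
    const int nx = 4, ny = 3;          // hypothetical patch grid
    p0 = 10;                           // the cache now ends at the image's temporal position
    batch_first = 10 + std::max(nx, ny);

    // the next text batch starts at 14, not 11, so the consecutive-position check
    // would reject it -> hence the new n_pos_per_embd == 1 guard
    assert(batch_first != p0 + 1);
}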
4 changes: 4 additions & 0 deletions src/llama-batch.h
@@ -17,6 +17,10 @@ struct llama_ubatch {
return b_equal_seqs != 0;
}

bool has_mrope() const {
return data->pos.size() == data->token.size()*4;
}

Member:

I think we can make this multi-dimensional positional information more decoupled from the concept of rope:

diff --git a/src/llama-batch.h b/src/llama-batch.h
index 34f964ef0..8a6c6daff 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -17,8 +17,13 @@ struct llama_ubatch {
         return b_equal_seqs != 0;
     }
 
-    bool has_mrope() const {
-        return data->pos.size() == data->token.size()*4;
+    // typical for M-RoPE cases:
+    //   0 - sequential position of the tokens/embeddings in the sequence
+    //   1 - x position in the image
+    //   2 - y position in the image
+    //   3 - other
+    bool is_pos_2d() const {
+        return n_pos >= 3;
     }
 
     uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
@@ -29,6 +34,7 @@ struct llama_ubatch {
     uint32_t n_seq_tokens; // tokens per sequence set
     uint32_t n_seqs;       // sequence sets in the ubatch
     uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+    uint32_t n_pos;        // position inputs for each token/embedding
 
     // seq_id_unq: unique sequence ids in the ubatch
     // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -37,7 +43,7 @@ struct llama_ubatch {
     //                          // size               | idx | val
     llama_token  *  token;      // [n_tokens]         | i   | id, token
     float        *  embd;       // [n_embd, n_tokens] | i   | embd
-    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    llama_pos    *  pos;        // [n_tokens*n_pos]   | i   | pos
     int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
     llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
     llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id

uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
// otherwise address sanitizer complains
// TODO: whole_seqs for embeddings?
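For reference, a rough sketch of the sectioned pos layout that has_mrope() tests for above, and that the pos[i + n_tokens*k] indexing in llama-kv-cache.cpp below relies on. The layout (one contiguous section of n_tokens entries per position dimension) is taken from the PR; the concrete values are hypothetical.

// Illustrative sketch of the sectioned M-RoPE pos layout (hypothetical values, not PR code).
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t;

int main() {
    const uint32_t n_tokens = 4;                  // e.g. a 2x2-patch image

    // an M-RoPE ubatch carries 4 position entries per token, section by section,
    // which is exactly what has_mrope() detects: pos.size() == token.size()*4
    const std::vector<llama_pos> pos = {
        10, 10, 10, 10,   // section 0: temporal position p
         0,  0,  1,  1,   // section 1: spatial y (read as pos[i + n_tokens])
         0,  1,  0,  1,   // section 2: spatial x (read as pos[i + n_tokens*2])
         0,  0,  0,  0,   // section 3: unused for the 2D image case
    };

    for (uint32_t i = 0; i < n_tokens; ++i) {
        std::printf("token %u: p=%d y=%d x=%d\n",
                    i, pos[i], pos[i + n_tokens], pos[i + n_tokens*2]);
    }
}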
22 changes: 22 additions & 0 deletions src/llama-kv-cache.cpp
@@ -900,6 +900,13 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &

cells.pos_set(idx, ubatch.pos[i]);

if (ubatch.has_mrope()) {
cells.pos_mrope_set(idx, {
ubatch.pos[i + ubatch.n_tokens], // y
ubatch.pos[i + ubatch.n_tokens*2], // x
});
}

for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
cells.seq_add(idx, ubatch.seq_id[i][s]);
}
@@ -1243,6 +1250,13 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u

const llama_pos p1 = ubatch->pos[i];

// for M-RoPE
llama_kv_pos_mrope p1_mrope;
if (ubatch->has_mrope()) {
p1_mrope.y = ubatch->pos[i + ubatch->n_tokens];
p1_mrope.x = ubatch->pos[i + ubatch->n_tokens*2];
}

Member:

I find it confusing to have the order of the positions as y, x. It's more canonical to have the dimensions ordered by increasing significance - x, y, z, .... This is also in line with the ggml convention for indexing.

I now notice that even the implementation of ggml_rope_multi uses this order. I would recommend updating this across the codebase for consistency. Even though it's a breaking change, it's better to do it now, before the mtmd stuff gets more adopted.

Collaborator Author:

yes I agree that we should fix the ordering in ggml, I will make a PR for that

Collaborator Author (@ngxson, Oct 29, 2025):

Hmm, on second thought, I think it cannot be ordered as x,y,z, because the full 4D position would be p,x,y,z, with p being the traditional LLM position.

Since Qwen doesn't use the last z dim, the ordering is currently p,y,x, which is in decreasing significance.

I think the better way is, as you suggest above, to decouple the logic into something like 2d_mrope to be more specific.

Member:

Hm, not sure I follow. My point is that p,x,y,t is a more consistent order than the current p,y,x,t.

const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);

for (uint32_t j = 0; j < n_kv; ++j) {
@@ -1262,6 +1276,14 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
continue;
}

// M-RoPE causal mask
if (causal_attn && ubatch->has_mrope() && p0 == p1) {
const auto & p0_mrope = cells.pos_mrope_get(j);
if (p0_mrope.is_gt(p1_mrope)) {
continue;
}
}

// apply SWA if any
if (is_masked_swa(p0, p1)) {
continue;
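To make the new masking rule concrete, here is a minimal standalone sketch (illustrative only, hypothetical positions) that combines the regular causal check on the temporal position with the 2D tie-breaker above, using a local copy of the is_gt comparison defined in llama-kv-cells.h below.

// Standalone sketch of the M-RoPE causal rule (hypothetical positions, not PR code).
#include <cassert>

struct pos_2d {
    int y = 0;
    int x = 0;
    // same comparison as llama_kv_pos_mrope::is_gt below
    bool is_gt(const pos_2d & other) const {
        return (y > other.y) || (y == other.y && x > other.x);
    }
};

// true if the cached token at (p0, p0_2d) must be masked for the query token at (p1, p1_2d)
static bool masked_causal(int p0, pos_2d p0_2d, int p1, pos_2d p1_2d) {
    if (p0 > p1) {
        return true;   // regular causal mask on the temporal position
    }
    if (p0 == p1 && p0_2d.is_gt(p1_2d)) {
        return true;   // same temporal position (e.g. same image): break the tie on (y, x)
    }
    return false;
}

int main() {
    assert( masked_causal(7, {2, 0}, 7, {1, 3}));  // later row of the same image -> masked
    assert(!masked_causal(7, {1, 3}, 7, {2, 0}));  // earlier row -> visible
    assert(!masked_causal(6, {5, 5}, 7, {0, 0}));  // strictly earlier temporal position -> visible
}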
35 changes: 31 additions & 4 deletions src/llama-kv-cells.h
@@ -9,6 +9,15 @@
#include <set>
#include <map>

struct llama_kv_pos_mrope {
llama_pos y = 0;
llama_pos x = 0;
// return true if this position is greater than the other position
bool is_gt(const llama_kv_pos_mrope & other) const {
return (y > other.y) || (y == other.y && x > other.x);
}
};

Member:

Similarly, I think we can decouple the concept of M-RoPE here by declaring this struct to be more generic:

struct llama_kv_cell_ext {
    // 2D spatial positions, typically used for M-RoPE
    llama_pos x = 0;
    llama_pos y = 0;

    // ... maybe more data in the future
};

// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
class llama_kv_cells {
@@ -43,6 +52,7 @@ class llama_kv_cells {

void resize(uint32_t n) {
pos.resize(n);
pos_mrope.resize(n);
shift.resize(n);
seq.resize(n);

@@ -107,8 +117,9 @@ class llama_kv_cells {
for (uint32_t j = 0; j < n; ++j) {
const auto idx = i + j;

-res.pos[j] = pos[idx];
-res.seq[j] = seq[idx];
+res.pos [j] = pos[idx];
+res.pos_mrope[j] = pos_mrope[idx];
+res.seq [j] = seq[idx];

assert(shift[idx] == 0);
}
@@ -125,8 +136,9 @@ class llama_kv_cells {
for (uint32_t j = 0; j < idxs.size(); ++j) {
const auto idx = idxs[j];

-res.pos[j] = pos[idx];
-res.seq[j] = seq[idx];
+res.pos [j] = pos[idx];
+res.pos_mrope[j] = pos_mrope[idx];
+res.seq [j] = seq[idx];

assert(shift[idx] == 0);
}
@@ -340,6 +352,13 @@ class llama_kv_cells {
return pos[i];
}

const llama_kv_pos_mrope & pos_mrope_get(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);

return pos_mrope[i];
}

// note: call only if the cell is not empty
llama_pos get_shift(uint32_t i) const {
assert(i < pos.size());
@@ -368,6 +387,11 @@ class llama_kv_cells {
used.insert(i);
}

void pos_mrope_set(uint32_t i, llama_kv_pos_mrope p) {
assert(i < pos_mrope.size());
pos_mrope[i] = p;
}

// pos[i] = pos[i] + d
// sets "has_shift" to true
// note: call only if the cell is not empty
@@ -424,6 +448,9 @@ class llama_kv_cells {

std::vector<llama_pos> pos;

// stores addition info for M-RoPE positions
std::vector<llama_kv_pos_mrope> pos_mrope;

Member:

Suggested change:
-    // stores addition info for M-RoPE positions
-    std::vector<llama_kv_pos_mrope> pos_mrope;
+    // stores extra optional cell info
+    std::vector<llama_kv_cell_ext> ext;

// this array accumulates any applied shifts to the pos array since the last reset_shift() call
// this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
//
13 changes: 12 additions & 1 deletion tools/mtmd/mtmd.cpp
@@ -5,6 +5,15 @@

#include "llama.h"

// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif

#include <algorithm>
#include <cerrno>
#include <cstdio>
@@ -1031,7 +1040,9 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {

llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+        // for M-RoPE, temporal dimension = max(t,h,w)
+        // t is omitted as we don't support video input
+        return std::max(image_tokens->nx, image_tokens->ny);
}
return image_tokens->n_tokens();
}
4 changes: 2 additions & 2 deletions tools/mtmd/mtmd.h
@@ -153,7 +153,7 @@ MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd
MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
// returns nullptr for ID on text chunk
MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
-// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);

// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
@@ -171,7 +171,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * i
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
-// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate

// tokenize an input text prompt and a list of bitmaps (images/audio)