@@ -251,46 +251,39 @@ bool llama_batch_allocr::init(
     // consistency checks
     //
 
-    for (uint32_t s = 0; s < n_seq_max; ++s) {
-        if (seq_pos[s].empty()) {
-            continue;
-        }
+    // TODO @ngxson : we currently can't check M-RoPE positions, as the position is increased based on image size
+    if (n_pos_per_embd == 1) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
 
-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
-        if (p0 >= 0) {
-            bool ok = true;
+            if (p0 >= 0) {
+                bool ok = true;
 
-            if (batch.token) {
                 if (seq_pos_min(s) != p0 + 1) {
                     ok = false;
                 }
-            } else {
-                assert(batch.embd);
 
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
+                if (!ok) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
                 }
             }
 
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
                 return false;
             }
         }
-
-        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-            LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-            return false;
-        }
     }
 
     if (memory) {
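To make the invariant concrete, here is a minimal standalone sketch of the two checks this hunk enforces (hypothetical helper names, not the llama.cpp API; the allocator's per-sequence `seq_pos` is mirrored here with a `std::set`): the batch must continue exactly at `p0 + 1`, and its positions must cover a contiguous range. For M-RoPE models (`n_pos_per_embd > 1`) the whole check is now skipped, since image embeddings advance positions by image-size-dependent amounts that would trip the `Y = X + 1` rule.

```cpp
#include <cstdint>
#include <cstdio>
#include <set>

// Standalone sketch of the two consistency checks (hypothetical helper, not the llama.cpp API).
// pos : the distinct positions a batch carries for one sequence
// p0  : the last position stored in the KV cache for that sequence, -1 if the cache is empty
static bool positions_consistent(const std::set<int32_t> & pos, int32_t p0) {
    if (pos.empty()) {
        return true; // nothing to validate
    }

    const int32_t pmin = *pos.begin();  // Y in the error message above
    const int32_t pmax = *pos.rbegin();

    // check 1: the batch must continue exactly where the cache left off (Y = X + 1)
    if (p0 >= 0 && pmin != p0 + 1) {
        fprintf(stderr, "batch starts at %d, expected %d\n", pmin, p0 + 1);
        return false;
    }

    // check 2: the positions must cover a contiguous range, with no gaps
    if (pmax - pmin + 1 > (int32_t) pos.size()) {
        fprintf(stderr, "positions are not continuous\n");
        return false;
    }

    return true;
}

int main() {
    // the cache holds positions 0..9 for this sequence, so a batch must start at 10
    positions_consistent({10, 11, 12}, 9); // ok
    positions_consistent({11, 12, 13}, 9); // fails check 1: starts at 11, not 10
    positions_consistent({10, 12, 14}, 9); // fails check 2: gaps at 11 and 13
    return 0;
}
```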
@@ -660,9 +653,6 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
 
-    // printf("ubatch_add: n_tokens=%d, n_seqs=%d, n_pos_cur=%d, n_embd_all=%lld, n_pos_all=%lld\n",
-    //     n_tokens, n_seqs, n_pos_cur, n_embd_all, n_pos_all);
-
     udata->token.resize(n_tokens);
     udata->embd .resize(n_embd_all);
     udata->pos  .resize(n_pos_all);