Append M-RoPE positions to the traditional LLM positions; otherwise causal attention masking will break. See #13694. This also allows us to remove some previous workarounds.

rujialiu · rujialiu · commit 54aa805ba329 · 2025-08-21T21:11:36.000+08:00
diff --git a/include/llama.h b/include/llama.h
@@ -226,7 +226,7 @@ extern "C" {
 
         llama_token  *  token;
         float        *  embd;
-        llama_pos    *  pos;
+        llama_pos    *  pos;      // first `n_tokens` elements are always linearly increasing position for traditional llm
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       *  logits;   // TODO: rename this to "output"
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
@@ -259,23 +259,7 @@ bool llama_batch_allocr::init(
         const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
         if (p0 >= 0) {
-            bool ok = true;
-
-            if (batch.token) {
-                if (seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
+            if (seq_pos_min(s) != p0 + 1) {
                 LLAMA_LOG_ERROR(
                         "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
                         " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
@@ -655,7 +639,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
+    const int32_t n_pos_cur = batch.embd ? (n_pos_per_embd + 1) : 1;
 
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
@@ -681,7 +665,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         for (int j = 0; j < n_pos_cur; ++j) {
-            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+            udata->pos[j * n_tokens + i] = batch.pos[j * batch.n_tokens + idxs[i]];
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -54,7 +54,9 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
             }
             ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
         } else {
-            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+            llama_pos * pos_ptr = ubatch->pos;
+            if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions
+            ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));
         }
     }
 }
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
@@ -66,7 +66,7 @@ struct decode_embd_batch {
     std::vector<int8_t>         logits;
     llama_batch batch;
     decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-        pos     .resize(n_tokens * n_pos_per_embd);
+        pos     .resize(n_tokens * (n_pos_per_embd + 1));
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
         logits  .resize(n_tokens);
@@ -100,13 +100,14 @@ struct decode_embd_batch {
         for (int y = 0; y < ny; y++) {
             for (int x = 0; x < nx; x++) {
                 int i = y * nx + x;
-                pos[i                     ] = pos_0;
-                pos[i + batch.n_tokens    ] = pos_0 + y;
-                pos[i + batch.n_tokens * 2] = pos_0 + x;
-                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+                pos[i + batch.n_tokens    ] = pos_0;
+                pos[i + batch.n_tokens * 2] = pos_0 + y;
+                pos[i + batch.n_tokens * 3] = pos_0 + x;
+                pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
             }
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -118,12 +119,13 @@ struct decode_embd_batch {
         GGML_ASSERT(n_pos_per_embd == 4);
         seq_id_0[0] = seq_id;
         for (int i = 0; i < batch.n_tokens; i++) {
-            pos[i                     ] = pos_0 + i;
             pos[i + batch.n_tokens    ] = pos_0 + i;
             pos[i + batch.n_tokens * 2] = pos_0 + i;
-            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            pos[i + batch.n_tokens * 3] = pos_0 + i;
+            pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -133,12 +135,12 @@ struct decode_embd_batch {
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
-        pos_view.reserve(n_tokens * n_pos_per_embd);
+        pos_view.reserve(n_tokens * (n_pos_per_embd + 1));
         if (n_pos_per_embd > 1) {
             // mrope
             // for example, with layout of src: 1234...1234...1234...1234...
             //       offset 2 will give us dst: 34...34...34...34...
-            for (int i = 0; i < n_pos_per_embd; i++) {
+            for (int i = 0; i <= n_pos_per_embd; i++) {
                 // assume n_tokens is less than or equal to batch.n_tokens
                 // batch.n_tokens is number of **total** tokens
                 // n_tokens is number of viewed token
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -1024,9 +1024,6 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
 }
 
 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
-    if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
-    }
     return image_tokens->n_tokens();
 }
 

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,9 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {`
`54`	`54`	`}`
`55`	`55`	`ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));`
`56`	`56`	`} else {`
`57`		`- ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));`
	`57`	`+ llama_pos * pos_ptr = ubatch->pos;`
	`58`	`+ if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions`
	`59`	`+ ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));`
`58`	`60`	`}`
`59`	`61`	`}`
`60`	`62`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1024,9 +1024,6 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {`
`1024`	`1024`	`}`
`1025`	`1025`
`1026`	`1026`	`llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {`
`1027`		`- if (image_tokens->use_mrope_pos) {`
`1028`		`- return 1; // for M-RoPE, the whole image is 1 in temporal dimension`
`1029`		`- }`
`1030`	`1027`	`return image_tokens->n_tokens();`
`1031`	`1028`	`}`
`1032`	`1029`