Commit e1a3d85

llama.cpp: Fix Qwen2.5 VL cache causal masking (PR #16745)
Applied changes from llama.cpp PR #16745 to fix cache causal masking issues for Qwen2.5 VL models.

Key changes:
- Disabled consecutive position validation in llama-batch.cpp (allows position jumps for vision embeddings)
- Added kv_position_of_token field to track KV cache positions for proper causal masking
- Modified causal masking logic to use batch positions instead of temporal positions
- Updated M-RoPE position calculation to use max(nx, ny) for images

This fix allows Qwen VL models to handle non-consecutive positions in embeddings, which is required for proper vision processing.

Ref: ggml-org/llama.cpp#16745
1 parent 15eab62 commit e1a3d85
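
The core of the fix is the masking rule added to llama-kv-cache.cpp below: each token records the KV cache slot it was written to, the mask code inverts that map, and a token is then only prevented from attending to slots filled by *later* tokens of the same ubatch, regardless of temporal position (which may repeat or jump for vision embeddings). The following is a minimal, self-contained sketch of that rule; the names `kv_slot`, `kv_to_batch` and `allowed` are illustrative and are not the llama.cpp API.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t n_kv = 8; // illustrative cache size

    // hypothetical batch of 4 tokens; kv_slot[i] is the cache slot token i was written to
    const std::vector<int32_t> kv_slot = {3, 4, 5, 6};

    // invert the map: for every cache slot, which batch token (if any) occupies it
    std::vector<int32_t> kv_to_batch(n_kv, -1);
    for (int32_t i = 0; i < (int32_t) kv_slot.size(); ++i) {
        kv_to_batch[kv_slot[i]] = i;
    }

    // causal rule used by the patched mask: token i may attend to slot j unless that
    // slot holds a token that comes after i in the current batch
    auto allowed = [&](int32_t i, int32_t j) {
        return kv_to_batch[j] == -1 || kv_to_batch[j] <= i;
    };

    // print the resulting mask: '1' = attend, '.' = masked
    for (int32_t i = 0; i < (int32_t) kv_slot.size(); ++i) {
        for (int32_t j = 0; j < n_kv; ++j) {
            printf("%c", allowed(i, j) ? '1' : '.');
        }
        printf("\n");
    }
    return 0;
}
```

Compared with the previous `p0 > p1` temporal comparison, this ordering depends only on batch order, so repeated or non-monotonic positions inside an image embedding no longer break causality.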

File tree (5 files changed, +111 −95 lines):
- llama/llama.cpp/src/llama-batch.cpp
- llama/llama.cpp/src/llama-batch.h
- llama/llama.cpp/src/llama-kv-cache.cpp
- llama/llama.cpp/tools/mtmd/mtmd.cpp
- llama/llama.cpp/tools/mtmd/mtmd.h

llama/llama.cpp/src/llama-batch.cpp

Lines changed: 87 additions & 82 deletions
@@ -221,12 +221,11 @@ bool llama_batch_allocr::init(
             /*.n_seq_id   =*/ batch.n_seq_id,
             /*.seq_id     =*/ batch.seq_id,
             /*.seq_id_unq =*/ this->seq_id_unq.data(),
-            /*.seq_idx    =*/ this->seq_idx.data(),
-            /*.output     =*/ batch.logits,
-            /*.data       =*/ {},
-        };
-
-        ubatch_print(ubatch, debug);
+            /*.seq_idx    =*/ this->seq_idx.data(),
+            /*.output     =*/ batch.logits,
+            /*.kv_position_of_token=*/ {},
+            /*.data       =*/ {},
+        }; ubatch_print(ubatch, debug);

         LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
         for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
@@ -256,36 +255,38 @@ bool llama_batch_allocr::init(
             continue;
         }

-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
-
-        if (p0 >= 0) {
-            bool ok = true;
-
-            if (batch.token) {
-                if (seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                        __func__, s, s, p0, s, seq_pos_min(s));
-
-                return false;
-            }
-        }
+        //@fmayran: these checks don't make sense with models using position encoding such as Qwen VL, because the position stored in the KV cache can jump around (it is not even always increasing).
+        //it is not enough to let them be repeating. Within an image embedding, arbitrary jumps are expected.
+        //const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+        //
+        //if (p0 >= 0) {
+        //    bool ok = true;
+        //
+        //    if (batch.token) {
+        //        if (seq_pos_min(s) != p0 + 1) {
+        //            ok = false;
+        //        }
+        //    } else {
+        //        assert(batch.embd);
+        //
+        //        // for embeddings (typically used as vision input), we allow them to have repeating positions
+        //        // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+        //        if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+        //            ok = false;
+        //        }
+        //    }
+        //
+        //    if (!ok) {
+        //        LLAMA_LOG_ERROR(
+        //                "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+        //                " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+        //                " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+        //                " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+        //                __func__, s, s, p0, s, seq_pos_min(s));
+        //
+        //        return false;
+        //    }
+        //}

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
             LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
@@ -369,36 +370,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t

     auto udata = std::make_shared<llama_ubatch::data_t>();

-    udata->token     .resize(n_tokens);
-    udata->embd      .clear();
-    udata->pos       .resize(n_tokens);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token      .resize(n_tokens);
+    udata->embd       .clear();
+    udata->pos        .resize(n_tokens);
+    udata->n_seq_id   .resize(n_tokens);
+    udata->seq_id     .resize(n_tokens);
+    udata->seq_id_unq .resize(0);
+    udata->seq_idx    .resize(LLAMA_MAX_SEQ, -1);
+    udata->output     .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     for (uint32_t s = 0; s < n_seqs; ++s) {
         udata->seq_idx[s] = s;
         udata->seq_id_unq.push_back(s);
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ true,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_seq_tokens,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ n_seqs,
-
-        /*.token        =*/ udata->token.data(),
-        /*.embd         =*/ nullptr,
-        /*.pos          =*/ udata->pos.data(),
-        /*.n_seq_id     =*/ udata->n_seq_id.data(),
-        /*.seq_id       =*/ udata->seq_id.data(),
-        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
-        /*.seq_idx      =*/ udata->seq_idx.data(),
-        /*.output       =*/ udata->output.data(),
-        /*.data         =*/ std::move(udata),
+        /*.b_equal_seqs =*/ true,
+        /*.n_tokens     =*/ n_tokens,
+        /*.n_seq_tokens =*/ n_seq_tokens,
+        /*.n_seqs       =*/ n_seqs,
+        /*.n_seqs_unq   =*/ n_seqs,
+
+        /*.token        =*/ udata->token.data(),
+        /*.embd         =*/ nullptr,
+        /*.pos          =*/ udata->pos.data(),
+        /*.n_seq_id     =*/ udata->n_seq_id.data(),
+        /*.seq_id       =*/ udata->seq_id.data(),
+        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
+        /*.seq_idx      =*/ udata->seq_idx.data(),
+        /*.output       =*/ udata->output.data(),
+        /*.kv_position_of_token=*/ udata->kv_position_of_token.data(),
+        /*.data         =*/ std::move(udata),
     };

     return res;
@@ -660,14 +663,15 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;

-    udata->token     .resize(n_tokens);
-    udata->embd      .resize(n_embd_all);
-    udata->pos       .resize(n_pos_all);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token      .resize(n_tokens);
+    udata->embd       .resize(n_embd_all);
+    udata->pos        .resize(n_pos_all);
+    udata->n_seq_id   .resize(n_tokens);
+    udata->seq_id     .resize(n_tokens);
+    udata->seq_id_unq .resize(0);
+    udata->seq_idx    .resize(LLAMA_MAX_SEQ, -1);
+    udata->output     .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     seq_set_t seq_set_unq;

@@ -705,21 +709,22 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ equal_seqs,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_tokens/n_seqs,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
-
-        /*.token        =*/ batch.token ? udata->token.data() : nullptr,
-        /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
-        /*.pos          =*/ udata->pos.data(),
-        /*.n_seq_id     =*/ udata->n_seq_id.data(),
-        /*.seq_id       =*/ udata->seq_id.data(),
-        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
-        /*.seq_idx      =*/ udata->seq_idx.data(),
-        /*.output       =*/ udata->output.data(),
-        /*.data         =*/ std::move(udata),
+        /*.b_equal_seqs =*/ equal_seqs,
+        /*.n_tokens     =*/ n_tokens,
+        /*.n_seq_tokens =*/ n_tokens/n_seqs,
+        /*.n_seqs       =*/ n_seqs,
+        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
+
+        /*.token        =*/ batch.token ? udata->token.data() : nullptr,
+        /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
+        /*.pos          =*/ udata->pos.data(),
+        /*.n_seq_id     =*/ udata->n_seq_id.data(),
+        /*.seq_id       =*/ udata->seq_id.data(),
+        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
+        /*.seq_idx      =*/ udata->seq_idx.data(),
+        /*.output       =*/ udata->output.data(),
+        /*.kv_position_of_token=*/ udata->kv_position_of_token.data(),
+        /*.data         =*/ std::move(udata),
     };

     if (debug > 0) {
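
For context on why the consecutive-position check above was disabled: with M-RoPE-style vision embeddings the positions written to the KV cache can jump and even decrease within one image, so a batch can legitimately start below the cached maximum. A small illustration of the old "Y = X + 1" rule rejecting such a batch (the numbers are made up for illustration, not the real M-RoPE layout):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// old rule: the smallest position in the incoming batch must be last_cached_pos + 1
static bool old_check_passes(int last_cached_pos, const std::vector<int> & batch_pos) {
    const int batch_min = *std::min_element(batch_pos.begin(), batch_pos.end());
    return batch_min == last_cached_pos + 1;
}

int main() {
    // suppose the cache already holds part of an image whose (hypothetical) positions
    // reached 9, and the next chunk of the same image carries positions below that maximum
    const int last_cached_pos = 9;
    const std::vector<int> next_chunk_pos = {6, 7, 8, 10};

    // the old check rejects this batch even though it is a legitimate continuation
    printf("old check passes: %s\n",
           old_check_passes(last_cached_pos, next_chunk_pos) ? "yes" : "no"); // prints "no"
    return 0;
}
```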

llama/llama.cpp/src/llama-batch.h

Lines changed: 11 additions & 9 deletions
@@ -30,15 +30,16 @@ struct llama_ubatch {
     // seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
     //          used for extracting sequence pooled embeddings

-    //                          // size               | idx | val
-    llama_token  *  token;      // [n_tokens]         | i   | id, token
-    float        *  embd;       // [n_embd, n_tokens] | i   | embd
-    llama_pos    *  pos;        // [n_tokens]         | i   | pos
-    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
-    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
-    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
-    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
-    int8_t       *  output;     // [n_tokens]         | i   | -
+    //                          // size               | idx | val
+    llama_token  *  token;      // [n_tokens]         | i   | id, token
+    float        *  embd;       // [n_embd, n_tokens] | i   | embd
+    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
+    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
+    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
+    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
+    int8_t       *  output;     // [n_tokens]         | i   | -
+    int32_t      *  kv_position_of_token; // [n_tokens] | i | kv position where the token was inserted

     struct data_t {
         std::vector<llama_token> token;
@@ -49,6 +50,7 @@ struct llama_ubatch {
         std::vector<llama_seq_id> seq_id_unq;
         std::vector<int32_t>      seq_idx;
         std::vector<int8_t>       output;
+        std::vector<int32_t>      kv_position_of_token; //when pushed to the kv cache, where is the token pushed (used for causal masking)
     };

     // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data

llama/llama.cpp/src/llama-kv-cache.cpp

Lines changed: 11 additions & 2 deletions
@@ -895,6 +895,7 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
         }

         cells.pos_set(idx, ubatch.pos[i]);
+        ubatch.kv_position_of_token[i] = (int32_t) idx; //set the position in the kv cache as a property for this token (needed for proper causal masking)

         for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
             cells.seq_add(idx, ubatch.seq_id[i][s]);
@@ -1215,6 +1216,12 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u

     std::fill(data, data + ggml_nelements(dst), -INFINITY);

+    std::vector<int32_t> map_kv_to_batch(n_kv, -1); //for each token in the cache, either (-1) or the position in the current ubatch
+    for (uint32_t i = 0; i < n_tokens; ++i) //invert the batch -> kv position map into a kv -> batch position map
+    {
+        if (ubatch->kv_position_of_token[i] != -1)
+            map_kv_to_batch[ubatch->kv_position_of_token[i]] = i;
+    }
     // Use only the previous KV cells of the correct sequence for each token of the ubatch.
     // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
     // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
@@ -1254,8 +1261,10 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                 const llama_pos p0 = cells.pos_get(j);

                 // mask future tokens
-                if (causal_attn && p0 > p1) {
-                    continue;
+                if (causal_attn)
+                {
+                    if (map_kv_to_batch[j] != -1 && map_kv_to_batch[j] > (int32_t) i) //if the kv cache token is in the current batch AND its position in the batch is higher than i
+                        continue;
                 }

                 // apply SWA if any

llama/llama.cpp/tools/mtmd/mtmd.cpp

Lines changed: 1 addition & 1 deletion
@@ -1036,7 +1036,7 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {

 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+        return (::std::max)(image_tokens->nx, image_tokens->ny); //assuming image, not video // for M-RoPE, the whole image is 1 in temporal dimension
     }
     return image_tokens->n_tokens();
 }
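
To see what the mtmd_image_tokens_get_n_pos change means in practice, here is a toy calculation of the temporal span an image now occupies in the sequence; fake_image_tokens and n_pos_mrope are stand-ins for illustration, not the real mtmd types:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct fake_image_tokens {
    uint32_t nx; // patches along x
    uint32_t ny; // patches along y
};

// after this commit: an image advances the sequence by max(nx, ny) temporal positions
static uint32_t n_pos_mrope(const fake_image_tokens & img) {
    return std::max(img.nx, img.ny);
}

int main() {
    const fake_image_tokens img = {16, 24}; // e.g. a hypothetical 16x24 patch grid
    printf("old n_pos: 1, new n_pos: %u\n", n_pos_mrope(img)); // prints 24
    return 0;
}
```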

llama/llama.cpp/tools/mtmd/mtmd.h

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd
 MTMD_API size_t       mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
 // returns nullptr for ID on text chunk
 MTMD_API const char * mtmd_input_chunk_get_id       (const mtmd_input_chunk * chunk);
-// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+// number of temporal positions (always max(ntok_x, ntok_y, ntok_t) for M-RoPE, n_tokens otherwise)
 MTMD_API llama_pos    mtmd_input_chunk_get_n_pos    (const mtmd_input_chunk * chunk);

 // in case you want to use custom logic to handle the chunk (i.e. KV cache management)
