From 54aa805ba329523410ca9e178b889c14cf97ae87 Mon Sep 17 00:00:00 2001
From: Rujia Liu
Date: Thu, 21 Aug 2025 21:11:36 +0800
Subject: [PATCH 1/2] Append mrope positions to the traditional llm pos,
 otherwise causal attention masking will break. See #13694. This also allows
 us to remove some previous workarounds

---
 include/llama.h            |  2 +-
 src/llama-batch.cpp        | 22 +++-------------------
 src/llama-graph.cpp        |  4 +++-
 tools/mtmd/mtmd-helper.cpp | 20 +++++++++++---------
 tools/mtmd/mtmd.cpp        |  3 ---
 5 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 135eaf1b65569..709ca11e7ef23 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -226,7 +226,7 @@ extern "C" {
 
         llama_token  *  token;
         float        *  embd;
-        llama_pos    *  pos;
+        llama_pos    *  pos; // the first `n_tokens` elements are always the linearly increasing positions for the traditional llm
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       *  logits; // TODO: rename this to "output"
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 55d89eca0ad94..707c7ff6529e3 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -259,23 +259,7 @@ bool llama_batch_allocr::init(
             const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
             if (p0 >= 0) {
-                bool ok = true;
-
-                if (batch.token) {
-                    if (seq_pos_min(s) != p0 + 1) {
-                        ok = false;
-                    }
-                } else {
-                    assert(batch.embd);
-
-                    // for embeddings (typically used as vision input), we allow them to have repeating positions
-                    // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                    if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                        ok = false;
-                    }
-                }
-
-                if (!ok) {
+                if (seq_pos_min(s) != p0 + 1) {
                     LLAMA_LOG_ERROR(
                             "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
                             " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
@@ -655,7 +639,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
+    const int32_t n_pos_cur = batch.embd ? (n_pos_per_embd + 1) : 1;
 
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
@@ -681,7 +665,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         for (int j = 0; j < n_pos_cur; ++j) {
-            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+            udata->pos[j * n_tokens + i] = batch.pos[j * batch.n_tokens + idxs[i]];
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 053c72d6dc8d1..b45be79abce08 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -54,7 +54,9 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
             }
             ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
         } else {
-            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+            llama_pos * pos_ptr = ubatch->pos;
+            if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions
+            ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));
         }
     }
 }
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 686f42f3960fe..835035994ae81 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -66,7 +66,7 @@ struct decode_embd_batch {
     std::vector<int8_t>         logits;
     llama_batch batch;
     decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-        pos     .resize(n_tokens * n_pos_per_embd);
+        pos     .resize(n_tokens * (n_pos_per_embd + 1));
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
         logits  .resize(n_tokens);
@@ -100,13 +100,14 @@ struct decode_embd_batch {
         for (int y = 0; y < ny; y++) {
             for (int x = 0; x < nx; x++) {
                 int i = y * nx + x;
-                pos[i                     ] = pos_0;
-                pos[i + batch.n_tokens    ] = pos_0 + y;
-                pos[i + batch.n_tokens * 2] = pos_0 + x;
-                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+                pos[i + batch.n_tokens    ] = pos_0;
+                pos[i + batch.n_tokens * 2] = pos_0 + y;
+                pos[i + batch.n_tokens * 3] = pos_0 + x;
+                pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
             }
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -118,12 +119,13 @@ struct decode_embd_batch {
         GGML_ASSERT(n_pos_per_embd == 4);
         seq_id_0[0] = seq_id;
         for (int i = 0; i < batch.n_tokens; i++) {
-            pos[i                     ] = pos_0 + i;
             pos[i + batch.n_tokens    ] = pos_0 + i;
             pos[i + batch.n_tokens * 2] = pos_0 + i;
-            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            pos[i + batch.n_tokens * 3] = pos_0 + i;
+            pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -133,12 +135,12 @@ struct decode_embd_batch {
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
-        pos_view.reserve(n_tokens * n_pos_per_embd);
+        pos_view.reserve(n_tokens * (n_pos_per_embd + 1));
        if (n_pos_per_embd > 1) {
             // mrope
             // for example, with layout of src: 1234...1234...1234...1234...
             //       offset 2 will give us dst:   34...  34...  34...  34...
-            for (int i = 0; i < n_pos_per_embd; i++) {
+            for (int i = 0; i <= n_pos_per_embd; i++) {
                 // assume n_tokens is less than or equal to batch.n_tokens
                 // batch.n_tokens is number of **total** tokens
                 // n_tokens is number of viewed token
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index a05373d5b3ca5..fc2e7bb5ac87a 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1024,9 +1024,6 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
 }
 
 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
-    if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
-    }
     return image_tokens->n_tokens();
 }
 

From 7db161a8b6673b2269f815f06b176c4c67b0ed54 Mon Sep 17 00:00:00 2001
From: Rujia Liu
Date: Sun, 24 Aug 2025 15:53:47 +0800
Subject: [PATCH 2/2] Added notes about what is actually stored in batch's `pos`

---
 src/llama-graph.cpp        | 4 ++++
 tools/mtmd/mtmd-helper.cpp | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index b45be79abce08..5c77eed6d3c39 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -55,6 +55,10 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
             ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
         } else {
             llama_pos * pos_ptr = ubatch->pos;
+            // Normally, ubatch->pos stores linearly increasing positions.
+            // However, some multi-modal models require special position embeddings (e.g. M-RoPE in qwen2vl and qwen2.5vl),
+            // but linearly increasing positions are still needed for proper causal attention masking.
+            // So we store both: the first n_tokens elements are unchanged, and the model-specific positions are appended after them.
             if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions
             ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));
         }
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 835035994ae81..2e60e78754994 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -55,6 +55,11 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
 
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
+// note 2: normally, the batch's `pos` stores linearly increasing positions.
+//         However, some multi-modal models require special position embeddings (e.g. M-RoPE in qwen2vl and qwen2.5vl),
+//         but linearly increasing positions are still needed for proper causal attention masking.
+//         So we store both: the first n_tokens elements are unchanged, and the model-specific positions are appended after them.
+//         As a result, `pos` has `n_tokens * (n_pos_per_embd + 1)` elements.
 struct decode_embd_batch {
     int n_pos_per_embd;
     int n_mmproj_embd;
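For reference, here is a minimal standalone sketch of the `pos` layout that these two commits agree on: the first `n_tokens` entries stay linearly increasing (for causal attention masking), and the `n_pos_per_embd` M-RoPE planes (temporal, y, x, unused) follow. This is not llama.cpp code; the 3x2 patch grid, `pos_0 = 10`, and the printout are invented for illustration.

```cpp
// Illustration of the pos buffer layout described in the notes above (not llama.cpp code).
#include <cstdint>
#include <cstdio>
#include <vector>

typedef int32_t llama_pos; // mirrors the llama.cpp typedef

int main() {
    const int nx = 3, ny = 2;              // hypothetical 3x2 patch grid for one image
    const int n_tokens       = nx * ny;
    const int n_pos_per_embd = 4;          // M-RoPE: temporal, height, width, unused
    const llama_pos pos_0    = 10;         // position where the image starts in the sequence

    // n_tokens * (n_pos_per_embd + 1) entries:
    //   [0, n_tokens)            -> plain linearly increasing positions (for causal masking)
    //   [n_tokens, 5 * n_tokens) -> M-RoPE planes: temporal, y, x, unused
    std::vector<llama_pos> pos(n_tokens * (n_pos_per_embd + 1));
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const int i = y * nx + x;
            pos[i               ] = pos_0 + i; // linear position
            pos[i + n_tokens    ] = pos_0;     // temporal: same for the whole image
            pos[i + n_tokens * 2] = pos_0 + y; // height
            pos[i + n_tokens * 3] = pos_0 + x; // width
            pos[i + n_tokens * 4] = 0;         // last pos dim is unused
        }
    }

    // consumer side: a graph input that wants the M-RoPE positions simply skips the
    // first n_tokens entries, as llm_graph_input_pos::set_input does in the patch
    const llama_pos * mrope_pos = pos.data() + n_tokens;

    for (int i = 0; i < n_tokens; i++) {
        printf("token %d: linear=%d  mrope=(t=%d, y=%d, x=%d)\n",
               i, pos[i], mrope_pos[i], mrope_pos[i + n_tokens], mrope_pos[i + n_tokens * 2]);
    }
    return 0;
}
```

With this layout, text-only batches are unaffected (`n_pos_cur` stays 1 when `batch.embd` is null), and the graph input only offsets the pointer by `n_tokens` when the ubatch carries embeddings and `n_pos_per_embd > 1`.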
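The `get_view` change (looping `i <= n_pos_per_embd`) is easiest to see in isolation: the slicing now walks `n_pos_per_embd + 1` planes because the linear positions occupy the first plane. Below is a small self-contained sketch of the plane-wise slicing that the in-code comment describes ("src: 1234...1234... -> dst: 34...34..."); `view_positions` and the sample sizes are invented for illustration and are not llama.cpp API.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

typedef int32_t llama_pos; // mirrors the llama.cpp typedef

// Copy `n_view` positions starting at `offset` out of each of the `n_planes`
// planes of a [n_planes x n_total] position buffer into a contiguous view.
static std::vector<llama_pos> view_positions(const std::vector<llama_pos> & src,
                                             int n_total, int n_planes,
                                             int offset, int n_view) {
    std::vector<llama_pos> dst;
    dst.reserve((size_t) n_planes * n_view);
    for (int p = 0; p < n_planes; p++) {
        for (int i = 0; i < n_view; i++) {
            dst.push_back(src[(size_t) p * n_total + offset + i]);
        }
    }
    return dst;
}

int main() {
    // two planes of four positions each: 1 2 3 4 | 1 2 3 4  (the "1234...1234..." of the comment)
    const std::vector<llama_pos> src = {1, 2, 3, 4, 1, 2, 3, 4};
    // offset 2, viewing 2 tokens -> 3 4 | 3 4  (the "34...34..." of the comment)
    const std::vector<llama_pos> dst =
        view_positions(src, /*n_total=*/4, /*n_planes=*/2, /*offset=*/2, /*n_view=*/2);
    assert((dst == std::vector<llama_pos>{3, 4, 3, 4}));
    return 0;
}
```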