Commit bf7f924

llama: store mrope data in KV cell

1 parent 1c1409e

File tree: 4 files changed, +70 -4 lines

src/llama-batch.cpp

Lines changed: 3 additions & 0 deletions
@@ -660,6 +660,9 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  = (int64_t) n_tokens*n_pos_cur;
 
+    // printf("ubatch_add: n_tokens=%d, n_seqs=%d, n_pos_cur=%d, n_embd_all=%lld, n_pos_all=%lld\n",
+    //     n_tokens, n_seqs, n_pos_cur, n_embd_all, n_pos_all);
+
     udata->token .resize(n_tokens);
     udata->embd  .resize(n_embd_all);
     udata->pos   .resize(n_pos_all);
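The n_pos_all sizing is driven by n_pos_cur: a ubatch stores one position entry per token for standard RoPE and four for M-RoPE, laid out as consecutive planes of n_tokens entries each. A minimal sketch of reading those planes, assuming the planar layout implied by the indexing in llama-kv-cache.cpp below (the helper name is hypothetical, not part of the commit):

    // hypothetical helper: read position plane k
    // (0 = primary pos, 1 = x, 2 = y, 3 = t) of token i in a ubatch
    static llama_pos ubatch_pos_plane(const llama_ubatch & ub, uint32_t i, uint32_t k) {
        return ub.pos[i + (size_t) k*ub.n_tokens];
    }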

src/llama-batch.h

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,10 @@ struct llama_ubatch {
         return b_equal_seqs != 0;
     }
 
+    bool has_mrope() const {
+        return data->pos.size() == data->token.size()*4;
+    }
+
     uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
                            //       otherwise address sanitizer complains
     // TODO: whole_seqs for embeddings?
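has_mrope() detects the M-RoPE case purely from buffer sizes: the position buffer holds exactly four entries per token (primary position plus the x/y/t coordinates). A minimal sketch, under that layout assumption, of gathering the coordinates of one token (helper name and placement are hypothetical):

    // mirrors the indexing used by llama_kv_cache::apply_ubatch() below
    static llama_kv_pos_mrope ubatch_get_pos_mrope(const llama_ubatch & ub, uint32_t i) {
        return {
            ub.pos[i + ub.n_tokens],   // x
            ub.pos[i + ub.n_tokens*2], // y
            ub.pos[i + ub.n_tokens*3], // t
        };
    }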

src/llama-kv-cache.cpp

Lines changed: 24 additions & 0 deletions
@@ -900,6 +900,14 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
 
         cells.pos_set(idx, ubatch.pos[i]);
 
+        if (ubatch.has_mrope()) {
+            cells.pos_mrope_set(idx, {
+                ubatch.pos[i + ubatch.n_tokens],   // x
+                ubatch.pos[i + ubatch.n_tokens*2], // y
+                ubatch.pos[i + ubatch.n_tokens*3], // t
+            });
+        }
+
         for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
             cells.seq_add(idx, ubatch.seq_id[i][s]);
         }
@@ -1243,6 +1251,14 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
 
                 const llama_pos p1 = ubatch->pos[i];
 
+                // for M-RoPE
+                llama_kv_pos_mrope p1_mrope;
+                if (ubatch->has_mrope()) {
+                    p1_mrope.x = ubatch->pos[i + ubatch->n_tokens];
+                    p1_mrope.y = ubatch->pos[i + ubatch->n_tokens*2];
+                    p1_mrope.t = ubatch->pos[i + ubatch->n_tokens*3];
+                }
+
                 const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
 
                 for (uint32_t j = 0; j < n_kv; ++j) {
@@ -1262,6 +1278,14 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                         continue;
                     }
 
+                    // M-RoPE causal mask
+                    if (causal_attn && ubatch->has_mrope() && p0 == p1) {
+                        const auto & p0_mrope = cells.pos_mrope_get(j);
+                        if (p0_mrope.is_gt(p1_mrope)) {
+                            continue;
+                        }
+                    }
+
                    // apply SWA if any
                    if (is_masked_swa(p0, p1)) {
                        continue;
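With M-RoPE, the tokens of a single media chunk (e.g. the patches of one image) can share the same primary position, so p0 == p1 alone cannot decide causality; the tie is broken by comparing the (t, y, x) tuples stored in the cells. An illustrative example of the tie-break (values invented for the example):

    // two cells at the same primary position p0 == p1,
    // e.g. two patches of the same image
    llama_kv_pos_mrope a = { /*x =*/ 0, /*y =*/ 1, /*t =*/ 0 };
    llama_kv_pos_mrope b = { /*x =*/ 3, /*y =*/ 0, /*t =*/ 0 };

    // same t, but a.y > b.y, so "a" is ordered after "b": under causal
    // attention, the query at "b" must not attend to the cell at "a"
    assert( a.is_gt(b));
    assert(!b.is_gt(a));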

src/llama-kv-cells.h

Lines changed: 39 additions & 4 deletions
@@ -9,6 +9,18 @@
 #include <set>
 #include <map>
 
+struct llama_kv_pos_mrope {
+    llama_pos x;
+    llama_pos y;
+    llama_pos t;
+    // return true if this position is greater than the other position
+    bool is_gt(const llama_kv_pos_mrope & other) const {
+        return (t > other.t)
+            || (t == other.t && y > other.y)
+            || (t == other.t && y == other.y && x > other.x);
+    }
+};
+
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
 class llama_kv_cells {
@@ -43,6 +55,7 @@ class llama_kv_cells {
 
     void resize(uint32_t n) {
         pos.resize(n);
+        pos_mrope.resize(n);
         shift.resize(n);
         seq.resize(n);
 
@@ -107,8 +120,9 @@ class llama_kv_cells {
         for (uint32_t j = 0; j < n; ++j) {
             const auto idx = i + j;
 
-            res.pos[j] = pos[idx];
-            res.seq[j] = seq[idx];
+            res.pos      [j] = pos[idx];
+            res.pos_mrope[j] = pos_mrope[idx];
+            res.seq      [j] = seq[idx];
 
             assert(shift[idx] == 0);
         }
@@ -125,8 +139,9 @@ class llama_kv_cells {
         for (uint32_t j = 0; j < idxs.size(); ++j) {
             const auto idx = idxs[j];
 
-            res.pos[j] = pos[idx];
-            res.seq[j] = seq[idx];
+            res.pos      [j] = pos[idx];
+            res.pos_mrope[j] = pos_mrope[idx];
+            res.seq      [j] = seq[idx];
 
             assert(shift[idx] == 0);
         }
@@ -340,6 +355,13 @@ class llama_kv_cells {
         return pos[i];
     }
 
+    const llama_kv_pos_mrope & pos_mrope_get(uint32_t i) const {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        return pos_mrope[i];
+    }
+
     // note: call only if the cell is not empty
     llama_pos get_shift(uint32_t i) const {
         assert(i < pos.size());
@@ -368,6 +390,16 @@ class llama_kv_cells {
         used.insert(i);
     }
 
+    void pos_mrope_set(uint32_t i, llama_kv_pos_mrope p) {
+        assert(i < pos.size());
+        assert(pos[i] == -1);
+        assert(seq[i].none());
+
+        pos_mrope[i] = p;
+
+        used.insert(i);
+    }
+
     // pos[i] = pos[i] + d
     // sets "has_shift" to true
     // note: call only if the cell is not empty
@@ -424,6 +456,9 @@ class llama_kv_cells {
 
     std::vector<llama_pos> pos;
 
+    // stores additional info for M-RoPE positions
+    std::vector<llama_kv_pos_mrope> pos_mrope;
+
     // this array accumulates any applied shifts to the pos array since the last reset_shift() call
     // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
     //
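The ordering implemented by is_gt() is a strict lexicographic "greater than" on the tuple (t, y, x): the temporal component dominates, then the row, then the column. An equivalent formulation (not in the commit) using std::tie from <tuple>:

    bool is_gt(const llama_kv_pos_mrope & other) const {
        return std::tie(t, y, x) > std::tie(other.t, other.y, other.x);
    }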
