Commit c3e1393: address review comments

1 parent 90353ea

File tree: 4 files changed, +44 −35 lines


src/llama-batch.cpp

Lines changed: 3 additions & 0 deletions
@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
         /*.n_seq_tokens =*/ (uint32_t) 1,
         /*.n_seqs       =*/ (uint32_t) batch.n_tokens,
         /*.n_seqs_unq   =*/ (uint32_t) this->seq_id_unq.size(),
+        /*.n_pos        =*/ n_pos_per_embd,
         /*.token        =*/ batch.token,
         /*.embd         =*/ batch.embd,
         /*.pos          =*/ batch.pos,
@@ -382,6 +383,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ n_seqs,
+        /*.n_pos        =*/ n_pos_per_embd,

         /*.token        =*/ udata->token.data(),
         /*.embd         =*/ nullptr,
@@ -703,6 +705,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         /*.n_seq_tokens =*/ n_tokens/n_seqs,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
+        /*.n_pos        =*/ n_pos_per_embd,

         /*.token        =*/ batch.token ? udata->token.data() : nullptr,
         /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
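All three hunks thread the same value through: each llama_ubatch now carries how many position values accompany every token, instead of inferring it from buffer sizes (the removed has_mrope() below compared against token.size()*4). A minimal sketch of how a caller might size the flat pos buffer to match; this helper is illustrative, not part of the commit:

// Hypothetical sketch: sizing the position buffer for a ubatch.
// With plain RoPE, n_pos_per_embd is 1; with M-RoPE it is 4
// (sequential, y, x, other), so the buffer holds n_tokens*n_pos entries.
#include <cstdint>
#include <vector>

using llama_pos = int32_t; // as in llama.h

std::vector<llama_pos> alloc_pos(uint32_t n_tokens, uint32_t n_pos_per_embd) {
    // plane d of token i lives at index i + n_tokens*d
    return std::vector<llama_pos>(size_t(n_tokens) * n_pos_per_embd, 0);
}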

src/llama-batch.h

Lines changed: 10 additions & 3 deletions
@@ -17,8 +17,14 @@ struct llama_ubatch {
         return b_equal_seqs != 0;
     }

-    bool has_mrope() const {
-        return data->pos.size() == data->token.size()*4;
+    // typical for M-RoPE cases:
+    //   0 - sequential position of the tokens/embeddings in the sequence
+    //   1 - y position in the image
+    //   2 - x position in the image
+    //   3 - other
+    bool is_pos_2d() const {
+        // TODO @ngxson : we may need to check for model arch when more models use >1 positions
+        return n_pos >= 3;
     }

     uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
@@ -29,6 +35,7 @@ struct llama_ubatch {
     uint32_t n_seq_tokens; // tokens per sequence set
     uint32_t n_seqs;       // sequence sets in the ubatch
     uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+    uint32_t n_pos;        // number of position inputs for each token/embedding

     // seq_id_unq: unique sequence ids in the ubatch
     // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -37,7 +44,7 @@ struct llama_ubatch {
     //                          // size               | idx | val
     llama_token  *  token;      // [n_tokens]         | i   | id, token
     float        *  embd;       // [n_embd, n_tokens] | i   | embd
-    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    llama_pos    *  pos;        // [n_tokens*n_pos]   | i   | pos
     int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
     llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
     llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
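Given the [n_tokens*n_pos] layout documented above, reading a token's positions back out is plain index arithmetic. A short sketch, assuming llama-batch.h is included and ubatch.is_pos_2d() holds; the helper name is illustrative:

// Illustrative only: decode the 2D position planes of token i,
// matching the indexing used in llama-kv-cache.cpp below.
#include <cstdint>
#include <utility>

static std::pair<llama_pos, llama_pos> token_xy(const llama_ubatch & ubatch, uint32_t i) {
    const llama_pos y = ubatch.pos[i + ubatch.n_tokens];    // plane 1: image row
    const llama_pos x = ubatch.pos[i + ubatch.n_tokens*2];  // plane 2: image column
    return {x, y};
}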

src/llama-kv-cache.cpp

Lines changed: 10 additions & 13 deletions
@@ -900,11 +900,11 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &

         cells.pos_set(idx, ubatch.pos[i]);

-        if (ubatch.has_mrope()) {
-            cells.pos_mrope_set(idx, {
-                ubatch.pos[i + ubatch.n_tokens],   // y
-                ubatch.pos[i + ubatch.n_tokens*2], // x
-            });
+        if (ubatch.is_pos_2d()) {
+            llama_kv_cell_ext ext;
+            ext.x = ubatch.pos[i + ubatch.n_tokens*2];
+            ext.y = ubatch.pos[i + ubatch.n_tokens];
+            cells.ext_set(idx, std::move(ext));
         }

         for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
@@ -1251,11 +1251,8 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                 const llama_pos p1 = ubatch->pos[i];

                 // for M-RoPE
-                llama_kv_pos_mrope p1_mrope;
-                if (ubatch->has_mrope()) {
-                    p1_mrope.y = ubatch->pos[i + ubatch->n_tokens];
-                    p1_mrope.x = ubatch->pos[i + ubatch->n_tokens*2];
-                }
+                llama_pos p1_x = ubatch->pos[i + ubatch->n_tokens*2];
+                llama_pos p1_y = ubatch->pos[i + ubatch->n_tokens];

                 const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);

@@ -1277,9 +1274,9 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                     }

                     // M-RoPE causal mask
-                    if (causal_attn && ubatch->has_mrope() && p0 == p1) {
-                        const auto & p0_mrope = cells.pos_mrope_get(j);
-                        if (p0_mrope.is_gt(p1_mrope)) {
+                    if (causal_attn && ubatch->is_pos_2d() && p0 == p1) {
+                        const auto & p0_ext = cells.ext_get(j);
+                        if (p0_ext.is_2d_gt(p1_x, p1_y)) {
                             continue;
                         }
                     }
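The masking change works in two steps: the ordinary causal test on the sequential position runs first, and only when two tokens tie (p0 == p1, as happens for the patches of a single image) does the 2D comparison break the tie. A condensed, standalone sketch of that rule; the real loop also checks sequence membership, SWA, and so on:

#include <cstdint>

using llama_pos = int32_t; // as in llama.h

// Condensed sketch of the rule above, not the verbatim loop body.
static bool masked_2d(llama_pos p0,   llama_pos p1,    // sequential positions
                      llama_pos p0_x, llama_pos p0_y,  // key (cached cell ext)
                      llama_pos p1_x, llama_pos p1_y)  // query (current token)
{
    if (p0 > p1) {
        return true; // ordinary causal mask on the sequential position
    }
    if (p0 == p1) {
        // tie-break in raster order: same test as llama_kv_cell_ext::is_2d_gt
        return (p0_y > p1_y) || (p0_y == p1_y && p0_x > p1_x);
    }
    return false;
}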

src/llama-kv-cells.h

Lines changed: 21 additions & 19 deletions
@@ -9,12 +9,14 @@
 #include <set>
 #include <map>

-struct llama_kv_pos_mrope {
-    llama_pos y = 0;
+struct llama_kv_cell_ext {
+    // 2D spatial positions, typically used for M-RoPE
     llama_pos x = 0;
-    // return true if this position is greater than the other position
-    bool is_gt(const llama_kv_pos_mrope & other) const {
-        return (y > other.y) || (y == other.y && x > other.x);
+    llama_pos y = 0;
+
+    // return true if the current 2D spatial position is greater than other
+    bool is_2d_gt(llama_pos ox, llama_pos oy) const {
+        return (y > oy) || (y == oy && x > ox);
     }
 };

@@ -52,7 +54,7 @@ class llama_kv_cells {

     void resize(uint32_t n) {
         pos.resize(n);
-        pos_mrope.resize(n);
+        ext.resize(n);
         shift.resize(n);
         seq.resize(n);

@@ -117,9 +119,9 @@ class llama_kv_cells {
         for (uint32_t j = 0; j < n; ++j) {
             const auto idx = i + j;

-            res.pos      [j] = pos[idx];
-            res.pos_mrope[j] = pos_mrope[idx];
-            res.seq      [j] = seq[idx];
+            res.pos[j] = pos[idx];
+            res.ext[j] = ext[idx];
+            res.seq[j] = seq[idx];

             assert(shift[idx] == 0);
         }
@@ -136,9 +138,9 @@ class llama_kv_cells {
         for (uint32_t j = 0; j < idxs.size(); ++j) {
             const auto idx = idxs[j];

-            res.pos      [j] = pos[idx];
-            res.pos_mrope[j] = pos_mrope[idx];
-            res.seq      [j] = seq[idx];
+            res.pos[j] = pos[idx];
+            res.ext[j] = ext[idx];
+            res.seq[j] = seq[idx];

             assert(shift[idx] == 0);
         }
@@ -352,11 +354,11 @@ class llama_kv_cells {
         return pos[i];
     }

-    const llama_kv_pos_mrope & pos_mrope_get(uint32_t i) const {
+    const llama_kv_cell_ext & ext_get(uint32_t i) const {
         assert(i < pos.size());
         assert(pos[i] != -1);

-        return pos_mrope[i];
+        return ext[i];
     }

     // note: call only if the cell is not empty
@@ -387,9 +389,9 @@ class llama_kv_cells {
         used.insert(i);
     }

-    void pos_mrope_set(uint32_t i, llama_kv_pos_mrope p) {
-        assert(i < pos_mrope.size());
-        pos_mrope[i] = p;
+    void ext_set(uint32_t i, llama_kv_cell_ext && p) {
+        assert(i < ext.size());
+        ext[i] = std::move(p);
     }

     // pos[i] = pos[i] + d
@@ -448,8 +450,8 @@ class llama_kv_cells {

     std::vector<llama_pos> pos;

-    // stores addition info for M-RoPE positions
-    std::vector<llama_kv_pos_mrope> pos_mrope;
+    // stores extra info per cell
+    std::vector<llama_kv_cell_ext> ext;

     // this array accumulates any applied shifts to the pos array since the last reset_shift() call
     // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
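As a quick sanity check of the renamed comparison, is_2d_gt orders positions row-major: the y coordinate dominates and x breaks ties within a row. A standalone example (not part of the commit, assumes llama-kv-cells.h is included):

#include <cassert>

// assumes "llama-kv-cells.h" is included for llama_kv_cell_ext

int main() {
    llama_kv_cell_ext a;
    a.x = 5;
    a.y = 2;

    assert( a.is_2d_gt(4, 2)); // same row, a is further right
    assert( a.is_2d_gt(9, 1)); // a is on a later row, column is irrelevant
    assert(!a.is_2d_gt(5, 2)); // equal position is not "greater"
    assert(!a.is_2d_gt(0, 3)); // earlier row, even though a.x is larger

    return 0;
}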
