kv-cells : improve ext handling

ggerganov · ggerganov · commit bed0f57fa23d · 2025-10-29T15:52:49.000+02:00
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -338,6 +338,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
             llama_pos pos   = v_cells[s0].pos_get(i);
             llama_pos shift = v_cells[s0].get_shift(i);
 
+            llama_kv_cell_ext ext = v_cells[s0].ext_get(i);
+
             if (shift != 0) {
                 pos -= shift;
                 assert(pos >= 0);
@@ -349,6 +351,8 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
             if (shift != 0) {
                 v_cells[s1].pos_add(i, shift);
             }
+
+            v_cells[s1].ext_set(i, ext);
         }
     }
 
@@ -383,6 +387,7 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
 
 void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+    GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1");
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
     auto & head  = v_heads[seq_to_stream[seq_id]];
@@ -427,6 +432,7 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll
 
 void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
+    GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1");
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
 
@@ -905,7 +911,7 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
                     /*.x =*/ ubatch.pos[i + ubatch.n_tokens*2],
                     /*.y =*/ ubatch.pos[i + ubatch.n_tokens],
                 };
-                cells.ext_set(idx, std::move(ext));
+                cells.ext_set(idx, ext);
             }
 
             for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h
@@ -18,6 +18,12 @@ struct llama_kv_cell_ext {
     bool is_2d_gt(llama_pos ox, llama_pos oy) const {
         return (y > oy) || (y == oy && x > ox);
     }
+
+    void reset() { // clear this ext record by zeroing the whole object
+        static_assert(std::is_trivially_copyable_v<llama_kv_cell_ext>); // compile-time guard for the raw memset below
+
+        memset(this, 0, sizeof(*this)); // sets every byte of *this to 0
+    }
 };
 
 // meta information about KV cells that can be part of multiple sequences at the same time
@@ -27,6 +33,7 @@ class llama_kv_cells {
     void reset() {
         for (uint32_t i = 0; i < pos.size(); ++i) {
             pos[i]   = -1;
+            ext[i].reset();
             shift[i] =  0;
             seq[i].reset();
         }
@@ -168,6 +175,7 @@ class llama_kv_cells {
             }
 
             pos[idx] = other.pos[j];
+            ext[idx] = other.ext[j];
             seq[idx] = other.seq[j];
 
             if (pos[idx] != -1) {
@@ -198,6 +206,7 @@ class llama_kv_cells {
             }
 
             pos[idx] = other.pos[j];
+            ext[idx] = other.ext[j];
             seq[idx] = other.seq[j];
 
             if (pos[idx] != -1) {
@@ -217,6 +226,7 @@ class llama_kv_cells {
         seq[i].reset();
 
         pos[i] = -1;
+        ext[i].reset();
         shift[i] = 0;
 
         used.erase(i);
@@ -235,6 +245,7 @@ class llama_kv_cells {
 
         if (seq[i].none()) {
             pos[i] = -1;
+            ext[i].reset();
             shift[i] = 0;
 
             used.erase(i);
@@ -264,6 +275,7 @@ class llama_kv_cells {
             seq[i].reset();
 
             pos[i] = -1;
+            ext[i].reset();
             shift[i] = 0;
 
             used.erase(i);
@@ -389,9 +401,9 @@ class llama_kv_cells {
         used.insert(i);
     }
 
-    void ext_set(uint32_t i, llama_kv_cell_ext && p) {
+    void ext_set(uint32_t i, llama_kv_cell_ext p) { // store p as the ext info of cell i; by-value so lvalues can be passed too
         assert(i < ext.size());
-        ext[i] = std::move(p);
+        ext[i] = p;
     }
 
     // pos[i] = pos[i] + d