@@ -1018,21 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
 
     const int32_t ikv = map_layer_ids.at(il);
 
-    auto * k = layers[ikv].k;
+    ggml_tensor * k = layers[ikv].k;
+
+    const int64_t n_embd_head = k_cur->ne[0];
+    const int64_t n_head      = k_cur->ne[1];
+    const int64_t n_token     = k_cur->ne[2];
 
-    const int64_t n_embd_k_gqa = k_cur->ne[0]*k_cur->ne[1];
-    const int64_t n_tokens = k_cur->ne[2];
+    const int64_t n_embd_gqa = n_embd_head*n_head;
 
     // we can merge dims 0 and 1
-    assert(k_cur->nb[0]*k_cur->ne[0] == k_cur->nb[1]);
+    // TODO: add ggml helper function for this?
+    assert(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
+
+    k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_token, k_cur->nb[2], 0);
+
+    const int64_t n_stream = k->ne[2];
+
+    if (n_stream > 1) {
+        const uint64_t kv_size = get_size();
 
-    // k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
-    k_cur = ggml_view_2d(ctx, k_cur, n_embd_k_gqa, n_tokens, k_cur->nb[2], 0);
+        assert(n_embd_gqa == k->ne[0]);
+        assert(kv_size == k->ne[1]);
 
-    if (k->ne[2] > 1) {
-        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
+        // merge the buffer across all streams because the idxs are global
+        k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
    }
 
+    // store the current K values into the cache
     return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
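The crux of `cpy_k` is the no-copy 2D view: `k_cur` arrives as `[n_embd_head, n_head, n_token]`, and dims 0 and 1 can be merged into a single `n_embd_gqa` row only when the dim-1 stride is exactly one head row, which is what the `ggml_row_size` assert verifies. When the cache holds multiple streams, the `[n_embd_gqa, kv_size, n_stream]` buffer is flattened to 2D so the global row indices in `k_idxs` can address any stream. A minimal self-contained sketch of both steps (shapes are made-up example values, not taken from the diff; only public ggml API calls are used):

```cpp
#include "ggml.h"
#include <cassert>

int main() {
    ggml_init_params params = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(params);

    // current-batch K: e.g. 128-dim heads, 8 KV heads, 4 tokens
    ggml_tensor * k_cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 8, 4);

    // dims 0 and 1 are mergeable iff the dim-1 stride equals one head row
    assert(ggml_row_size(k_cur->type, k_cur->ne[0]) == k_cur->nb[1]);

    // no-copy view: [n_embd_head, n_head, n_token] -> [n_embd_gqa, n_token]
    ggml_tensor * k_2d = ggml_view_2d(ctx, k_cur, 128*8, 4, k_cur->nb[2], 0);

    // a 2-stream cache [n_embd_gqa, kv_size, n_stream] ...
    ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128*8, 1024, 2);

    // ... flattened so the global row idxs used by ggml_set_rows reach every stream
    k = ggml_reshape_2d(ctx, k, 128*8, 1024*2);

    assert(k->ne[0] == k_2d->ne[0] && k->ne[1] == 2048);

    ggml_free(ctx);
    return 0;
}
```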
@@ -1043,39 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
 
     auto * v = layers[ikv].v;
 
-    const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
-    const int64_t n_tokens = v_cur->ne[2];
+    const int64_t n_embd_head = v_cur->ne[0];
+    const int64_t n_head      = v_cur->ne[1];
+    const int64_t n_token     = v_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;
 
     // we can merge dims 0 and 1
-    assert(v_cur->nb[0]*v_cur->ne[0] == v_cur->nb[1]);
+    assert(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
 
-    // v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
+    const int64_t n_stream = v->ne[2];
 
+    // take this branch when FA is enabled (the V cache is not transposed)
     if (!v_trans) {
-        v_cur = ggml_view_2d(ctx, v_cur, n_embd_v_gqa, n_tokens, v_cur->nb[2], 0);
+        v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_token, v_cur->nb[2], 0);
+
+        if (n_stream > 1) {
+            const uint64_t kv_size = get_size();
+
+            assert(n_embd_gqa == v->ne[0]);
+            assert(kv_size == v->ne[1]);
 
-        if (v->ne[2] > 1) {
-            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
+            // merge the buffer across all streams because the idxs are global
+            v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
         }
 
         return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
-    if (v_cur->nb[1]*v_cur->ne[1] != v_cur->nb[2]) {
-        v_cur = ggml_cont_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
+    if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
+        // we can merge dims 0, 1 and 2
+        v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_token);
     } else {
-        v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
+        // otherwise -> make a copy to get contiguous data
+        v_cur = ggml_cont_2d(ctx, v_cur, n_embd_gqa, n_token);
     }
 
     // [TAG_V_CACHE_VARIABLE]
-    if (n_embd_v_gqa < v->ne[0]) {
-        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (n_embd_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
     }
 
-    // the row becomes a single element
-    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
+    // in this branch the v_idxs are constructed in such a way that each row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
 
-    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
 
     return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
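
When the V cache is transposed (the non-FA path), a token's value vector occupies one element in each of `n_embd_gqa` rows rather than one contiguous row, so `ggml_set_rows` cannot store a token per row. Instead both the cache and `v_cur` are reshaped to `[1, nelements]`, turning each element into its own row, and `v_idxs` supplies one flat destination index per element. A hedged sketch of how such per-element indices could line up (hypothetical helper, not the actual `v_idxs` construction in this patch): for a transposed per-stream layout `[kv_size, n_embd_gqa]`, element `j` of a token stored in cell `pos` of stream `s` flattens to `s*kv_size*n_embd_gqa + j*kv_size + pos`.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical illustration of per-element scatter indices for the transposed V cache.
// One index is produced per element of v_cur, in the same order in which v_cur is
// flattened to [1, n_embd_gqa*n_token] (dim 0 varies fastest in ggml).
std::vector<int64_t> v_idxs_trans(const std::vector<int64_t> & pos, // dst cell per token
                                  int64_t n_embd_gqa, int64_t kv_size, int64_t stream) {
    std::vector<int64_t> idxs;
    idxs.reserve(pos.size()*n_embd_gqa);
    for (int64_t p : pos) {
        for (int64_t j = 0; j < n_embd_gqa; ++j) {
            // element j of the token lands in row j, column p of the transposed cache
            idxs.push_back(stream*kv_size*n_embd_gqa + j*kv_size + p);
        }
    }
    return idxs;
}
```

The emission order matches `v_cur` after its reshape: each token's `n_embd_gqa` elements are consecutive, so entry `t*n_embd_gqa + j` of `v_idxs` addresses element `j` of token `t`.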