
Commit fe19219

cont : apply to all iSWA models
ggml-ci
1 parent 8a338c5 commit fe19219


6 files changed: +275 additions, -258 deletions


src/llama-graph.cpp

Lines changed: 17 additions & 33 deletions
@@ -362,22 +362,17 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
     if (self_kq_mask) {
-        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
-    }
-
-    // TODO: remove
-    if (self_kq_mask_swa) {
-        kv_self->set_input_kq_mask_swa(self_kq_mask_swa, ubatch, cparams.causal_attn);
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn, false);
     }
 }
 
 void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
     if (self_kq_mask) {
-        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn, false);
     }
 
     if (self_kq_mask_swa) {
-        kv_self->get_kv_swa()->set_input_kq_mask_swa(self_kq_mask_swa, ubatch, cparams.causal_attn);
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn, true);
     }
 }
 
@@ -427,7 +422,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_layer          (hparams.n_layer),
     n_rot            (hparams.n_rot),
     n_ctx            (cparams.n_ctx),
-    n_ctx_per_seq    (cparams.n_ctx / cparams.n_seq_max),
     n_head           (hparams.n_head()),
     n_head_kv        (hparams.n_head_kv()),
     n_embd_head_k    (hparams.n_embd_head_k),
@@ -1241,6 +1235,9 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
     {
+        GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA");
+        GGML_ASSERT(hparams.n_swa == 0 && "Use llama_kv_cache_unified_iswa for SWA");
+
         const auto n_kv = kv_self->get_n();
 
         inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
@@ -1250,19 +1247,6 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
     }
 
-    // TODO: remove
-    if (hparams.n_swa_pattern > 1) {
-        GGML_ASSERT(hparams.n_swa > 0);
-
-        const auto n_kv = kv_self->get_n();
-
-        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
-        ggml_set_input(inp->self_kq_mask_swa);
-
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
-    }
-
     return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
 }
 

@@ -1292,9 +1276,7 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
     }
 
-    const bool is_swa = hparams.is_swa(il);
-
-    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+    const auto & kq_mask = inp->get_kq_mask();
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = kv_self->get_k(ctx0, il);
@@ -1334,8 +1316,8 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
     }
 
     {
-        GGML_ASSERT(hparams.n_swa_pattern > 1);
-        GGML_ASSERT(hparams.n_swa > 0);
+        GGML_ASSERT(hparams.n_swa_pattern > 1 && "Use llama_kv_cache_unified for non-SWA");
+        GGML_ASSERT(hparams.n_swa > 0 && "Use llama_kv_cache_unified for non-SWA");
 
         const auto n_kv = kv_self->get_kv_swa()->get_n();
 
@@ -1367,21 +1349,23 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
+    const bool is_swa = hparams.is_swa(il);
+
     const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
 
+    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
     // store to KV cache
     {
-        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
-        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
     }
 
-    const bool is_swa = hparams.is_swa(il);
-
     const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
     ggml_tensor * q = q_cur;
-    ggml_tensor * k = kv_self->get_k(ctx0, il);
-    ggml_tensor * v = kv_self->get_v(ctx0, il);
+    ggml_tensor * k = kv->get_k(ctx0, il);
+    ggml_tensor * v = kv->get_v(ctx0, il);
 
     ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
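Taken together, the two build_attn hunks above move the base-vs-SWA decision to the call site: each layer picks one concrete cache half via get_kv_base()/get_kv_swa() and then routes every K/V read and write through it. A minimal, self-contained sketch of that dispatch pattern (stub types only — StubHParams, StubCache and the layer-pattern rule are illustrative stand-ins, not llama.cpp API):

#include <cstdio>

// Illustrative stand-in for llama_hparams; the real SWA-layer rule lives in llama_hparams::is_swa().
struct StubHParams {
    int n_swa_pattern = 6; // assumed example: 5 sliding-window layers followed by 1 full-attention layer
    bool is_swa(int il) const {
        return n_swa_pattern > 1 && (il % n_swa_pattern) != (n_swa_pattern - 1);
    }
};

// Illustrative stand-in for one half of the iSWA cache pair.
struct StubCache {
    const char * name;
    void store(int il) const { std::printf("layer %2d -> %s cache\n", il, name); }
};

int main() {
    StubHParams hparams;
    StubCache kv_base{"base (full-context)"};
    StubCache kv_swa {"SWA (sliding-window)"};

    // Mirrors the new build_attn flow: select the cache half once per layer,
    // then do all reads/writes through that one object.
    for (int il = 0; il < 12; ++il) {
        const bool is_swa = hparams.is_swa(il);
        const StubCache * kv = is_swa ? &kv_swa : &kv_base;
        kv->store(il);
    }
    return 0;
}

Doing the selection once at the call site is what makes the per-layer forwarding helpers on llama_kv_cache_unified_iswa (removed further down in src/llama-kv-cache.cpp) redundant.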

src/llama-graph.h

Lines changed: 0 additions & 4 deletions
@@ -257,12 +257,9 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } // TODO: remove
 
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch] // TODO: remove
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch] // TODO: remove
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -404,7 +401,6 @@ struct llm_graph_context {
     const int64_t n_layer;
     const int64_t n_rot;
     const int64_t n_ctx;         // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
     const int64_t n_head;
     const int64_t n_head_kv;
     const int64_t n_embd_head_k;

src/llama-kv-cache.cpp

Lines changed: 14 additions & 95 deletions
@@ -630,7 +630,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
-void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn, bool swa) const {
     const int64_t n_tokens     = ubatch->n_tokens;
     const int64_t n_seq_tokens = ubatch->n_seq_tokens;
     const int64_t n_seqs       = ubatch->n_seqs;
@@ -674,68 +674,21 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
                     }
                 }
 
-                    if (data) {
-                        data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                    }
-                }
-            }
-        }
-
-        // mask padded tokens
-        if (data) {
-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (int j = 0; j < n_kv; ++j) {
-                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                }
-            }
-        }
-    }
-}
-
-void llama_kv_cache_unified::set_input_kq_mask_swa(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
-    const int64_t n_tokens     = ubatch->n_tokens;
-    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-    const int64_t n_seqs       = ubatch->n_seqs;
-
-    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-    float * data = (float *) dst->data;
-
-    const int64_t n_kv = n;
-
-    for (int h = 0; h < 1; ++h) {
-        for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-            for (int j = 0; j < n_seq_tokens; ++j) {
-                const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-
-                for (int i = 0; i < n_kv; ++i) {
-                    float f;
-                    // mask the token if:
-                    if (!cells[i].has_seq_id(seq_id) // not the correct sequence
-                     || (causal_attn && cells[i].pos > pos) // for causal, mask future tokens
-                    ) {
-                        f = -INFINITY;
-                    } else {
-                        if (hparams.use_alibi) {
-                            f = -std::abs(cells[i].pos - pos);
-                        } else {
-                            f = 0.0f;
+                    if (swa) {
+                        // may need to cut off old tokens for sliding window
+                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
+                        if (hparams.n_attn_chunk) {
+                            llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
+                            if (cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
+                                f = -INFINITY;
+                            }
+                        } else if (hparams.n_swa) {
+                            if (pos - cells[i].pos >= (int32_t) hparams.n_swa) {
+                                f = -INFINITY;
+                            }
                         }
                     }
 
-                    // may need to cut off old tokens for sliding window
-                    // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                    if (hparams.n_attn_chunk) {
-                        llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                        if (cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                            f = -INFINITY;
-                        }
-                    } else {
-                        if (pos - cells[i].pos >= (int32_t)hparams.n_swa) {
-                            f = -INFINITY;
-                        }
-                    }
                     data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                 }
             }
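The new swa flag folds the former set_input_kq_mask_swa into the shared mask routine; the only extra logic it guards is the cut-off rule visible in the added lines. As a standalone illustration of that rule (a hedged sketch — the swa_masked helper and the demo values are hypothetical, only the two conditions are taken from the hunk above):

#include <cstdint>
#include <cstdio>

using llama_pos = int32_t;

// Returns true when the KV cell at kv_pos must be masked for the query at pos,
// per the added swa branch. n_attn_chunk takes precedence over n_swa,
// mirroring the if / else-if order in the diff.
static bool swa_masked(llama_pos pos, llama_pos kv_pos, uint32_t n_attn_chunk, uint32_t n_swa) {
    if (n_attn_chunk) {
        // chunked attention: a token only attends inside its own chunk
        const llama_pos pos_chunk_start = (pos / (llama_pos) n_attn_chunk) * (llama_pos) n_attn_chunk;
        return kv_pos < pos_chunk_start || pos < pos_chunk_start;
    }
    if (n_swa) {
        // sliding window: anything further back than n_swa tokens is cut off
        return pos - kv_pos >= (int32_t) n_swa;
    }
    return false;
}

int main() {
    // sliding window of 4: the query at pos 10 still sees pos 7, but not pos 6
    std::printf("n_swa=4,  pos=10, kv=7 -> %d\n", swa_masked(10, 7, 0, 4)); // 0 (visible)
    std::printf("n_swa=4,  pos=10, kv=6 -> %d\n", swa_masked(10, 6, 0, 4)); // 1 (masked)

    // chunk size 8: the query at pos 10 sits in chunk [8, 16), so pos 7 is masked
    std::printf("chunk=8, pos=10, kv=7 -> %d\n", swa_masked(10, 7, 8, 0)); // 1 (masked)
    std::printf("chunk=8, pos=10, kv=9 -> %d\n", swa_masked(10, 9, 8, 0)); // 0 (visible)
    return 0;
}

As the TODO in the hunk notes, the chunked-attention case reuses the SWA plumbing, which is why both conditions live behind the same flag.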
@@ -891,8 +844,6 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
     const auto & n_embd_head_k = hparams.n_embd_head_k;
   //const auto & n_embd_head_v = hparams.n_embd_head_v;
 
-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
-
   //GGML_ASSERT(kv_self->size == n_ctx);
 
     auto inp = std::make_unique<llm_graph_input_k_shift>(this);
@@ -914,7 +865,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
         const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
         const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
 
-        ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
         ggml_tensor * k =
             ggml_view_3d(ctx, layer.k,
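The dropped local duplicated the n_ctx_per_seq member that this commit also removes from llm_graph_context; passing cparams to get_rope_factors presumably lets the callee derive the same per-sequence figure itself. The arithmetic in question, with example numbers only (not llama.cpp defaults):

#include <cstdint>
#include <cstdio>

int main() {
    // The removed local mirrored llm_graph_context::n_ctx_per_seq:
    // the user-specified context window split evenly across parallel sequences.
    const uint32_t n_ctx     = 8192;
    const uint32_t n_seq_max = 4;

    const uint32_t n_ctx_per_seq = n_ctx / n_seq_max;
    std::printf("n_ctx_per_seq = %u\n", n_ctx_per_seq); // 2048
    return 0;
}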
@@ -1736,38 +1687,6 @@ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id
     kv_swa ->state_read(io, seq_id);
 }
 
-ggml_tensor * llama_kv_cache_unified_iswa::get_k(ggml_context * ctx, int32_t il) const {
-    if (hparams.is_swa(il)) {
-        return kv_swa->get_k(ctx, il);
-    }
-
-    return kv_base->get_k(ctx, il);
-}
-
-ggml_tensor * llama_kv_cache_unified_iswa::get_v(ggml_context * ctx, int32_t il) const {
-    if (hparams.is_swa(il)) {
-        return kv_swa->get_v(ctx, il);
-    }
-
-    return kv_base->get_v(ctx, il);
-}
-
-ggml_tensor * llama_kv_cache_unified_iswa::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
-    if (hparams.is_swa(il)) {
-        return kv_swa->cpy_k(ctx, k_cur, il);
-    }
-
-    return kv_base->cpy_k(ctx, k_cur, il);
-}
-
-ggml_tensor * llama_kv_cache_unified_iswa::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
-    if (hparams.is_swa(il)) {
-        return kv_swa->cpy_v(ctx, v_cur, il);
-    }
-
-    return kv_base->cpy_v(ctx, v_cur, il);
-}
-
 llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
     return kv_base.get();
 }

src/llama-kv-cache.h

Lines changed: 1 addition & 9 deletions
@@ -168,9 +168,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
 
-    void set_input_kq_mask    (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
-    void set_input_kq_mask_swa(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; // TODO: remove
-
+    void set_input_kq_mask    (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn, bool swa) const;
     void set_input_k_shift    (ggml_tensor * dst) const;
     void set_input_pos_bucket (ggml_tensor * dst, const llama_ubatch * ubatch) const;
 

@@ -360,12 +358,6 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {
     // llama_kv_cache_unified_iswa specific API
     //
 
-    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
-    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
-
-    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
-    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
-
     llama_kv_cache_unified * get_kv_base() const;
     llama_kv_cache_unified * get_kv_swa () const;
 
