From e549515cb3f238552e5c13b1df91c2ab64eb216b Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 3 Aug 2025 00:45:47 -0400 Subject: [PATCH] memory : handle kv_unified for hybrid models --- src/llama-memory-hybrid.cpp | 3 ++- src/llama-memory-hybrid.h | 1 + src/llama-model.cpp | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index d8e2086c87514..e98b4e3546959 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid( /* common */ uint32_t n_seq_max, bool offload, + bool unified, /* layer filters */ layer_filter_cb && filter_attn, layer_filter_cb && filter_recr) : @@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid( type_v, v_trans, offload, - 1, + unified, kv_size, n_seq_max, n_pad, diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h index 4ac318175785e..c2d56cd541594 100644 --- a/src/llama-memory-hybrid.h +++ b/src/llama-memory-hybrid.h @@ -39,6 +39,7 @@ class llama_memory_hybrid : public llama_memory_i { /* common */ uint32_t n_seq_max, bool offload, + bool unified, /* layer filters */ layer_filter_cb && filter_attn = nullptr, layer_filter_cb && filter_recr = nullptr); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6b58fb8a059f4..60a615c159a51 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -17598,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max), /* n_seq_max */ cparams.n_seq_max, /* offload */ cparams.offload_kqv, + /* unified */ cparams.kv_unified, /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr, /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr); } else {