@@ -18316,6 +18316,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 std::max((uint32_t) 1, cparams.n_seq_max),
                 cparams.n_seq_max);
     } else if (llm_arch_is_hybrid(arch)) {
+
+        // The main difference between hybrid architectures is the
+        // layer filters, so pick the right one here
+        llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+        llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+        if (arch == LLM_ARCH_FALCON_H1) {
+            filter_attn = [&](int32_t) { return true; };
+            filter_recr = [&](int32_t) { return true; };
+        } else if (arch == LLM_ARCH_NEMOTRONH) {
+            filter_attn = [&](int32_t il) {
+                return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+            };
+            filter_recr = [&](int32_t il) {
+                return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+            };
+        }
+
         const auto padding = llama_kv_cache::get_padding(cparams);
 
         cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18335,8 +18352,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 /* n_seq_max   */ cparams.n_seq_max,
                 /* offload     */ cparams.offload_kqv,
                 /* unified     */ cparams.kv_unified,
-                /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr ,
-                /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr );
+                /* filter_attn */ std::move(filter_attn) ,
+                /* filter_recr */ std::move(filter_recr) );
     } else {
         const auto padding = llama_kv_cache::get_padding(cparams);
 
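For readers less familiar with the hybrid-cache plumbing, the sketch below shows the pattern the diff switches to: instead of an inline ternary that only handles FALCON_H1, a named per-architecture filter callback decides which layers feed the attention KV cache and which feed the recurrent cache. This is a minimal, self-contained illustration rather than llama.cpp code: layer_filter_cb is modeled here as a plain std::function<bool(int32_t)>, and hparams_t, n_ff_of, and the toy layer layout are invented stand-ins; only the two predicates mirror the NEMOTRONH branch in the diff above.

// Minimal sketch (assumed types, not llama.cpp code): per-layer filter
// callbacks partition a hybrid model's layers between an attention KV
// cache and a recurrent cache.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// Stand-in for llama_memory_hybrid::layer_filter_cb.
using layer_filter_cb = std::function<bool(int32_t)>;

// Hypothetical hyperparameter view; the real hparams live in llama.cpp.
struct hparams_t {
    std::vector<bool>     recurrent; // true for SSM/recurrent layers
    std::vector<uint32_t> n_ff;      // feed-forward size; 0 for non-FFN layers here

    bool     is_recurrent(int32_t il) const { return recurrent[il]; }
    uint32_t n_ff_of     (int32_t il) const { return n_ff[il]; }
};

int main() {
    // Toy layout: layers 0-1 recurrent, layer 2 attention, layer 3 a pure FFN block.
    hparams_t hparams = { { true, true, false, false }, { 0, 0, 0, 4096 } };

    // Predicates analogous to the NEMOTRONH branch in the diff above.
    layer_filter_cb filter_attn = [&](int32_t il) {
        return !hparams.is_recurrent(il) && hparams.n_ff_of(il) == 0;
    };
    layer_filter_cb filter_recr = [&](int32_t il) {
        return hparams.is_recurrent(il) && hparams.n_ff_of(il) == 0;
    };

    for (int32_t il = 0; il < 4; ++il) {
        std::printf("layer %d -> attn cache: %d, recurrent cache: %d\n",
                (int) il, (int) filter_attn(il), (int) filter_recr(il));
    }
    return 0;
}

The practical upside of the refactor is visible in the second hunk: the predicates are chosen once, up front, and handed to the llama_memory_hybrid constructor via std::move, so supporting another hybrid architecture only means adding another branch to the filter selection rather than growing the ternary in the argument list.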