@@ -19641,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
     }
 };

-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
     llama_memory_i * res;

     switch (arch) {
@@ -19692,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         };
                     }

-                    const auto padding = llama_kv_cache::get_padding(cparams);
-
-                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
                     res = new llama_memory_hybrid(
                         /* model             */ *this,
                         /* attn_type_k       */ params.type_k,
                         /* attn_type_v       */ params.type_v,
                         /* attn_v_trans      */ !cparams.flash_attn,
                         /* attn_kv_size      */ cparams.n_ctx,
-                        /* attn_n_pad        */ padding,
+                        /* attn_n_pad        */ 1,
                         /* attn_n_swa        */ hparams.n_swa,
                         /* attn_swa_type     */ hparams.swa_type,
                         /* recurrent_type_k  */ GGML_TYPE_F32,
@@ -19714,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         /* filter_attn       */ std::move(filter_attn),
                         /* filter_recr       */ std::move(filter_recr));
                 } else {
-                    const auto padding = llama_kv_cache::get_padding(cparams);
-
                     uint32_t n_ctx_per_stream = cparams.n_ctx;

                     if (!cparams.kv_unified) {
                         n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                        cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
-                    } else {
-                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                        cparams.n_ctx = n_ctx_per_stream;
                     }

-                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
                     llama_memory_i::layer_reuse_cb reuse = nullptr;

                     if (arch == LLM_ARCH_GEMMA3N) {
@@ -19757,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                                 n_ctx_per_stream,
                                 cparams.n_seq_max,
                                 cparams.n_ubatch,
-                                padding,
+                                1,
                                 nullptr,
                                 reuse);
                     } else {
@@ -19772,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                                 cparams.kv_unified,
                                 n_ctx_per_stream,
                                 cparams.n_seq_max,
-                                padding,
+                                1,
                                 hparams.n_swa,
                                 hparams.swa_type,
                                 nullptr,
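
Note: after this diff, `create_memory` takes `cparams` by `const` reference and passes an `n_pad` of 1 to the memory constructors, so the rounding that the deleted lines performed on `n_ctx` is presumably applied before this function runs. As a point of reference only, the sketch below reproduces what the removed padding logic computed, reusing the identifiers from the deleted lines; the helper name `padded_n_ctx` is hypothetical and this is not necessarily where the new code does it.

```cpp
// Sketch (assumption): per-stream padding as done by the deleted lines.
// Relies on llama.cpp internals (llama_cparams, llama_kv_cache::get_padding,
// GGML_PAD) being in scope; the helper itself is hypothetical.
static uint32_t padded_n_ctx(const llama_cparams & cparams) {
    const uint32_t padding = llama_kv_cache::get_padding(cparams);

    if (!cparams.kv_unified) {
        // split the context evenly across sequences, then round each stream up
        uint32_t n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
        return n_ctx_per_stream*cparams.n_seq_max;
    }

    // unified KV cache: round the whole context up to the padding
    return GGML_PAD(cparams.n_ctx, padding);
}
```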