
Commit e28cec3

memory : remove KV cache size padding
1 parent 3479efd commit e28cec3

4 files changed, +5 −28 lines


src/llama-kv-cache.cpp

Lines changed: 0 additions & 5 deletions

@@ -2010,8 +2010,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
 void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_pos_bucket(dst, ubatch);
 }
-
-uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
-    // the FA kernels require padding to avoid extra runtime boundary checks
-    return cparams.flash_attn ? 256u : 32u;
-}
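For context, a small standalone sketch (not part of this commit; pad_to is a hypothetical helper with example values) of what the removed get_padding() implied: with flash attention enabled the KV cache size was rounded up to a multiple of 256, otherwise to a multiple of 32, using the same round-up-to-multiple behaviour as the GGML_PAD macro.

// Illustration only: effect of the removed padding on the KV cache size.
// pad_to() mirrors GGML_PAD-style rounding (n must be a power of two).
#include <cstdint>
#include <cstdio>

static uint32_t pad_to(uint32_t x, uint32_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main() {
    const uint32_t n_ctx = 5000; // example requested context size
    std::printf("flash_attn on : %u -> %u\n", n_ctx, pad_to(n_ctx, 256u)); // 5120
    std::printf("flash_attn off: %u -> %u\n", n_ctx, pad_to(n_ctx, 32u));  // 5024
    return 0;
}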

src/llama-kv-cache.h

Lines changed: 0 additions & 2 deletions

@@ -19,8 +19,6 @@ struct llama_context;
 
 class llama_kv_cache : public llama_memory_i {
 public:
-    static uint32_t get_padding(const llama_cparams & cparams);
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());

src/llama-model.cpp

Lines changed: 4 additions & 19 deletions

@@ -19641,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
     }
 };
 
-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
     llama_memory_i * res;
 
     switch (arch) {
@@ -19692,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 };
             }
 
-            const auto padding = llama_kv_cache::get_padding(cparams);
-
-            cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
             res = new llama_memory_hybrid(
                 /* model             */ *this,
                 /* attn_type_k       */ params.type_k,
                 /* attn_type_v       */ params.type_v,
                 /* attn_v_trans      */ !cparams.flash_attn,
                 /* attn_kv_size      */ cparams.n_ctx,
-                /* attn_n_pad        */ padding,
+                /* attn_n_pad        */ 1,
                 /* attn_n_swa        */ hparams.n_swa,
                 /* attn_swa_type     */ hparams.swa_type,
                 /* recurrent_type_k  */ GGML_TYPE_F32,
@@ -19714,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 /* filter_attn       */ std::move(filter_attn),
                 /* filter_recr       */ std::move(filter_recr));
         } else {
-            const auto padding = llama_kv_cache::get_padding(cparams);
-
             uint32_t n_ctx_per_stream = cparams.n_ctx;
 
             if (!cparams.kv_unified) {
                 n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-                n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
-            } else {
-                n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                cparams.n_ctx = n_ctx_per_stream;
             }
 
-            LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
             llama_memory_i::layer_reuse_cb reuse = nullptr;
 
             if (arch == LLM_ARCH_GEMMA3N) {
@@ -19757,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     n_ctx_per_stream,
                     cparams.n_seq_max,
                     cparams.n_ubatch,
-                    padding,
+                    1,
                     nullptr,
                     reuse);
             } else {
@@ -19772,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     cparams.kv_unified,
                     n_ctx_per_stream,
                     cparams.n_seq_max,
-                    padding,
+                    1,
                     hparams.n_swa,
                     hparams.swa_type,
                     nullptr,
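A minimal sketch of the per-stream KV size computation after this change (standalone recomputation with example values, not the library API): the GGML_PAD round-up and the write-back into cparams.n_ctx are gone, leaving only the ceiling division across sequences.

// Sketch with assumed example values; after this commit the per-stream
// size is a plain ceiling division and cparams.n_ctx is never mutated.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_ctx      = 10000; // requested context size (example)
    const uint32_t n_seq_max  = 3;     // parallel sequences (example)
    const bool     kv_unified = false;

    uint32_t n_ctx_per_stream = n_ctx;
    if (!kv_unified) {
        n_ctx_per_stream = (n_ctx + n_seq_max - 1)/n_seq_max; // ceil(10000/3) = 3334
    }

    std::printf("n_ctx_per_stream = %u\n", n_ctx_per_stream);
    return 0;
}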

src/llama-model.h

Lines changed: 1 addition & 2 deletions

@@ -500,9 +500,8 @@ struct llama_model {
 
     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
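The header change makes the new const-correctness visible to callers; a tiny sketch (hypothetical minimal types, not the real llama.cpp headers) of what the const reference now rules out:

// Hypothetical minimal reproduction of the signature change: with a
// const reference, create_memory-style code can no longer pad n_ctx
// behind the caller's back.
struct cparams_like {
    unsigned n_ctx;
};

void create_memory_like(const cparams_like & cparams) {
    // cparams.n_ctx = (cparams.n_ctx + 31u) & ~31u; // would not compile: read-only reference
    (void) cparams; // the caller's value is observed, never modified
}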
