@@ -280,7 +280,7 @@ llama_context::llama_context(
280
280
}
281
281
282
282
// reserve worst-case graph
283
- if (!hparams.vocab_only && memory ) {
283
+ if (!hparams.vocab_only ) {
284
284
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max ;
285
285
const uint32_t n_tokens = std::min (cparams.n_ctx , cparams.n_ubatch );
286
286
@@ -292,11 +292,13 @@ llama_context::llama_context(
292
292
int n_splits_tg = -1 ;
293
293
int n_nodes_tg = -1 ;
294
294
295
- // simulate full KV cache
296
-
297
- const auto mctx = memory->init_full ();
298
- if (!mctx) {
299
- throw std::runtime_error (" failed to initialize KV cache" );
295
+ llama_memory_context_ptr mctx;
296
+ if (memory) {
297
+ LLAMA_LOG_DEBUG (" %s: reserving full memory module\n " , __func__);
298
+ mctx = memory->init_full ();
299
+ if (!mctx) {
300
+ throw std::runtime_error (" failed to initialize memory module" );
301
+ }
300
302
}
301
303
302
304
cross.v_embd .clear ();
@@ -1056,7 +1058,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
1056
1058
const auto * res = process_ubatch (ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get (), status);
1057
1059
1058
1060
if (!res) {
1059
- // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
1061
+ // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
1060
1062
llama_pos pos_min[LLAMA_MAX_SEQ];
1061
1063
for (int s = 0 ; s < LLAMA_MAX_SEQ; ++s) {
1062
1064
pos_min[s] = std::numeric_limits<llama_pos>::max ();
@@ -1073,7 +1075,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
1073
1075
continue ;
1074
1076
}
1075
1077
1076
- LLAMA_LOG_WARN (" %s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n " , __func__, s, pos_min[s]);
1078
+ LLAMA_LOG_WARN (" %s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n " , __func__, s, pos_min[s]);
1077
1079
1078
1080
memory->seq_rm (s, pos_min[s], -1 );
1079
1081
}
@@ -1857,7 +1859,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
1857
1859
}
1858
1860
1859
1861
if (memory != nullptr ) {
1860
- LLAMA_LOG_DEBUG (" %s: - writing KV self \n " , __func__);
1862
+ LLAMA_LOG_DEBUG (" %s: - writing memory module \n " , __func__);
1861
1863
memory->state_write (io);
1862
1864
}
1863
1865
@@ -1943,7 +1945,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
1943
1945
}
1944
1946
1945
1947
if (memory) {
1946
- LLAMA_LOG_DEBUG (" %s: - reading KV self \n " , __func__);
1948
+ LLAMA_LOG_DEBUG (" %s: - reading memory module \n " , __func__);
1947
1949
1948
1950
memory->state_read (io);
1949
1951
}
0 commit comments