
Commit 73b85e4

Dynamically modify the context size without resetting the memory.
1 parent f471c74 commit 73b85e4

6 files changed (+36, -12 lines)

include/llama.h

Lines changed: 1 addition & 1 deletion
@@ -490,7 +490,7 @@ extern "C" {
     LLAMA_API uint32_t llama_n_batch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
-    LLAMA_API void llama_mod_n_ctx     (struct llama_context * ctx, uint32_t new_ctx, struct llama_context_params params, const char* dump_file_path);
+    LLAMA_API void llama_mod_n_ctx     (struct llama_context * ctx, uint32_t new_ctx, struct llama_context_params params);

     DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
     DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
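For orientation, a minimal caller-side sketch of the new entry point could look like the following (the model path and context sizes are hypothetical, the current llama.h loader helpers are assumed, and error handling is omitted):

    // Hypothetical usage: create a context with a small n_ctx, then grow it
    // at runtime with the new llama_mod_n_ctx() call.
    struct llama_model_params   mparams = llama_model_default_params();
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 4096;

    struct llama_model   * model = llama_model_load_from_file("model.gguf", mparams);
    struct llama_context * ctx   = llama_init_from_model(model, cparams);

    // ... decode some tokens ...

    // Grow the context to 8192 tokens without recreating the context.
    // Only increases are accepted; a smaller value is rejected with an error log.
    llama_mod_n_ctx(ctx, 8192, cparams);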

src/llama-context.cpp

Lines changed: 7 additions & 3 deletions
@@ -420,7 +420,7 @@ void load_state(llama_context* ctx, const char* dump_file_path){
     }
 }

-void llama_context::mod_n_ctx(uint32_t new_n_ctx, llama_context_params params, const char* dump_file_path = "dump_state.bin"){
+void llama_context::mod_n_ctx(uint32_t new_n_ctx, llama_context_params params){
     // Allow only to increase the context size.
     if (cparams.n_ctx < new_n_ctx) {
         cparams.n_ctx = new_n_ctx;
@@ -429,10 +429,14 @@ void llama_context::mod_n_ctx(uint32_t new_n_ctx, llama_context_params params, c
             /*.type_v =*/ params.type_v,
         };

+        /*
         // Resets the memory and sets it to new memory params with modified cparams
         dump_state(this, dump_file_path); // Dump the state here.
         memory.reset(model.create_memory(params_mem, cparams));
         load_state(this, dump_file_path); // Load the state.
+        */
+
+        memory.get()->resize(new_n_ctx);
     }
     else{
         LLAMA_LOG_ERROR("%s: Cannot decrease the context size.", __func__);
@@ -2293,8 +2297,8 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
     return ctx->n_ctx();
 }

-void llama_mod_n_ctx(struct llama_context * ctx, uint32_t new_n_ctx, llama_context_params params, const char* dump_file_path){
-    ctx->mod_n_ctx(new_n_ctx, params, dump_file_path);
+void llama_mod_n_ctx(struct llama_context * ctx, uint32_t new_n_ctx, llama_context_params params){
+    ctx->mod_n_ctx(new_n_ctx, params);
 }

 uint32_t llama_n_batch(const llama_context * ctx) {
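Taken together, these hunks change the strategy inside mod_n_ctx: the earlier path that dumped the state, rebuilt the memory via model.create_memory(), and reloaded the state is now commented out, and the existing memory object is resized in place instead. A condensed sketch of the resulting control flow, reconstructed from the hunks above with the params_mem initialization abbreviated, is:

    void llama_context::mod_n_ctx(uint32_t new_n_ctx, llama_context_params params) {
        // Only growing the context is supported.
        if (cparams.n_ctx < new_n_ctx) {
            cparams.n_ctx = new_n_ctx;
            // ... build params_mem from params (type_k, type_v, ...) ...

            // Old path (disabled): dump_state(), memory.reset(model.create_memory(...)), load_state().
            // New path: resize the existing memory in place.
            memory.get()->resize(new_n_ctx);
        } else {
            LLAMA_LOG_ERROR("%s: Cannot decrease the context size.", __func__);
        }
    }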

src/llama-context.h

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ struct llama_context {

     ggml_context * get_ctx_compute() const;

-    void mod_n_ctx(uint32_t new_ctx, llama_context_params params, const char* dump_file_path);
+    void mod_n_ctx(uint32_t new_ctx, llama_context_params params);

     uint32_t n_ctx() const;
     uint32_t n_ctx_per_seq() const;

src/llama-kv-cache-unified.cpp

Lines changed: 13 additions & 0 deletions
@@ -704,6 +704,19 @@ uint32_t llama_kv_cache_unified::get_size() const {
     return cells.size();
 }

+// Resizing the cells vector so we can have dynamic ctx.
+bool llama_kv_cache_unified::resize(uint32_t new_n_ctx){
+    try{
+        assert(new_n_ctx > cells.size());
+        new_n_ctx = GGML_PAD(new_n_ctx, n_pad);
+        cells.resize(new_n_ctx);
+        return true;
+    }
+    catch (...){
+        return false;
+    }
+}
+
 bool llama_kv_cache_unified::get_has_shift() const {
     return cells.get_has_shift();
 }
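Note that the requested size is rounded up to a multiple of n_pad via GGML_PAD before the cells vector grows, so the cache keeps its padding invariant. For example, with a hypothetical padding of 32:

    // GGML_PAD(x, n) rounds x up to the next multiple of n, so the cells
    // vector is always resized to a padded count.
    uint32_t requested = 8190;
    uint32_t padded    = GGML_PAD(requested, 32);   // padded == 8192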

src/llama-kv-cache-unified.h

Lines changed: 2 additions & 1 deletion
@@ -88,7 +88,8 @@ class llama_kv_cache_unified : public llama_memory_i {
     //

     uint32_t get_size() const;
-
+    // Resizing the cells size to get dynamic context size at runtime.
+    bool resize(uint32_t);
     bool get_has_shift() const;

     //

src/llama-memory.h

Lines changed: 12 additions & 6 deletions
@@ -80,29 +80,35 @@ struct llama_memory_i {

     // getters
     virtual bool get_can_shift() const = 0;
-
+
     //
     // ops
     //
-
+
     // if data == true, the data buffers will also be cleared together with the metadata
     virtual void clear(bool data) = 0;
-
+
     virtual bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
     virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
     virtual void seq_keep(llama_seq_id seq_id) = 0;
     virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;
     virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
-
+
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
-
+
     //
     // state write/read
     //
-
+
     virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
     virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
+
+    // Dynamically modify the context files.
+    virtual bool resize(uint32_t) {
+        // Not implemented yet
+        return false;
+    };
 };

 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
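Because the base class only supplies this stub, memory implementations that do not override resize() report failure by returning false. A caller that wants to be defensive (a hypothetical sketch, not part of this commit) could therefore check the return value instead of assuming success:

    // Hypothetical defensive use of the new virtual.
    if (!memory.get()->resize(new_n_ctx)) {
        LLAMA_LOG_ERROR("%s: this memory type does not support resizing\n", __func__);
    }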
