Skip to content

Commit 27456b5

Browse files
committed
Dynamically modify the context without resetting the memory.
1 parent 1e7b16f commit 27456b5

File tree

4 files changed

+29
-1
lines changed

4 files changed

+29
-1
lines changed

src/llama-context.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,11 +439,13 @@ void llama_context::mod_n_ctx(uint32_t new_n_ctx, llama_context_params params, c
439439
/*.type_k =*/ params.type_k,
440440
/*.type_v =*/ params.type_v,
441441
};
442-
442+
/*
443443
// Resets the memory and sets it to new memory params with modified cparams
444444
dump_state(this, dump_file_path); // Dump the state here.
445445
memory.reset(model.create_memory(params_mem, cparams));
446446
load_state(this, dump_file_path); // Load the state.
447+
*/
448+
memory.get()->resize(new_n_ctx);
447449
}
448450
else{
449451
LLAMA_LOG_ERROR("%s: Cannot decrease the context size.", __func__);

src/llama-kv-cache-unified.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,23 @@ uint32_t llama_kv_cache_unified::get_n_stream() const {
10071007
return n_stream;
10081008
}
10091009

1010+
// Resizing the cells vector so we can have dynamic ctx.
1011+
// Not modifying n_stream at the moment
1012+
bool llama_kv_cache_unified::resize(uint32_t new_n_ctx){
1013+
try{
1014+
new_n_ctx = GGML_PAD(new_n_ctx, n_pad);
1015+
// v_cells.resize(n_stream);
1016+
for (uint32_t s = 0; s < n_stream; ++s) {
1017+
assert(new_n_ctx > v_cells[s].size());
1018+
v_cells[s].resize(new_n_ctx);
1019+
}
1020+
return true;
1021+
}
1022+
catch (...){
1023+
return false;
1024+
}
1025+
}
1026+
10101027
bool llama_kv_cache_unified::get_has_shift() const {
10111028
bool result = false;
10121029

src/llama-kv-cache-unified.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,9 @@ class llama_kv_cache_unified : public llama_memory_i {
146146
uint32_t get_size() const;
147147
uint32_t get_n_stream() const;
148148

149+
// Resize the cells to support a dynamic context size at runtime.
150+
bool resize(uint32_t);
151+
149152
bool get_has_shift() const;
150153

151154
//

src/llama-memory.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ struct llama_memory_i {
106106

107107
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
108108
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
109+
110+
// Dynamically resize the context (grow the memory's cell capacity) at runtime.
// Default implementation: resizing is unsupported, reports failure.
// Overridden by llama_kv_cache_unified, which performs the actual resize.
virtual bool resize(uint32_t) {
    // Implemented only for unified memory at the moment.
    return false;
}
109115
};
110116

111117
using llama_memory_ptr = std::unique_ptr<llama_memory_i>;

0 commit comments

Comments
 (0)