@@ -68,8 +68,6 @@ llama_kv_cache_unified::llama_kv_cache_unified(
 
     cells.resize(kv_size);
 
-    gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later
-
     for (uint32_t il = 0; il < n_layer_cache; il++) {
         if (filter && !filter(il)) {
             LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
@@ -471,6 +469,10 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 
     auto * sched = lctx->get_sched();
 
+    if (!gf_res || gf_res->get_max_nodes() != lctx->graph_max_nodes()) {
+        gf_res.reset(new llm_graph_result(lctx->graph_max_nodes()));
+    }
+
     if (do_shift) {
         if (!get_can_shift()) {
             GGML_ABORT("The current KV cache / model configuration does not support K-shift");
@@ -484,7 +486,6 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 
     auto * res = gf_res.get();
 
-    res->set_max_nodes(lctx->graph_max_nodes());
     res->reset();
 
     auto * gf = build_graph_shift(res, lctx);
@@ -531,7 +532,6 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 
     auto * res = gf_res.get();
 
-    res->set_max_nodes(lctx->graph_max_nodes());
     res->reset();
 
     auto * gf = build_graph_defrag(res, lctx, dinfo);
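For context, the change replaces an eager allocation with a hard-coded node budget (32768, later patched via `set_max_nodes()`) by a lazy (re)allocation in `update()` that tracks the context's actual `graph_max_nodes()`. A minimal sketch of the same pattern, where `GraphResult`, `Context`, and `Cache` are hypothetical stand-ins for `llm_graph_result`, `llama_context`, and the KV cache, not the real llama.cpp types:

```cpp
#include <cstdint>
#include <memory>

// Hypothetical stand-in for llm_graph_result: holds a fixed node budget.
struct GraphResult {
    explicit GraphResult(uint32_t max_nodes) : max_nodes(max_nodes) {}
    uint32_t get_max_nodes() const { return max_nodes; }
    void reset() { /* clear per-graph state, keep the allocation */ }
private:
    uint32_t max_nodes;
};

// Hypothetical stand-in for llama_context.
struct Context {
    uint32_t graph_max_nodes() const { return 8192; }
};

// Hypothetical stand-in for the KV cache object.
struct Cache {
    std::unique_ptr<GraphResult> gf_res; // allocated lazily, not in the constructor

    void update(const Context & ctx) {
        // (Re)allocate only when missing or when the node budget changed,
        // mirroring the check added in llama_kv_cache_unified::update().
        if (!gf_res || gf_res->get_max_nodes() != ctx.graph_max_nodes()) {
            gf_res = std::make_unique<GraphResult>(ctx.graph_max_nodes());
        }
        gf_res->reset(); // common path: reuse the existing allocation
        // ... build and execute the graph ...
    }
};

int main() {
    Cache cache;
    Context ctx;
    cache.update(ctx); // first call allocates gf_res
    cache.update(ctx); // same budget: reuses the existing gf_res
}
```

Allocating at the point of use removes both the placeholder 32768 allocation in the constructor and the per-call `set_max_nodes()` fixups before `reset()`; the guard keeps reallocation to the cases where the budget actually changes.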