@@ -83,7 +83,7 @@ class llm_graph_input_i {
 
     // return true if the resulting input tensors using the provided graph parameters would be
     // the same as the previous input tensors that we have currently stored in the object
-    virtual bool update(const llm_graph_params & params) {
+    virtual bool can_reuse(const llm_graph_params & params) {
         // returning false here by default will prevent reusing the graph if the check
         // for the input type has not been implemented yet
         GGML_UNUSED(params);
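For context, a subclass override of `can_reuse` typically just compares the shapes of its stored tensors against what the new parameters would produce. A minimal sketch of that pattern for an input like `llm_graph_input_pos` follows; the `params.ubatch.n_tokens` field is an assumption about the surrounding API, not something confirmed by this diff:

```cpp
// Hedged sketch: reuse is possible only if the new parameters would yield
// an input tensor of exactly the same shape as the one already stored.
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
    bool res = true;

    // pos is I32 [n_batch]; it only matches if the batch size is unchanged
    // (params.ubatch.n_tokens is an assumed field name)
    res &= pos->ne[0] == (int64_t) params.ubatch.n_tokens;

    return res;
}
```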
@@ -100,7 +100,7 @@ class llm_graph_input_embd : public llm_graph_input_i {
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;
 
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
@@ -113,7 +113,7 @@ class llm_graph_input_pos : public llm_graph_input_i {
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;
 
     ggml_tensor * pos = nullptr; // I32 [n_batch]
 
@@ -173,7 +173,7 @@ class llm_graph_input_out_ids : public llm_graph_input_i {
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
@@ -265,7 +265,7 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;
 
     ggml_tensor * get_k_idxs() const { return self_k_idxs; }
     ggml_tensor * get_v_idxs() const { return self_v_idxs; }
@@ -298,7 +298,7 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;
 
     ggml_tensor * get_k_idxs() const { return self_k_idxs; }
     ggml_tensor * get_v_idxs() const { return self_v_idxs; }
@@ -410,7 +410,7 @@ class llm_graph_result_i {
 
     virtual void set_inputs(const llama_ubatch * ubatch) = 0;
 
-    virtual bool update(const llm_graph_params & params) = 0;
+    virtual bool can_reuse(const llm_graph_params & params) = 0;
 };
 
 using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
@@ -504,20 +504,20 @@ class llm_graph_result : public llm_graph_result_i {
         }
     }
 
-    // try to update the existing graph result using the new graph parameters
+    // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
     // would be identical to the existing graph. in that case, we simply have to update the memory
     // contexts of the input tensors of the graph and we can reuse it for another computation
     // return true if the graph was updated and can be reused
-    bool update(const llm_graph_params & params) override {
+    bool can_reuse(const llm_graph_params & params) override {
         if (!this->params.is_same(params)) {
            return false;
        }
 
         bool res = true;
 
         for (auto & input : inputs) {
-            res &= input->update(params);
+            res &= input->can_reuse(params);
         }
 
         return res;
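On the consumer side, the intended pattern is then roughly the following sketch; `build_graph` is a hypothetical rebuild path and not part of this diff, while `can_reuse` and `set_inputs` come from the interface above:

```cpp
// Hedged sketch of the reuse flow: keep the previous llm_graph_result when
// can_reuse() confirms the new parameters would build an identical graph,
// otherwise rebuild, and in both cases repopulate the inputs for the ubatch.
llm_graph_result_ptr res = /* result from the previous computation */;

if (!res || !res->can_reuse(params)) {
    res = build_graph(params); // hypothetical rebuild path
}

res->set_inputs(ubatch); // (re)populate the input tensors either way
```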