@@ -144,7 +144,7 @@ class llm_graph_input_pos_bucket : public llm_graph_input_i {
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +158,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +177,8 @@ class llm_graph_input_out_ids : public llm_graph_input_i {
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +192,7 @@ class llm_graph_input_mean : public llm_graph_input_i {
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +204,7 @@ class llm_graph_input_cls : public llm_graph_input_i {
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -247,8 +247,8 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +278,11 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -318,8 +321,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
     ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
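
The note added in the `llm_graph_input_attn_kv_unified` hunk is the crux of the change: once a graph is reused across batches, its input objects outlive the `llm_graph_params` of the batch that built them, so a stored reference to `hparams`/`cparams` can end up pointing at freed memory. Below is a minimal sketch of that lifetime hazard; the names (`params_t`, `input_by_ref`, `input_by_copy`, `build_input`) are hypothetical and not the llama.cpp API.

```cpp
// Sketch only: hypothetical types illustrating reference vs. copy lifetime.
#include <cstdio>
#include <memory>

struct params_t { int n_ctx; };

// By reference: the member binds to whatever object the caller supplied.
struct input_by_ref  { const params_t & p; };
// By value: the input carries its own copy of the parameters.
struct input_by_copy { params_t p; };

std::unique_ptr<input_by_copy> build_input() {
    params_t params{4096}; // stack object, analogous to the per-batch llm_graph_params
    // The copy stored inside input_by_copy outlives this stack frame.
    return std::make_unique<input_by_copy>(input_by_copy{params});
    // An input_by_ref constructed the same way would keep a reference into this
    // frame; reading through it after the return is a stack-use-after-return.
}

int main() {
    auto inp = build_input();
    std::printf("n_ctx = %d\n", inp->p.n_ctx); // safe: prints 4096
}
```

Storing `hparams`/`cparams` by value, as the diff does, makes each graph input self-contained at the cost of a small per-input copy.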