@@ -246,31 +246,48 @@ void llama_context::init() {
     uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
     llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+    int n_splits_pp = -1;
+    int n_nodes_pp  = -1;
+
+    int n_splits_tg = -1;
+    int n_nodes_tg  = -1;
+
     // reserve pp graph first so that buffers are only allocated once
-    llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-    ggml_cgraph * gf_pp = build_graph(ubatch_pp, true);
-    if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
-        throw std::runtime_error("failed to allocate compute buffers");
+    {
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        auto res_pp = graph_build(ubatch_pp, true);
+        auto & gf_pp = res_pp.gf;
+        if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
+            throw std::runtime_error("failed to allocate compute buffers");
+        }
+
+        n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+        n_nodes_pp  = ggml_graph_n_nodes(gf_pp);
     }
-    int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
-    int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
 
     // reserve with tg graph to get the number of splits and nodes
-    llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-    ggml_cgraph * gf_tg = build_graph(ubatch_tg, true);
-    if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__);
-        throw std::runtime_error("failed to allocate compute buffers");
+    {
+        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        auto res_tg = graph_build(ubatch_tg, true);
+        auto & gf_tg = res_tg.gf;
+        if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__);
+            throw std::runtime_error("failed to allocate compute buffers");
+        }
+        n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+        n_nodes_tg  = ggml_graph_n_nodes(gf_tg);
     }
-    int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
-    int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
 
     // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-    gf_pp = build_graph(ubatch_pp, true);
-    if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
-        throw std::runtime_error("failed to allocate compute buffers");
+    {
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        auto res_pp = graph_build(ubatch_pp, true);
+        auto & gf_pp = res_pp.gf;
+        if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
+            throw std::runtime_error("failed to allocate compute buffers");
+        }
     }
 
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
@@ -890,7 +907,7 @@ void llama_context::build_cb(
     }
 }
 
-ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) {
+llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) {
     return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case);
 }
 
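Note: the definition of llama_graph_result does not appear in this excerpt; the hunks only reach into it through res.gf. As a minimal sketch, inferred purely from those usages and not taken from the PR itself (the real declaration may carry additional fields, e.g. handles to the graph's input tensors), the struct would at least wrap the built graph:

    // hypothetical sketch, not part of this diff
    struct llama_graph_result {
        ggml_cgraph * gf = nullptr; // compute graph produced by llama_context::graph_build()
    };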
@@ -1814,11 +1831,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
         llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
         llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
 
-        ggml_cgraph * gf = build_graph(ubatch, true);
+        auto res = graph_build(ubatch, true);
 
         // initialize scheduler with the worst-case graph
         ggml_backend_sched_reset(sched.get());
-        if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+        if (!ggml_backend_sched_reserve(sched.get(), res.gf)) {
             LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         }
 
@@ -1828,7 +1845,9 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-        ggml_cgraph * gf = build_graph(ubatch, false);
+        auto res = graph_build(ubatch, false);
+
+        auto & gf = res.gf;
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
@@ -2073,7 +2092,9 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) {
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-    ggml_cgraph * gf = build_graph(ubatch, false);
+    auto res = graph_build(ubatch, false);
+
+    auto & gf = res.gf;
 
     ggml_backend_sched_alloc_graph(sched.get(), gf);
 