@@ -256,7 +256,7 @@ void llama_context::init() {
256
256
{
257
257
llama_ubatch ubatch_pp = { true , n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr , nullptr , nullptr , nullptr , nullptr };
258
258
auto ctx = graph_init ();
259
- auto res_pp = graph_build (ctx, ubatch_pp, true );
259
+ auto res_pp = graph_build (ctx. get () , ubatch_pp, true );
260
260
auto & gf_pp = res_pp.gf ;
261
261
if (!ggml_backend_sched_reserve (sched.get (), gf_pp)) {
262
262
LLAMA_LOG_ERROR (" %s: failed to allocate compute pp buffers\n " , __func__);
@@ -271,7 +271,7 @@ void llama_context::init() {
271
271
{
272
272
llama_ubatch ubatch_tg = { true , 1 , 1 , n_seqs, &token, nullptr , nullptr , nullptr , nullptr , nullptr };
273
273
auto ctx = graph_init ();
274
- auto res_tg = graph_build (ctx, ubatch_tg, true );
274
+ auto res_tg = graph_build (ctx. get () , ubatch_tg, true );
275
275
auto & gf_tg = res_tg.gf ;
276
276
if (!ggml_backend_sched_reserve (sched.get (), gf_tg)) {
277
277
LLAMA_LOG_ERROR (" %s: failed to allocate compute tg buffers\n " , __func__);
@@ -285,7 +285,7 @@ void llama_context::init() {
285
285
{
286
286
llama_ubatch ubatch_pp = { true , n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr , nullptr , nullptr , nullptr , nullptr };
287
287
auto ctx = graph_init ();
288
- auto res_pp = graph_build (ctx, ubatch_pp, true );
288
+ auto res_pp = graph_build (ctx. get () , ubatch_pp, true );
289
289
auto & gf_pp = res_pp.gf ;
290
290
if (!ggml_backend_sched_reserve (sched.get (), gf_pp)) {
291
291
LLAMA_LOG_ERROR (" %s: failed to allocate compute pp buffers\n " , __func__);
@@ -573,7 +573,7 @@ ggml_context_ptr llama_context::graph_init() {
573
573
}
574
574
575
575
llama_graph_result llama_context::graph_build (
576
- ggml_context_ptr & ctx,
576
+ ggml_context * ctx,
577
577
const llama_ubatch & ubatch,
578
578
bool worst_case) {
579
579
return model.build_graph (ctx, *this , cparams, ubatch, worst_case);
@@ -1720,7 +1720,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) {
1720
1720
ggml_backend_sched_set_eval_callback (sched.get (), cparams.cb_eval , cparams.cb_eval_user_data );
1721
1721
1722
1722
auto ctx = graph_init ();
1723
- auto res = graph_build (ctx, ubatch, false );
1723
+ auto res = graph_build (ctx. get () , ubatch, false );
1724
1724
1725
1725
auto * gf = res.gf ;
1726
1726
@@ -2000,7 +2000,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
2000
2000
llama_ubatch ubatch = { true , n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr , nullptr , nullptr , nullptr , nullptr };
2001
2001
2002
2002
auto ctx = graph_init ();
2003
- auto res = graph_build (ctx, ubatch, true );
2003
+ auto res = graph_build (ctx. get () , ubatch, true );
2004
2004
2005
2005
// initialize scheduler with the worst-case graph
2006
2006
ggml_backend_sched_reset (sched.get ());
@@ -2015,7 +2015,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
2015
2015
ggml_backend_sched_set_eval_callback (sched.get (), cparams.cb_eval , cparams.cb_eval_user_data );
2016
2016
2017
2017
auto ctx = graph_init ();
2018
- auto res = graph_build (ctx, ubatch, false );
2018
+ auto res = graph_build (ctx. get () , ubatch, false );
2019
2019
2020
2020
auto * gf = res.gf ;
2021
2021
@@ -2483,11 +2483,10 @@ void llama_context_kv_self::kv_self_update() {
2483
2483
ggml_backend_sched_reset (sched.get ());
2484
2484
2485
2485
auto ctx = graph_init ();
2486
- auto * ctx0 = ctx.get ();
2487
2486
2488
- ggml_cgraph * gf = ggml_new_graph_custom (ctx0 , model.max_nodes (), false );
2487
+ ggml_cgraph * gf = ggml_new_graph_custom (ctx. get () , model.max_nodes (), false );
2489
2488
2490
- build_kv_self_shift (ctx0 , gf);
2489
+ build_kv_self_shift (ctx. get () , gf);
2491
2490
2492
2491
ggml_backend_sched_alloc_graph (sched.get (), gf);
2493
2492
@@ -2512,11 +2511,10 @@ void llama_context_kv_self::kv_self_update() {
2512
2511
ggml_backend_sched_reset (sched.get ());
2513
2512
2514
2513
auto ctx = graph_init ();
2515
- auto * ctx0 = ctx.get ();
2516
2514
2517
- ggml_cgraph * gf = ggml_new_graph_custom (ctx0 , model.max_nodes (), false );
2515
+ ggml_cgraph * gf = ggml_new_graph_custom (ctx. get () , model.max_nodes (), false );
2518
2516
2519
- build_kv_self_defrag (ctx0 , gf);
2517
+ build_kv_self_defrag (ctx. get () , gf);
2520
2518
2521
2519
ggml_backend_sched_alloc_graph (sched.get (), gf);
2522
2520
0 commit comments