@@ -7629,30 +7629,6 @@ static int llama_decode_impl(
             return -3;
         }
 
-        // reserve a worst case graph if needed
-        // TODO: extract to a function
-        if (lctx.need_reserve) {
-            const auto & cparams = lctx.cparams;
-            const auto & model   = lctx.model;
-
-            // build worst-case graph
-            uint32_t n_seqs   = 1; // TODO: worst-case number of sequences
-            uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-            llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-            llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr };
-
-            ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
-
-            // initialize scheduler with the worst-case graph
-            ggml_backend_sched_reset(lctx.sched.get());
-            if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
-                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-            }
-
-            lctx.need_reserve = false;
-        }
-
         ggml_backend_sched_reset(lctx.sched.get());
         ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
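Both hunks of this commit delete the same inline block, and the in-code TODO ("extract to a function") hints at where it was headed. As a minimal sketch only, assuming a hypothetical helper name llama_reserve_worst_case_graph that is not part of this commit, the deleted logic could be factored out like this, using only the identifiers that appear in the removed lines:

// hypothetical helper (not part of this commit): the inline worst-case graph
// reservation from llama_decode_impl, wrapped as the TODO suggested
static void llama_reserve_worst_case_graph(llama_context & lctx) {
    const auto & cparams = lctx.cparams;
    const auto & model   = lctx.model;

    // build worst-case graph
    uint32_t n_seqs   = 1; // TODO: worst-case number of sequences
    uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

    // the token is not actually used by llama_build_graph, but it is required
    // to choose between the token and embedding input graphs
    llama_token token = model.vocab.token_bos();
    llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr };

    ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);

    // initialize the scheduler with the worst-case graph
    ggml_backend_sched_reset(lctx.sched.get());
    if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
    }

    lctx.need_reserve = false;
}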
@@ -7889,30 +7865,8 @@ static int llama_encode_impl(
 
     // batch_manager->prepare(ubatch);
 
-    // reserve a worst case graph if needed
-    // TODO: extract to a function
-    if (lctx.need_reserve) {
-        // TODO: extract to a function
-        const auto & cparams = lctx.cparams;
-        const auto & model   = lctx.model;
-
-        // build worst-case graph
-        uint32_t n_seqs   = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr };
-
-        ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
-
-        // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(lctx.sched.get());
-        if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-        }
-
-        lctx.need_reserve = false;
-    }
+    // TODO: do reserve
+    GGML_ASSERT(lctx.need_reserve == false);
 
     ggml_backend_sched_reset(lctx.sched.get());
     ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
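Note the asymmetry between the two call sites after this change: llama_decode_impl drops the inline reservation entirely (presumably it is handled elsewhere, outside this hunk), while llama_encode_impl keeps only a placeholder TODO and guards the path with GGML_ASSERT(lctx.need_reserve == false), so a pending reservation request on the encode path now trips the assert instead of being serviced inline.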