@@ -3784,7 +3784,7 @@ static size_t llama_model_max_nodes(const llama_model & /*model*/) {
     //    return 32768;
     //}
 
-    return 8192;
+    return 65536;
 }
 
 struct llama_model_loader {
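
Raising the node cap from 8192 to 65536 leaves headroom for a warmup graph that activates every expert of a large MoE model at once. Below is a minimal standalone sketch of how this cap feeds graph allocation; `ggml_tensor_overhead`, `ggml_graph_overhead_custom`, and `ggml_new_graph_custom` are the real ggml.h API, while the buffer-sizing arithmetic and the program around them are illustrative:

```cpp
#include "ggml.h"

// Illustrative only: shows how the node cap sizes a compute graph.
// 65536 matches the new llama_model_max_nodes() return value.
int main(void) {
    const size_t max_nodes = 65536;

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*max_nodes
                        + ggml_graph_overhead_custom(max_nodes, false),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,  // metadata only, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    // Graph metadata sized for up to max_nodes operations.
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, max_nodes, false);
    (void) gf;

    ggml_free(ctx);
    return 0;
}
```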
@@ -8879,7 +8879,8 @@ struct llm_build_context {
         llama_context  & lctx,
     const llama_batch  & batch,
     const llm_build_cb & cb,
-                  bool   worst_case) :
+                  bool   worst_case,
+                  bool   warmup) :
         model            (lctx.model),
         lctx             (lctx),
         hparams          (model.hparams),
@@ -8897,7 +8898,7 @@ struct llm_build_context {
         n_embd_head_v    (hparams.n_embd_head_v),
         n_embd_v_gqa     (hparams.n_embd_v_gqa()),
         n_expert         (hparams.n_expert),
-        n_expert_used    (hparams.n_expert_used),
+        n_expert_used    (warmup ? hparams.n_expert : hparams.n_expert_used),
         freq_base        (cparams.rope_freq_base),
         freq_scale       (cparams.rope_freq_scale),
         ext_factor       (cparams.yarn_ext_factor),
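
When the new `warmup` flag is set, the constructor substitutes `hparams.n_expert` for `hparams.n_expert_used`, so the routed top-k keeps every expert during the warmup pass and all expert weights participate in the graph (and so get faulted in under mmap). A hedged sketch of the routing step this value controls, simplified from the MoE FFN build path; the function, tensor names, and shapes here are assumptions, while `ggml_mul_mat`, `ggml_soft_max`, and `ggml_top_k` are real ggml calls:

```cpp
#include "ggml.h"

// Simplified sketch of MoE routing (names/shapes assumed, not the exact
// llm_build_moe_ffn code). When warmup makes n_expert_used == n_expert,
// ggml_top_k keeps every expert, so every expert's weights enter the graph.
static struct ggml_tensor * route_experts(
        struct ggml_context * ctx,
        struct ggml_tensor  * gate_inp,      // router weights
        struct ggml_tensor  * cur,           // [n_embd, n_tokens] activations
        int                   n_expert_used) {
    struct ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
    struct ggml_tensor * probs  = ggml_soft_max(ctx, logits);
    return ggml_top_k(ctx, probs, n_expert_used); // indices of experts to run
}
```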
@@ -14433,7 +14434,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false);
+    struct llm_build_context llm(lctx, dummy, cb, false, false);
 
     llm.init();
 
@@ -14450,7 +14451,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false);
+    struct llm_build_context llm(lctx, dummy, cb, false, false);
 
     llm.init();
 
@@ -14467,7 +14468,7 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false);
+    struct llm_build_context llm(lctx, dummy, cb, false, false);
 
     llm.init();
 
@@ -14517,7 +14518,11 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_cgraph * result = NULL;
 
-    struct llm_build_context llm(lctx, batch, cb, worst_case);
+    const llama_vocab * vocab = llama_get_vocab(&lctx);
+    llama_token bos = llama_token_bos_impl(*vocab);
+    llama_token eos = llama_token_eos_impl(*vocab);
+    bool is_warming_up = (batch.n_tokens == 1 && batch.token[0] == bos);
+    struct llm_build_context llm(lctx, batch, cb, worst_case, is_warming_up);
 
     llm.init();
 
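
The heuristic treats a batch holding exactly one token, the BOS token, as a warmup run (`eos` is looked up but not consulted by this check). A sketch of the caller side under that assumption; `llama_token_bos`, `llama_batch_get_one`, `llama_decode`, `llama_kv_cache_clear`, and `llama_synchronize` are the public llama.h API, while `warmup_all_experts` is a hypothetical wrapper:

```cpp
#include "llama.h"

// warmup_all_experts is a hypothetical wrapper; the llama.h calls are real.
static void warmup_all_experts(struct llama_context * ctx, const struct llama_model * model) {
    llama_token bos = llama_token_bos(model);

    // A batch of exactly one BOS token trips is_warming_up in
    // llama_build_graph, so the graph is built with all experts active.
    llama_decode(ctx, llama_batch_get_one(&bos, 1, 0, 0));

    llama_kv_cache_clear(ctx);  // discard KV entries from the warmup pass
    llama_synchronize(ctx);
}
```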