
Commit 93e66a6

Revert "Load all MoE experts during warmup and make warmup 1 token (ikawrakow#198)"
1 parent a1f07f1 commit 93e66a6

File tree

common/common.cpp
examples/llama-bench/llama-bench.cpp
src/llama.cpp

3 files changed, 9 insertions(+), 18 deletions(-)

common/common.cpp
Lines changed: 2 additions & 4 deletions

@@ -2630,10 +2630,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (bos != -1) {
             tmp.push_back(bos);
         }
-        else
-        {
-            tmp.push_back(eos);
-        }
+        tmp.push_back(eos);
+
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
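
Note (not part of the commit): a minimal, self-contained sketch of the warmup-prompt construction this hunk reverts to. The -1 sentinel for a missing BOS token comes from the hunk above; make_warmup_tokens and the token values in main are hypothetical, for illustration only.

// sketch: with ikawrakow#198 the warmup prompt was a single token; after the
// revert it is BOS (when present) followed by EOS again
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token = int32_t;

static std::vector<llama_token> make_warmup_tokens(llama_token bos, llama_token eos) {
    std::vector<llama_token> tmp;
    if (bos != -1) {            // some models do not have a BOS token
        tmp.push_back(bos);
    }
    tmp.push_back(eos);         // restored by this revert: EOS is always appended
    return tmp;
}

int main() {
    const auto tmp = make_warmup_tokens(/*bos=*/1, /*eos=*/2);
    std::printf("warmup prompt length: %zu\n", tmp.size());   // 2 after the revert
    return 0;
}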

examples/llama-bench/llama-bench.cpp
Lines changed: 1 addition & 2 deletions

@@ -1920,8 +1920,7 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, 1, 0, t.n_batch, t.n_threads);
-            // test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
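
Note (not part of the commit): with this hunk the warmup prompt run processes the full t.n_prompt tokens again instead of a single token. The sketch below only illustrates the batched chunking such a prompt run amounts to; warmup_prompt_sketch is a hypothetical stand-in and not llama-bench's test_prompt.

// sketch: feed n_prompt tokens in n_batch-sized chunks
#include <algorithm>
#include <cstdio>

static void warmup_prompt_sketch(int n_prompt, int n_batch) {
    for (int i = 0; i < n_prompt; i += n_batch) {
        const int n_tokens = std::min(n_batch, n_prompt - i);
        // a real warmup would hand n_tokens to the model here (e.g. a decode call)
        std::printf("chunk of %d tokens at offset %d\n", n_tokens, i);
    }
}

int main() {
    warmup_prompt_sketch(/*n_prompt=*/512, /*n_batch=*/128);   // prints four chunks
    return 0;
}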

src/llama.cpp
Lines changed: 6 additions & 12 deletions

@@ -4471,7 +4471,6 @@ using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
 
 static size_t llama_model_max_nodes(const llama_model & model) {
     return std::max<size_t>(8192, model.tensors_by_name.size()*5);
-
 }
 
 struct llama_model_loader {
@@ -10231,8 +10230,7 @@ struct llm_build_context {
         llama_context & lctx,
         const llama_ubatch & batch,
         const llm_build_cb & cb,
-        bool worst_case,
-        bool warmup) :
+        bool worst_case) :
         model (lctx.model),
         lctx (lctx),
         hparams (model.hparams),
@@ -10250,7 +10248,7 @@
         n_embd_head_v (hparams.n_embd_head_v),
         n_embd_v_gqa (hparams.n_embd_v_gqa()),
         n_expert (hparams.n_expert),
-        n_expert_used (warmup ? hparams.n_expert : hparams.n_expert_used),
+        n_expert_used (hparams.n_expert_used),
         freq_base (cparams.rope_freq_base),
         freq_scale (cparams.rope_freq_scale),
         ext_factor (cparams.yarn_ext_factor),
@@ -16054,7 +16052,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false, false);
+    struct llm_build_context llm(lctx, dummy, cb, false);
 
     llm.init();
 
@@ -16071,7 +16069,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false, false);
+    struct llm_build_context llm(lctx, dummy, cb, false);
 
     llm.init();
 
@@ -16088,7 +16086,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
 
     // llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    // struct llm_build_context llm(lctx, dummy, cb, false, false);
+    // struct llm_build_context llm(lctx, dummy, cb, false);
 
     // llm.init();
 
@@ -16142,11 +16140,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_cgraph * result = NULL;
 
-    const llama_vocab * vocab = llama_get_vocab(&lctx);
-    llama_token bos = llama_token_bos_impl(*vocab);
-    llama_token eos = llama_token_eos_impl(*vocab);
-    bool is_warming_up = (batch.n_tokens == 1 && batch.token[0] == bos);
-    struct llm_build_context llm(lctx, batch, cb, worst_case, is_warming_up);
+    struct llm_build_context llm(lctx, batch, cb, worst_case);
 
     llm.init();
 
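
Note (not part of the commit): the substantive change being reverted is the warmup-dependent expert count in llm_build_context (the warmup ? hparams.n_expert : hparams.n_expert_used line above). A minimal, self-contained illustration of the two behaviours, with placeholder hparams values:

// sketch: expert selection with and without the reverted warmup flag
#include <cstdint>
#include <cstdio>

struct hparams_sketch {          // stand-in for llama_hparams; values are examples
    uint32_t n_expert      = 8;
    uint32_t n_expert_used = 2;
};

// behaviour added by ikawrakow#198 (now reverted): use all experts during warmup
static uint32_t n_expert_used_with_198(const hparams_sketch & hp, bool warmup) {
    return warmup ? hp.n_expert : hp.n_expert_used;
}

// behaviour restored by this revert: always the configured number of active experts
static uint32_t n_expert_used_after_revert(const hparams_sketch & hp) {
    return hp.n_expert_used;
}

int main() {
    hparams_sketch hp;
    std::printf("warmup with #198:    %u experts\n", (unsigned) n_expert_used_with_198(hp, true)); // 8
    std::printf("warmup after revert: %u experts\n", (unsigned) n_expert_used_after_revert(hp));   // 2
    return 0;
}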
