common/common.cpp (6 changes: 4 additions & 2 deletions)

@@ -2169,8 +2169,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (bos != -1) {
             tmp.push_back(bos);
         }
-        tmp.push_back(eos);
-
+        else
+        {
+            tmp.push_back(eos);
+        }
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
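
With this change the warmup batch contains exactly one token: BOS when the model defines one, otherwise EOS (some models, e.g. T5-style encoder-decoders, have no BOS). A minimal standalone sketch of the selection logic; make_warmup_tokens is a hypothetical helper, not code from the PR.

#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Hypothetical helper mirroring the branch added above;
// -1 is the "model has no BOS token" sentinel.
static std::vector<llama_token> make_warmup_tokens(llama_token bos, llama_token eos) {
    std::vector<llama_token> tmp;
    if (bos != -1) {
        tmp.push_back(bos);  // models with a BOS token warm up on BOS alone
    } else {
        tmp.push_back(eos);  // BOS-less models fall back to EOS
    }
    return tmp;
}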
examples/llama-bench/llama-bench.cpp (2 changes: 1 addition & 1 deletion)

@@ -1586,7 +1586,7 @@ int main(int argc, char ** argv) {
     if (params.warmup) {
         if (t.n_prompt > 0) {
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, 1, 0, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             test_gen(ctx, 1, 0, t.n_threads);
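
Warming up on a single prompt token instead of the full t.n_prompt is sufficient here, because the src/llama.cpp changes below make that one token activate every expert. For context, a hedged sketch of a one-token warmup decode, using the same four-argument llama_batch_get_one signature that appears in the common.cpp hunk; warmup_one_token is an illustrative name and error handling is omitted.

#include "llama.h"

// Illustrative helper: decode one token purely to force weight loading
// and buffer allocation, then drop the state it produced.
static void warmup_one_token(llama_context * ctx, llama_token tok) {
    llama_decode(ctx, llama_batch_get_one(&tok, 1, 0, 0));  // pos 0, seq 0
    llama_kv_cache_clear(ctx);  // assumption: warmup output is discarded
}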
src/llama.cpp (19 changes: 12 additions & 7 deletions)

@@ -3784,7 +3784,7 @@ static size_t llama_model_max_nodes(const llama_model & /*model*/) {
     //    return 32768;
     //}
 
-    return 8192;
+    return 65536;
 }
 
 struct llama_model_loader {

@@ -8879,7 +8879,8 @@ struct llm_build_context {
         llama_context  & lctx,
         const llama_batch & batch,
         const llm_build_cb & cb,
-        bool worst_case) :
+        bool worst_case,
+        bool warmup) :
         model            (lctx.model),
         lctx             (lctx),
         hparams          (model.hparams),

@@ -8897,7 +8898,7 @@ struct llm_build_context {
         n_embd_head_v    (hparams.n_embd_head_v),
         n_embd_v_gqa     (hparams.n_embd_v_gqa()),
         n_expert         (hparams.n_expert),
-        n_expert_used    (hparams.n_expert_used),
+        n_expert_used    (warmup ? hparams.n_expert : hparams.n_expert_used),
         freq_base        (cparams.rope_freq_base),
         freq_scale       (cparams.rope_freq_scale),
         ext_factor       (cparams.yarn_ext_factor),
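
The n_expert_used override is the core of the PR: during warmup the build context reports n_expert (all experts) instead of n_expert_used, so the MoE routing selects every expert and the single warmup token touches every expert's tensors, forcing them to be loaded or allocated up front. That also explains the first hunk: a graph that activates all experts needs more nodes, hence llama_model_max_nodes growing from 8192 to 65536. Below is a sketch of the effect on the top-k routing step; select_experts is an illustrative wrapper around ggml_top_k, not the exact llama.cpp MoE code.

#include "ggml.h"

// Illustrative only: widening the k of the top-k routing to n_expert
// makes the warmup graph pull in every expert's weights.
static struct ggml_tensor * select_experts(
        struct ggml_context * ctx,
        struct ggml_tensor  * probs,   // per-token routing probabilities
        int n_expert, int n_expert_used, bool warmup) {
    const int k = warmup ? n_expert : n_expert_used;  // all experts during warmup
    return ggml_top_k(ctx, probs, k);                 // ids of the k selected experts
}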
@@ -14433,7 +14434,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false);
+    struct llm_build_context llm(lctx, dummy, cb, false, false);
 
     llm.init();

@@ -14450,7 +14451,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false);
+    struct llm_build_context llm(lctx, dummy, cb, false, false);
 
     llm.init();

@@ -14467,7 +14468,7 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
 
     llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
 
-    struct llm_build_context llm(lctx, dummy, cb, false);
+    struct llm_build_context llm(lctx, dummy, cb, false, false);
 
     llm.init();

@@ -14517,7 +14518,11 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_cgraph * result = NULL;
 
-    struct llm_build_context llm(lctx, batch, cb, worst_case);
+    const llama_vocab * vocab = llama_get_vocab(&lctx);
+    llama_token bos = llama_token_bos_impl(*vocab);
+    llama_token eos = llama_token_eos_impl(*vocab);
+    bool is_warming_up = (batch.n_tokens == 1 && batch.token[0] == bos);
+    struct llm_build_context llm(lctx, batch, cb, worst_case, is_warming_up);
 
     llm.init();
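
End to end the pieces line up: common.cpp (or llama-bench) submits a one-token warmup batch, llama_build_graph recognizes that batch shape, and the build context widens n_expert_used accordingly. Detection is a heuristic: a batch consisting of exactly one BOS token is treated as warmup (the eos lookup next to it presumably anticipates models that warm up on EOS instead). A compact sketch of the check, assuming the same internal llama_token_bos_impl helper the diff uses:

// Sketch of the warmup heuristic added to llama_build_graph.
static bool detect_warmup(const llama_batch & batch, const llama_vocab & vocab) {
    const llama_token bos = llama_token_bos_impl(vocab);
    // a batch of exactly one BOS token is assumed to be the warmup run
    return batch.n_tokens == 1 && batch.token[0] == bos;
}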