Merged

Changes from 1 commit
src/llama-context.cpp (1 addition, 1 deletion)
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 6u*model.n_tensors());

[inline review comment, Member Author]: We should probably bump this up to 8u*model.n_tensors() just to be safe.

 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
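As an aside on the inline comment above: a minimal, hypothetical sketch of what the suggested bump might look like, written here as a free function so it compiles on its own (the real method lives on llama_context and uses model.n_tensors()); the 1024u floor is simply carried over from the new line in this diff.

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical sketch of the reviewer's suggestion, not part of this PR:
// same shape as graph_max_nodes() above, with the multiplier raised to 8.
static uint32_t graph_max_nodes_suggested(uint32_t n_tensors) {
    // keep a fixed floor so tiny models still get a reasonably sized graph,
    // and scale with the tensor count for larger ones
    return std::max<uint32_t>(1024u, 8u*n_tensors);
}
```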
src/llama-graph.cpp (5 additions, 2 deletions)
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    // to avoid potentially a large number of add nodes during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
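A rough way to see why the loop above is pinned to hparams.n_expert_used: if, as the referenced PR implies, a warm-up pass can build the graph with the local n_expert_used raised toward the total expert count, then aggregating with one ggml_add per iteration makes the graph grow with the expert count rather than with the configured top-k. The sketch below only does that arithmetic; the expert and layer counts are assumptions, not values taken from the PR.

```cpp
#include <cstdint>
#include <cstdio>

// Back-of-the-envelope sketch with hypothetical numbers: how many ggml_add
// nodes the expert-aggregation loop contributes, depending on whether it
// iterates over the warm-up expert count or the configured top-k.
int main() {
    const uint32_t n_expert      = 128; // total experts a warm-up graph might cover (assumed)
    const uint32_t n_expert_used = 8;   // hparams.n_expert_used, the configured top-k (assumed)
    const uint32_t n_moe_layers  = 48;  // number of MoE layers in the model (assumed)

    // the loop emits one ggml_add node per expert after the first, per layer
    const uint32_t adds_all_experts = (n_expert      - 1) * n_moe_layers; // 6096
    const uint32_t adds_topk        = (n_expert_used - 1) * n_moe_layers; //  336

    std::printf("add nodes, loop over all experts   : %u\n", (unsigned) adds_all_experts);
    std::printf("add nodes, loop over hparams top-k : %u\n", (unsigned) adds_topk);
    return 0;
}
```

With the loop bounded by hparams.n_expert_used, the warm-up graph stays close in size to a normal inference graph, which is presumably also why the node budget in llama-context.cpp can be kept to a modest multiple of model.n_tensors().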