Add support for SmallThinker model series #14898
@@ -938,6 +938,100 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
    return moe_out;
}

ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
(Review thread on this line)

Reviewer: The code duplication is unfortunate, is it possible to merge this into the existing build_moe_ffn? Can be a follow-up.

Author: That's a great point. I've been thinking about the best way to merge these and have a couple of ideas on how we could approach it. Both approaches seem feasible. Given the complexity and your suggestion that this can be a follow-up, would you prefer I handle this in a separate PR, or should I proceed with one of these solutions here?

Reviewer: A separate PR is probably best.

Author: Okay, I'll open a follow-up issue later to track this so we don't forget. Thanks! Edit: I've created issue #14920 to track this.

(One possible shape of such a merge is sketched after the diff below.)
        ggml_tensor * cur,
        ggml_tensor * probs,
        ggml_tensor * up_exps,
        ggml_tensor * gate_exps,
        ggml_tensor * down_exps,
        ggml_tensor * exp_probs_b,
        int64_t n_expert,
        int64_t n_expert_used,
        llama_expert_gating_func_type gating_op,
        int il) const {
    const int64_t n_embd   = cur->ne[0];
    const int64_t n_tokens = cur->ne[1];

    // add experts selection bias - introduced in DeepSeek V3
    // leave probs unbiased as it's later used to get expert weights
    ggml_tensor * selection_probs = probs;
    if (exp_probs_b != nullptr) {
        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
        cb(selection_probs, "ffn_moe_probs_biased", il);
    }

    // select experts
    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
    cb(selected_experts->src[0], "ffn_moe_argsort", il);
    cb(selected_experts, "ffn_moe_topk", il);

    ggml_tensor * weights = ggml_get_rows(ctx0,
            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
    cb(weights, "ffn_moe_weights", il);

    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
        weights = ggml_soft_max(ctx0, weights);
    } else {
        weights = ggml_sigmoid(ctx0, weights);
        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
        cb(weights_sum, "ffn_moe_weights_sum", il);

        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
        cb(weights, "ffn_moe_weights_norm", il);
    }

    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);

    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
    cb(up, "ffn_moe_up", il);

    ggml_tensor * experts = nullptr;
    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
    cb(cur, "ffn_moe_gate", il);

    cur = ggml_reglu_split(ctx0, cur, up);
    cb(cur, "ffn_moe_reglu", il);

    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
    cb(experts, "ffn_moe_down", il);

    experts = ggml_mul(ctx0, experts, weights);
    cb(experts, "ffn_moe_weighted", il);

    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };

    assert(n_expert_used > 0);

    // order the views before the adds
    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);

        ggml_build_forward_expand(gf, cur_experts[i]);
    }

    // aggregate experts
    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
    // to avoid potentially a large number of add nodes during warmup
    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
    ggml_tensor * moe_out = cur_experts[0];

    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
    }

    if (n_expert_used == 1) {
        // avoid returning a non-contiguous tensor
        moe_out = ggml_cont(ctx0, moe_out);
    }

    cb(moe_out, "ffn_moe_out", il);

    return moe_out;
}

// input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
    const int64_t n_embd = hparams.n_embd;
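
For readers less familiar with the ggml graph API, the following is a minimal CPU-side sketch, not part of the PR, of the routing arithmetic that build_moe_ffn_from_probs encodes for a single token: pick the top-k experts from precomputed router probabilities, then turn the selected scores into mixing weights either with a softmax or with a sigmoid followed by division by the per-token sum. Plain arrays stand in for ggml tensors, the optional expert-selection bias is omitted, and all names are illustrative.

// routing_sketch.cpp - illustrative only; mirrors the top-k + softmax/sigmoid
// weighting performed by the graph above, for a single token on the CPU
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// pick the n_used highest-probability experts and compute their mixing weights
static void route_token(const std::vector<float> & probs, int n_used, bool softmax_gating,
                        std::vector<int> & idx, std::vector<float> & w) {
    idx.resize(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    // partial sort so the first n_used indices are the top-k experts (the ggml_top_k step)
    std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });
    idx.resize(n_used);

    w.resize(n_used);
    float sum = 0.0f;
    if (softmax_gating) {
        // softmax gating branch: softmax over the selected scores
        const float mx = probs[idx[0]];
        for (int i = 0; i < n_used; ++i) {
            w[i] = std::exp(probs[idx[i]] - mx);
            sum += w[i];
        }
    } else {
        // sigmoid gating branch: sigmoid, then divide by the per-token sum of the weights
        for (int i = 0; i < n_used; ++i) {
            w[i] = 1.0f / (1.0f + std::exp(-probs[idx[i]]));
            sum += w[i];
        }
    }
    for (float & v : w) {
        v /= sum;
    }
}

int main() {
    const std::vector<float> probs = {0.10f, 1.20f, -0.30f, 0.80f}; // router output for one token
    std::vector<int> idx;
    std::vector<float> w;
    route_token(probs, /*n_used=*/2, /*softmax_gating=*/false, idx, w);
    for (size_t i = 0; i < idx.size(); ++i) {
        std::printf("expert %d, weight %.3f\n", idx[i], w[i]);
    }
    return 0;
}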
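
Following up on the review thread above about folding build_moe_ffn_from_probs into build_moe_ffn: one possible shape for such a merge, sketched here purely as a hypothetical illustration and not as either of the approaches actually proposed in the thread, is to have the gate-logits entry point compute router probabilities first and then delegate to a single probs-based body. The snippet below shows that pattern with plain vectors standing in for ggml tensors; the names route_from_probs and route_from_logits are made up for this example.

// merge_sketch.cpp - hypothetical refactoring pattern only, not code from this PR
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// shared body, analogous to build_moe_ffn_from_probs: route on precomputed probabilities
static int route_from_probs(const std::vector<float> & probs) {
    return (int) (std::max_element(probs.begin(), probs.end()) - probs.begin());
}

// general entry point, analogous to build_moe_ffn: turn router logits into probabilities
// (softmax), then reuse the shared probs-based body instead of duplicating it
static int route_from_logits(const std::vector<float> & logits) {
    const float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - mx);
        sum += probs[i];
    }
    for (float & p : probs) {
        p /= sum;
    }
    return route_from_probs(probs);
}

int main() {
    const std::vector<float> logits = {0.2f, 1.5f, -0.7f, 0.9f};
    std::printf("selected expert: %d\n", route_from_logits(logits));
    return 0;
}

The actual merge, tracked in issue #14920, would of course operate on ggml tensors inside llm_graph_context rather than on plain vectors.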