Commit 6201b43

Update the graph.
1 parent 02ff085 commit 6201b43

1 file changed: +113 −33 lines changed

src/llama-model.cpp

Lines changed: 113 additions & 33 deletions
@@ -13736,6 +13736,11 @@ struct llm_build_arcee : public llm_graph_context {
 
 struct llm_build_smollm3 : public llm_graph_context {
     llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        // collect layers for which RoPE is disabled (metadata key: "smollm3.no_rope_layers")
         std::vector<int32_t> no_rope_layers;
         if (arch == LLM_ARCH_SMOLLM3) {
             const int kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
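Note: the first hunk stops right after the gguf_find_key lookup; the lines that actually populate no_rope_layers fall between the two hunks and are not shown in this diff. For orientation only, reading an int32 array out of GGUF metadata with the public gguf API generally looks like the sketch below. This is a hedged illustration, not the commit's code; the helper name read_no_rope_layers is made up, and only gguf_find_key and the metadata key come from the visible context.

    #include <cstdint>
    #include <vector>

    #include "gguf.h" // gguf_* declarations (older ggml versions expose them via ggml.h)

    // Hypothetical helper (not part of this commit): collect the layer indices
    // stored under an int32 GGUF array key such as "smollm3.no_rope_layers".
    static std::vector<int32_t> read_no_rope_layers(const gguf_context * meta) {
        std::vector<int32_t> layers;

        const int64_t kid = gguf_find_key(meta, "smollm3.no_rope_layers");
        if (kid >= 0 && gguf_get_arr_type(meta, kid) == GGUF_TYPE_INT32) {
            const int32_t * data = (const int32_t *) gguf_get_arr_data(meta, kid);
            const size_t    n    = gguf_get_arr_n(meta, kid);
            layers.assign(data, data + n);
        }

        return layers;
    }

Layers listed in this array then skip the ggml_rope_ext calls in the second hunk below.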
@@ -13747,59 +13752,134 @@ struct llm_build_smollm3 : public llm_graph_context {
             }
         }
 
-        const int64_t n_tokens = params.n_tokens;
-        const int64_t n_layer = hparams.n_layer;
+        // token embeddings
+        ggml_tensor * inpL = build_inp_embd(model.tok_embd);
 
-        gf->n_threads = params.n_threads;
+        // positional ids
+        ggml_tensor * inp_pos = build_inp_pos();
 
-        // build the graph
-        inp_tokens->set_input(ubatch);
-        inp_pos->set_input(ubatch);
-        inp_attn_temp->set_input(ubatch);
+        // attention helper (unified KV cache)
+        auto * inp_attn = build_attn_inp_kv_unified();
 
-        struct ggml_tensor * cur = build_inp_embd();
-        struct ggml_tensor * lay_out = nullptr;
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * cur = nullptr;
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inp_norm = build_norm(cur, hparams.f_norm_eps, il, tn(LLM_TENSOR_ATTN_NORM, il));
-            struct ggml_tensor * qkv = build_attn(inp_norm, il);
-            struct ggml_tensor * q = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, 0, 0, 0);
-            struct ggml_tensor * k = ggml_view_4d(ctx, qkv, hparams.n_embd_head_k, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_k, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
-            struct ggml_tensor * v = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il) + ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
+            ggml_tensor * inpSA = inpL;
+
+            // attention norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // ---- self-attention ----
+            {
+                // fused QKV projection
+                ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(qkv, "wqkv", il);
+                if (model.layers[il].bqkv) {
+                    qkv = ggml_add(ctx0, qkv, model.layers[il].bqkv);
+                    cb(qkv, "bqkv", il);
+                }
+
+                const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
 
-            ggml_set_name(q, "q");
-            ggml_set_name(k, "k");
-            ggml_set_name(v, "v");
+                ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 0));
+                ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd)));
+                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));
 
-            struct ggml_tensor * qcur = q;
-            struct ggml_tensor * kcur = k;
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            bool apply_rope = true;
-            if (arch == LLM_ARCH_SMOLLM3) {
-                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) != no_rope_layers.end()) {
-                    apply_rope = false;
+                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) == no_rope_layers.end()) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+                    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+                    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
                 }
-            }
 
-            if (apply_rope && get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il))) {
-                qcur = ggml_rope_ext(ctx, q, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
-                kcur = ggml_rope_ext(ctx, k, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
             }
 
-            struct ggml_tensor * attn_out = build_attn_out(inp_norm, qcur, kcur, v, il);
+            // skip padded tokens for final layer
+            if (il == n_layer - 1) {
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
 
+            // ---- feed-forward ----
             if (hparams.use_par_res) {
                 // parallel residual
-                lay_out = ggml_add(ctx, attn_out, build_ff_par(inp_norm, il));
+                ggml_tensor * ffn_cur = build_norm(inpL,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(ffn_cur, "ffn_norm", il);
+
+                ffn_cur = build_ffn(ffn_cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_cur);
+                cb(cur, "par_res", il);
             } else {
                 // sequential residual
-                lay_out = ggml_add(ctx, cur, attn_out);
-                lay_out = build_ff_seq(lay_out, il);
+                ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cb(cur, "ffn_out", il);
             }
-            cur = lay_out;
+
+            // post-processing
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
         }
 
-        build_output(cur, lay_out);
+        // final RMSNorm
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
     }
 };
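In the new graph, Q, K and V are carved out of one fused wqkv projection with ggml_view_2d: rows [0, n_embd) are Q, the next n_embd_gqa rows are K, and the last n_embd_gqa rows are V, which is why the byte offsets are 0, sizeof(float)*n_embd and sizeof(float)*(n_embd + n_embd_gqa). The standalone sketch below reproduces just that slicing; the sizes 2048/512/4 are hypothetical placeholders, not SmolLM3's real dimensions, and only the view arithmetic mirrors the commit.

    #include <cstdio>

    #include "ggml.h"

    // Standalone illustration (hypothetical sizes): split a fused QKV tensor of
    // shape [n_embd + 2*n_embd_gqa, n_tokens] into Q/K/V views, mirroring the
    // ggml_view_2d offsets used above.
    int main() {
        const int64_t n_embd     = 2048; // hypothetical hidden size
        const int64_t n_embd_gqa = 512;  // hypothetical grouped K/V width
        const int64_t n_tokens   = 4;

        ggml_init_params ip = { /*mem_size =*/ 16u*1024*1024, /*mem_buffer =*/ nullptr, /*no_alloc =*/ false };
        ggml_context * ctx = ggml_init(ip);

        ggml_tensor * qkv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd + 2*n_embd_gqa, n_tokens);

        // rows [0, n_embd)                 -> Q
        // rows [n_embd, n_embd+n_embd_gqa) -> K
        // rows [n_embd+n_embd_gqa, end)    -> V
        ggml_tensor * Qcur = ggml_view_2d(ctx, qkv, n_embd,     n_tokens, qkv->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_2d(ctx, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd));
        ggml_tensor * Vcur = ggml_view_2d(ctx, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa));

        printf("Q: %lld x %lld, K: %lld x %lld, V: %lld x %lld\n",
               (long long) Qcur->ne[0], (long long) Qcur->ne[1],
               (long long) Kcur->ne[0], (long long) Kcur->ne[1],
               (long long) Vcur->ne[0], (long long) Vcur->ne[1]);

        ggml_free(ctx);
        return 0;
    }

The attention scale in the commit follows the usual convention: 1/sqrt(n_embd_head) unless hparams.f_attention_scale is set to a nonzero override.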
