@@ -13736,6 +13736,11 @@ struct llm_build_arcee : public llm_graph_context {
 
 struct llm_build_smollm3 : public llm_graph_context {
     llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
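+        // SmolLM3 assumes matching K/V head dims, with RoPE rotating the full head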
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        // collect layers for which RoPE is disabled (metadata key: "smollm3.no_rope_layers")
         std::vector<int32_t> no_rope_layers;
         if (arch == LLM_ARCH_SMOLLM3) {
             const int kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
@@ -13747,59 +13752,134 @@ struct llm_build_smollm3 : public llm_graph_context {
1374713752 }
1374813753 }
1374913754
13750- const int64_t n_tokens = params.n_tokens;
13751- const int64_t n_layer = hparams.n_layer ;
13755+ // token embeddings
13756+ ggml_tensor * inpL = build_inp_embd(model.tok_embd) ;
1375213757
13753- gf->n_threads = params.n_threads;
13758+ // positional ids
13759+ ggml_tensor * inp_pos = build_inp_pos();
1375413760
13755- // build the graph
13756- inp_tokens->set_input(ubatch);
13757- inp_pos->set_input(ubatch);
13758- inp_attn_temp->set_input(ubatch);
13761+ // attention helper (unified KV cache)
13762+ auto * inp_attn = build_attn_inp_kv_unified();
 
-        struct ggml_tensor * cur = build_inp_embd();
-        struct ggml_tensor * lay_out = nullptr;
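+        // KQ scale defaults to 1/sqrt(n_embd_head) unless the model sets f_attention_scale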
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * cur = nullptr;
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inp_norm = build_norm(cur, hparams.f_norm_eps, il, tn(LLM_TENSOR_ATTN_NORM, il));
-            struct ggml_tensor * qkv = build_attn(inp_norm, il);
-            struct ggml_tensor * q = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, 0, 0, 0);
-            struct ggml_tensor * k = ggml_view_4d(ctx, qkv, hparams.n_embd_head_k, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_k, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
-            struct ggml_tensor * v = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il) + ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
+            ggml_tensor * inpSA = inpL;
+
+            // attention norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // ---- self-attention ----
+            {
+                // fused QKV projection
+                ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(qkv, "wqkv", il);
+                if (model.layers[il].bqkv) {
+                    qkv = ggml_add(ctx0, qkv, model.layers[il].bqkv);
+                    cb(qkv, "bqkv", il);
+                }
+
+                const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
 
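+                // each qkv row is laid out as [Q: n_embd | K: n_embd_gqa | V: n_embd_gqa]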
-            ggml_set_name(q, "q");
-            ggml_set_name(k, "k");
-            ggml_set_name(v, "v");
+                ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd,     n_tokens, qkv->nb[1], 0));
+                ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd)));
+                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));
 
-            struct ggml_tensor * qcur = q;
-            struct ggml_tensor * kcur = k;
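+                // reshape to [n_embd_head, n_head(_kv), n_tokens] so each head sits on its own dim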
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            bool apply_rope = true;
-            if (arch == LLM_ARCH_SMOLLM3) {
-                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) != no_rope_layers.end()) {
-                    apply_rope = false;
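+                // NoPE layers (listed in no_rope_layers) leave Q/K unrotated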
+                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) == no_rope_layers.end()) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+                    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
+                    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow);
                 }
-            }
 
-            if (apply_rope && get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il))) {
-                qcur = ggml_rope_ext(ctx, q, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
-                kcur = ggml_rope_ext(ctx, k, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
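+                // attend over the unified KV cache; the result is projected through wo (+ bo if present)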
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
             }
 
-            struct ggml_tensor * attn_out = build_attn_out(inp_norm, qcur, kcur, v, il);
+            // on the final layer, skip computing outputs for tokens that are not needed
+            if (il == n_layer - 1) {
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
 
+            // ---- feed-forward ----
             if (hparams.use_par_res) {
                 // parallel residual
-                lay_out = ggml_add(ctx, attn_out, build_ff_par(inp_norm, il));
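+                // the FFN branch is computed from the layer input (inpL) and summed with the attention output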
+                ggml_tensor * ffn_cur = build_norm(inpL,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(ffn_cur, "ffn_norm", il);
+
+                ffn_cur = build_ffn(ffn_cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_cur);
+                cb(cur, "par_res", il);
             } else {
                 // sequential residual
-                lay_out = ggml_add(ctx, cur, attn_out);
-                lay_out = build_ff_seq(lay_out, il);
+                ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cb(cur, "ffn_out", il);
             }
-            cur = lay_out;
+
+            // apply the per-layer control vector (cvec), if one is loaded
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
         }
 
-        build_output(cur, lay_out);
+        // final RMSNorm
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
     }
 };
 