@@ -13734,6 +13734,75 @@ struct llm_build_arcee : public llm_graph_context {
     }
 };
 
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        // layers listed under "smollm3.no_rope_layers" skip rotary position embeddings (NoPE layers)
+        std::vector<int32_t> no_rope_layers;
+        if (arch == LLM_ARCH_SMOLLM3) {
+            const int64_t kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
+            if (kid != -1) {
+                GGML_ASSERT(gguf_get_arr_type(model.meta, kid) == GGUF_TYPE_INT32);
+                const size_t n = gguf_get_arr_n(model.meta, kid);
+                no_rope_layers.resize(n);
+                // gguf_get_arr_data returns a pointer to the array payload - copy it into the local vector
+                memcpy(no_rope_layers.data(), gguf_get_arr_data(model.meta, kid), n * sizeof(int32_t));
+            }
+        }
+
+        const int64_t n_tokens = params.n_tokens;
+        const int64_t n_layer  = hparams.n_layer;
+
+        gf->n_threads = params.n_threads;
+
+        // build the graph
+        inp_tokens->set_input(ubatch);
+        inp_pos->set_input(ubatch);
+        inp_attn_temp->set_input(ubatch);
+
+        struct ggml_tensor * cur     = build_inp_embd();
+        struct ggml_tensor * lay_out = nullptr;
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inp_norm = build_norm(cur, hparams.f_norm_eps, il, tn(LLM_TENSOR_ATTN_NORM, il));
+
+            // fused QKV projection, assumed laid out as [n_embd_q + 2*n_embd_kv, n_tokens] with Q first, then K, then V
+            struct ggml_tensor * qkv = build_attn(inp_norm, il);
+            const size_t es = ggml_element_size(qkv);
+
+            struct ggml_tensor * q = ggml_view_4d(ctx, qkv, hparams.n_embd_head_k, hparams.n_head(il),    n_tokens, 1,
+                    es*hparams.n_embd_head_k, qkv->nb[1], qkv->nb[1]*n_tokens, 0);
+            struct ggml_tensor * k = ggml_view_4d(ctx, qkv, hparams.n_embd_head_k, hparams.n_head_kv(il), n_tokens, 1,
+                    es*hparams.n_embd_head_k, qkv->nb[1], qkv->nb[1]*n_tokens, es*hparams.n_embd_head_k*hparams.n_head(il));
+            struct ggml_tensor * v = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head_kv(il), n_tokens, 1,
+                    es*hparams.n_embd_head_v, qkv->nb[1], qkv->nb[1]*n_tokens, es*(hparams.n_embd_head_k*hparams.n_head(il) + hparams.n_embd_k_gqa(il)));
+
+            ggml_set_name(q, "q");
+            ggml_set_name(k, "k");
+            ggml_set_name(v, "v");
+
+            struct ggml_tensor * qcur = q;
+            struct ggml_tensor * kcur = k;
+
+            bool apply_rope = true;
+            if (arch == LLM_ARCH_SMOLLM3) {
+                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) != no_rope_layers.end()) {
+                    apply_rope = false;
+                }
+            }
+
+            if (apply_rope) {
+                // optional per-layer RoPE frequency factors; may be NULL if the model does not provide them
+                struct ggml_tensor * rope_factors = get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il));
+                // args: n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                qcur = ggml_rope_ext(ctx, q, inp_pos->pos, rope_factors, hparams.n_rot, hparams.rope_type, hparams.n_ctx_orig_yarn,
+                        hparams.rope_freq_base_train, hparams.rope_freq_scale_train, 0.0f, 1.0f, 32.0f, 1.0f);
+                kcur = ggml_rope_ext(ctx, k, inp_pos->pos, rope_factors, hparams.n_rot, hparams.rope_type, hparams.n_ctx_orig_yarn,
+                        hparams.rope_freq_base_train, hparams.rope_freq_scale_train, 0.0f, 1.0f, 32.0f, 1.0f);
+            }
+
+            struct ggml_tensor * attn_out = build_attn_out(inp_norm, qcur, kcur, v, il);
+
+            if (hparams.use_par_res) {
+                // parallel residual
+                lay_out = ggml_add(ctx, attn_out, build_ff_par(inp_norm, il));
+            } else {
+                // sequential residual
+                lay_out = ggml_add(ctx, cur, attn_out);
+                lay_out = build_ff_seq(lay_out, il);
+            }
+            cur = lay_out;
+        }
+
+        build_output(cur, lay_out);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
@@ -14085,6 +14154,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params, gf);
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14235,9 +14308,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
             return LLAMA_ROPE_TYPE_NORM;
 
+
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:
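
For reference, a minimal standalone sketch of reading an int32 array key such as "smollm3.no_rope_layers" with the public gguf API, mirroring what the builder above does through model.meta. This is a sketch under assumptions: the file name is hypothetical, the key is assumed to be stored as a GGUF int32 array, and on older ggml trees the gguf declarations live in ggml.h rather than gguf.h.

    #include "gguf.h"   // assumption: recent ggml exposes the gguf API here (older trees: "ggml.h")
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        // hypothetical model path - replace with a real SmolLM3 GGUF file
        const char * fname = "smollm3.gguf";

        struct gguf_init_params iparams = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        struct gguf_context * meta = gguf_init_from_file(fname, iparams);
        if (!meta) {
            fprintf(stderr, "failed to open %s\n", fname);
            return 1;
        }

        std::vector<int32_t> no_rope_layers;

        const int64_t kid = gguf_find_key(meta, "smollm3.no_rope_layers");
        if (kid != -1 && gguf_get_arr_type(meta, kid) == GGUF_TYPE_INT32) {
            const size_t n = gguf_get_arr_n(meta, kid);
            no_rope_layers.resize(n);
            // gguf_get_arr_data returns a pointer into the loaded metadata - copy it out
            memcpy(no_rope_layers.data(), gguf_get_arr_data(meta, kid), n * sizeof(int32_t));
        }

        for (int32_t il : no_rope_layers) {
            printf("layer %d uses no RoPE\n", il);
        }

        gguf_free(meta);
        return 0;
    }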
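
A similar sketch for ggml_rope_ext itself, showing the full argument order used in the builder (n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow). The tensor shapes and hyperparameters here are made up for illustration; mode 0 corresponds to the "normal" rotation (LLAMA_ROPE_TYPE_NORM), the NULL fourth argument means no per-layer frequency factors, and the ggml-cpu.h include assumes a recent ggml tree where ggml_graph_compute_with_ctx is declared there.

    #include "ggml.h"
    #include "ggml-cpu.h"   // assumption: ggml_graph_compute_with_ctx is declared here in recent ggml
    #include <cstdint>

    int main() {
        const int head_dim = 64, n_head = 4, n_tokens = 8;

        struct ggml_init_params iparams = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ nullptr, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(iparams);

        // activations laid out as [head_dim, n_head, n_tokens], positions as int32 [n_tokens]
        struct ggml_tensor * x   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);
        struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
        for (int i = 0; i < n_tokens; ++i) {
            ((int32_t *) pos->data)[i] = i;
        }
        // (x is left unfilled here; a real caller fills it with the Q or K projection)

        // mode 0 = "normal" RoPE; n_ctx_orig = 0 and default YaRN parameters disable YaRN scaling
        struct ggml_tensor * y = ggml_rope_ext(ctx, x, pos, /*freq_factors =*/ nullptr,
                /*n_dims =*/ head_dim, /*mode =*/ 0, /*n_ctx_orig =*/ 0,
                /*freq_base =*/ 10000.0f, /*freq_scale =*/ 1.0f,
                /*ext_factor =*/ 0.0f, /*attn_factor =*/ 1.0f, /*beta_fast =*/ 32.0f, /*beta_slow =*/ 1.0f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
        return 0;
    }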