@@ -452,10 +452,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         return;
     }

-    if (arch == LLM_ARCH_SMOLLM3) {
-        ml.get_key("no_rope_layer_interval", hparams.no_rope_layer_interval);
-    }
-
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
@@ -1565,6 +1561,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 36: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }

@@ -4528,6 +4533,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -14962,6 +14996,7 @@ llm_graph_result_ptr llama_model::build_graph(
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
         case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_SMOLLM3:
             {
                 llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
             } break;