
Commit 024bd29

Init - first pass.
1 parent e434e69 commit 024bd29

6 files changed: 133 additions & 9 deletions

convert_hf_to_gguf.py

Lines changed: 10 additions & 0 deletions
@@ -6298,6 +6298,16 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
+@Model.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        if self.model.config.no_rope_layers is not None:
+            self.gguf_writer.add_array("smollm3.no_rope_layers", self.model.config.no_rope_layers, gguf.GGUFValueType.INT32)
+
 ###### CONVERSION LOGIC ######

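The converter change above stores the layer indices that should skip RoPE as a custom `smollm3.no_rope_layers` array in the GGUF metadata. As a quick way to confirm the key ends up in a converted file, here is a minimal sketch using gguf-py's `GGUFReader`; the file path is hypothetical and the `parts`/`data` access pattern assumes the current gguf-py `ReaderField` layout:

```python
# Minimal sketch: inspect the custom metadata key written by the converter.
# Assumptions: ./smollm3.gguf is a hypothetical converted model, and array
# values are exposed via ReaderField.parts indexed by ReaderField.data.
from gguf import GGUFReader

reader = GGUFReader("./smollm3.gguf")
field = reader.get_field("smollm3.no_rope_layers")

if field is None:
    print("smollm3.no_rope_layers not present (every layer would use RoPE)")
else:
    values = [int(field.parts[idx][0]) for idx in field.data]
    print("layers that skip RoPE:", values)
```

The same key is what the graph builder added in `src/llama-model.cpp` (last diff below) reads back at load time.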
docs/development/HOWTO-add-model.md

Lines changed: 11 additions & 9 deletions
@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
 
 ### 2. Define the model architecture in `llama.cpp`
 
-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
 
 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
 
 ### 3. Build the GGML graph implementation
 
-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
 
 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.

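The updated checklist above is exactly what the remaining diffs in this commit implement. While wiring up the C++ side, it can help to dump the tensor names from a converted GGUF and compare them against the `LLM_TENSOR_NAMES` entries, since the NOTE requires names ending in `.weight` or `.bias`. A minimal sketch (the file path is hypothetical):

```python
# Minimal sketch: list tensor names in a converted file so they can be
# compared against the LLM_TENSOR_NAMES table added in src/llama-arch.cpp.
# ./smollm3.gguf is a hypothetical path to a converted SmolLM3 model.
from gguf import GGUFReader

reader = GGUFReader("./smollm3.gguf")
for tensor in reader.tensors:
    # e.g. "blk.0.attn_q.weight", its shape and quantization type
    print(tensor.name, list(tensor.shape), tensor.tensor_type)
```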
gguf-py/gguf/constants.py

Lines changed: 17 additions & 0 deletions
@@ -346,6 +346,7 @@ class MODEL_ARCH(IntEnum):
     BAILINGMOE = auto()
     DOTS1      = auto()
     ARCEE      = auto()
+    SMOLLM3    = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -629,6 +630,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
     MODEL_ARCH.DOTS1:      "dots1",
     MODEL_ARCH.ARCEE:      "arcee",
+    MODEL_ARCH.SMOLLM3:    "smollm3",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -2101,6 +2103,21 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.SMOLLM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

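Registering `SMOLLM3` in `MODEL_ARCH` and listing its tensors in `MODEL_TENSORS` is what lets the converter map Hugging Face tensor names onto GGUF names. A small sketch of that mapping, assuming a gguf-py checkout that already contains the entries above (the block count of 36 is just an example value):

```python
# Sketch: how the MODEL_ARCH / MODEL_TENSORS registration above drives the
# HF -> GGUF tensor-name mapping used by convert_hf_to_gguf.py.
import gguf

# 36 is an example block count, not taken from a real config
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.SMOLLM3, 36)

hf_name = "model.layers.0.self_attn.q_proj.weight"
# with try_suffixes, the matched suffix is carried over onto the mapped name
gguf_name = tensor_map.get_name(hf_name, try_suffixes=(".weight", ".bias"))
print(gguf_name)  # should print something like: blk.0.attn_q.weight
```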
src/llama-arch.cpp

Lines changed: 19 additions & 0 deletions
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE,  "bailingmoe"  },
     { LLM_ARCH_DOTS1,       "dots1"       },
     { LLM_ARCH_ARCEE,       "arcee"       },
+    { LLM_ARCH_SMOLLM3,     "smollm3"     },
     { LLM_ARCH_UNKNOWN,     "(unknown)"   },
 };
 
@@ -1625,6 +1626,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd.weight" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm.weight" },
+            { LLM_TENSOR_OUTPUT,        "output.weight" },
+            { LLM_TENSOR_ROPE_FREQS,    "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm.weight" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q.weight" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k.weight" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v.weight" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output.weight" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate.weight" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down.weight" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up.weight" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {

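The per-layer entries in the `LLM_TENSOR_NAMES` table above are format strings; the `%d` placeholder is filled with the block index when tensors are looked up. A tiny illustration of the naming scheme, using a few names from the table (the real expansion happens in C++):

```python
# Illustrate how the per-block format strings expand into concrete tensor
# names; the list below is a subset of the LLM_TENSOR_NAMES entries above.
per_block = [
    "blk.%d.attn_norm.weight",
    "blk.%d.attn_q.weight",
    "blk.%d.ffn_gate.weight",
]

for il in range(2):  # first two blocks, as an example
    for fmt in per_block:
        print(fmt % il)
```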
src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
+    LLM_ARCH_SMOLLM3,
     LLM_ARCH_UNKNOWN,
 };

src/llama-model.cpp

Lines changed: 75 additions & 0 deletions
@@ -13734,6 +13734,75 @@ struct llm_build_arcee : public llm_graph_context {
     }
 };
 
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        std::vector<int32_t> no_rope_layers;
+        if (arch == LLM_ARCH_SMOLLM3) {
+            const int kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
+            if (kid != -1) {
+                const uint32_t n = gguf_get_arr_n(model.meta, kid);
+                no_rope_layers.resize(n);
+                const int nb = gguf_get_arr_data(model.meta, kid, no_rope_layers.data(), n * sizeof(int32_t));
+                GGML_ASSERT(nb == int(n * sizeof(int32_t)));
+            }
+        }
+
+        const int64_t n_tokens = params.n_tokens;
+        const int64_t n_layer  = hparams.n_layer;
+
+        gf->n_threads = params.n_threads;
+
+        // build the graph
+        inp_tokens->set_input(ubatch);
+        inp_pos->set_input(ubatch);
+        inp_attn_temp->set_input(ubatch);
+
+        struct ggml_tensor * cur = build_inp_embd();
+        struct ggml_tensor * lay_out = nullptr;
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inp_norm = build_norm(cur, hparams.f_norm_eps, il, tn(LLM_TENSOR_ATTN_NORM, il));
+            struct ggml_tensor * qkv = build_attn(inp_norm, il);
+            struct ggml_tensor * q = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, 0, 0, 0);
+            struct ggml_tensor * k = ggml_view_4d(ctx, qkv, hparams.n_embd_head_k, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_k, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
+            struct ggml_tensor * v = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il) + ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
+
+            ggml_set_name(q, "q");
+            ggml_set_name(k, "k");
+            ggml_set_name(v, "v");
+
+            struct ggml_tensor * qcur = q;
+            struct ggml_tensor * kcur = k;
+
+            bool apply_rope = true;
+            if (arch == LLM_ARCH_SMOLLM3) {
+                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) != no_rope_layers.end()) {
+                    apply_rope = false;
+                }
+            }
+
+            if (apply_rope && get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il))) {
+                qcur = ggml_rope_ext(ctx, q, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
+                kcur = ggml_rope_ext(ctx, k, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
+            }
+
+            struct ggml_tensor * attn_out = build_attn_out(inp_norm, qcur, kcur, v, il);
+
+            if (hparams.use_par_res) {
+                // parallel residual
+                lay_out = ggml_add(ctx, attn_out, build_ff_par(inp_norm, il));
+            } else {
+                // sequential residual
+                lay_out = ggml_add(ctx, cur, attn_out);
+                lay_out = build_ff_seq(lay_out, il);
+            }
+            cur = lay_out;
+        }
+
+        build_output(cur, lay_out);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
@@ -14085,6 +14154,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params, gf);
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14235,9 +14308,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
             return LLAMA_ROPE_TYPE_NORM;
 
+
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:

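The SmolLM3-specific behaviour in `llm_build_smollm3` above is the per-layer RoPE decision: a layer applies RoPE unless its index appears in the `smollm3.no_rope_layers` array read from the GGUF metadata. A minimal sketch of that selection logic (the layer count and indices are example values, not taken from a real model):

```python
# Mirror of the per-layer check in llm_build_smollm3: RoPE is skipped for
# any layer whose index is listed in smollm3.no_rope_layers.
n_layer = 36                     # example layer count
no_rope_layers = [3, 7, 11, 15]  # example indices, as stored in the GGUF

for il in range(n_layer):
    apply_rope = il not in no_rope_layers
    if not apply_rope:
        print(f"layer {il}: NoPE layer, skip RoPE")
```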