Commit 7b9aa7b

Merge branch 'master' into add-fh1-rebased

2 parents: 710630a + 699f439

14 files changed: +297 -72 lines

common/arg.cpp

Lines changed: 7 additions & 0 deletions

@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
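For context: the new --api-prefix option stores a user-supplied prefix in common_params (see common/common.h below) and can also be set through the LLAMA_ARG_API_PREFIX environment variable. Below is a minimal sketch of how such a prefix might be applied when building endpoint paths; apply_api_prefix is a hypothetical helper for illustration, not code from this commit.

    #include <string>

    // Hypothetical helper (not part of this commit): prepend the configured
    // prefix, which per the help text carries no trailing slash, e.g. "/llama".
    static std::string apply_api_prefix(const std::string & api_prefix, const std::string & route) {
        return api_prefix + route; // "/llama" + "/v1/models" -> "/llama/v1/models"
    }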

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -370,6 +370,7 @@ struct common_params {
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions

@@ -6813,6 +6813,9 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
 ###### CONVERSION LOGIC ######
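Note that SmolLM3Model subclasses LlamaModel, so the converter reuses the existing Llama tensor-mapping and vocabulary logic; the registration only changes the architecture written to the GGUF metadata to gguf.MODEL_ARCH.SMOLLM3.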

docs/development/HOWTO-add-model.md

Lines changed: 11 additions & 9 deletions

@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
 
 ### 2. Define the model architecture in `llama.cpp`
 
-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
 
 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
 
 ### 3. Build the GGML graph implementation
 
-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
 
 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
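To make step 3 of the updated guide concrete, here is a minimal skeleton of the pattern it describes: a builder struct deriving from llm_graph_context, instantiated from llama_model::build_graph. The struct name and constructor signature are illustrative placeholders, not code from this commit.

    // Illustrative skeleton; see llm_build_llama in src/llama-model.cpp for a
    // complete, real implementation.
    struct llm_build_mymodel : public llm_graph_context {
        llm_build_mymodel(const llama_model & model, const llm_graph_params & params)
            : llm_graph_context(params) {
            // build the ggml compute graph here: token embeddings, then each
            // transformer block (attention + FFN), then the output norm and head
        }
    };

    // ...and in llama_model::build_graph, dispatch on the new architecture:
    // case LLM_ARCH_MYMODEL:
    //     llm = std::make_unique<llm_build_mymodel>(*this, params);
    //     break;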

ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp

Lines changed: 7 additions & 9 deletions

@@ -14,21 +14,19 @@ void main() {
 
     const uint row_dst = gl_GlobalInvocationID.x;
 
-    if (i0 >= p.n_dims) {
-        const uint i = row_dst*ne0 + i0;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;
 
     const uint idst = row_dst*ne0 + i0/2;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
 
+    if (i0 >= p.n_dims) {
+        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
+        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
+
+        return;
+    }
+
     const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
     const int sec_w = p.sections[1] + p.sections[0];
     const uint sector = (i0 / 2) % sect_dims;
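The same relocation is applied to rope_neox.comp and rope_norm.comp below. The early-out for i0 >= p.n_dims (the unrotated tail of a row) previously copied through data_a using the destination's contiguous indexing, which is only correct when the source happens to share that layout; it now reads through the strided source index ix. A host-side C++ sketch of the index math, with names mirroring rope_multi/rope_neox (an illustration under the assumption of a contiguous destination and a source with row/channel strides s1 and s2; rope_norm is analogous without the halved base offset):

    #include <cstdint>

    struct rope_idx { uint32_t dst, src; };

    // Index math for the pass-through case (i0 >= n_dims) after this change.
    rope_idx pass_through_indices(uint32_t row_dst, uint32_t ne0, uint32_t ne1,
                                  uint32_t s1, uint32_t s2, uint32_t i0) {
        const uint32_t row_x     = row_dst % ne1;
        const uint32_t channel_x = row_dst / ne1;

        const uint32_t idst = row_dst*ne0 + i0/2;             // dst rows are packed
        const uint32_t ix   = channel_x*s2 + row_x*s1 + i0/2; // src honors strides

        // the shader writes data_d[dst] from data_a[src]; the old code used
        // row_dst*ne0 + i0 for both, i.e. it assumed a contiguous source
        return { idst + i0/2, ix + i0/2 };
    }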

ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp

Lines changed: 7 additions & 9 deletions

@@ -13,21 +13,19 @@ void main() {
 
     const uint row_dst = gl_GlobalInvocationID.x;
 
-    if (i0 >= p.n_dims) {
-        const uint i = row_dst*ne0 + i0;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;
 
     const uint idst = row_dst*ne0 + i0/2;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
 
+    if (i0 >= p.n_dims) {
+        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
+        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
+
+        return;
+    }
+
     const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
 
     const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;

ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp

Lines changed: 7 additions & 9 deletions

@@ -13,21 +13,19 @@ void main() {
 
     const uint row_dst = gl_GlobalInvocationID.x;
 
-    if (i0 >= p.n_dims) {
-        const uint i = row_dst*ne0 + i0;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;
 
     const uint idst = row_dst*ne0 + i0;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;
 
+    if (i0 >= p.n_dims) {
+        data_d[idst + 0] = data_a[ix + 0];
+        data_d[idst + 1] = data_a[ix + 1];
+
+        return;
+    }
+
     const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
 
     const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;

gguf-py/gguf/constants.py

Lines changed: 18 additions & 0 deletions

@@ -360,6 +360,7 @@ class MODEL_ARCH(IntEnum):
     ARCEE = auto()
     ERNIE4_5 = auto()
     HUNYUAN_MOE = auto()
+    SMOLLM3 = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -665,6 +666,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ERNIE4_5: "ernie4_5",
     MODEL_ARCH.FALCON_H1: "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
+    MODEL_ARCH.SMOLLM3: "smollm3",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -2271,6 +2273,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.SMOLLM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

src/llama-arch.cpp

Lines changed: 18 additions & 0 deletions

@@ -80,6 +80,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCEE,       "arcee"       },
     { LLM_ARCH_ERNIE4_5,    "ernie4_5"    },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_SMOLLM3,     "smollm3"     },
     { LLM_ARCH_UNKNOWN,     "(unknown)"   },
 };
 
@@ -1749,6 +1750,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
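The %d in the per-layer templates above is expanded with the block index, and tensor names then take a .weight or .bias suffix per the GGUF convention quoted in the HOWTO diff. A self-contained sketch of that expansion; tensor_name is a hypothetical helper, not this commit's code:

    #include <cstdio>
    #include <string>

    // Hypothetical helper: expand a per-layer template such as "blk.%d.attn_q"
    // with the block index and append a suffix.
    static std::string tensor_name(const char * templ, int layer, const char * suffix) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), templ, layer);
        return std::string(buf) + "." + suffix;
    }

    // tensor_name("blk.%d.attn_q", 0, "weight") -> "blk.0.attn_q.weight"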

src/llama-arch.h

Lines changed: 1 addition & 0 deletions

@@ -84,6 +84,7 @@ enum llm_arch {
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_SMOLLM3,
     LLM_ARCH_UNKNOWN,
 };
