
Commit 1f816de

talk-llama : sync llama.cpp
1 parent c4ea72b commit 1f816de

24 files changed: +1456 additions, -430 deletions

examples/talk-llama/llama-arch.cpp
Lines changed: 72 additions & 0 deletions

@@ -42,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_GEMMA2, "gemma2" },
     { LLM_ARCH_GEMMA3, "gemma3" },
+    { LLM_ARCH_GEMMA3N, "gemma3n" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -75,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
+    { LLM_ARCH_ERNIE4_5, "ernie4_5" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -932,6 +934,42 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3N,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+            { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
+            { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
+            { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
+            { LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" },
+            { LLM_TENSOR_ALTUP_PROJ, "altup_proj" },
+            { LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" },
+            { LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" },
+            { LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" },
+            { LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" },
+            { LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" },
+            { LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" },
+            { LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" },
+            { LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" },
+            { LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" },
+            { LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" },
+            { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1621,6 +1659,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         }
     },
+    {
+        LLM_ARCH_ERNIE4_5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1749,6 +1804,23 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    // altup / laurel (gemma 3n)
+    {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_PER_LAYER_MODEL_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_PER_LAYER_PROJ_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_ALTUP_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ALTUP_UNEMBD_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_PER_LAYER_INP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_PER_LAYER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_PER_LAYER_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ALTUP_CORRECT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ALTUP_CORRECT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ALTUP_PREDICT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ALTUP_ROUTER, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ALTUP_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_LAUREL_L, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_LAUREL_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
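
Note: the new GEMMA3N and ERNIE4_5 entries follow the existing convention of this table: per-layer tensors carry a "%d" placeholder that is filled in with the block index when a weight is looked up, while model-wide tensors (e.g. "token_embd", "altup_proj") have no placeholder. A minimal sketch of that expansion, independent of llama.cpp's own loader helpers (the function name here is illustrative, not part of the library):

// Illustrative only: expand an LLM_TENSOR_NAMES-style format string for one block.
#include <cstdio>
#include <string>

static std::string tensor_name(const char * fmt, int block_index) {
    char buf[256];
    // extra arguments are ignored when the format string has no "%d"
    std::snprintf(buf, sizeof(buf), fmt, block_index);
    return buf;
}

int main() {
    std::printf("%s\n", tensor_name("blk.%d.altup_router", 7).c_str());   // prints: blk.7.altup_router
    std::printf("%s\n", tensor_name("per_layer_token_embd", 7).c_str());  // prints: per_layer_token_embd
    return 0;
}
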

examples/talk-llama/llama-arch.h
Lines changed: 18 additions & 0 deletions

@@ -46,6 +46,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
     LLM_ARCH_GEMMA3,
+    LLM_ARCH_GEMMA3N,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -79,6 +80,7 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
+    LLM_ARCH_ERNIE4_5,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -269,6 +271,22 @@ enum llm_tensor {
     LLM_TENSOR_LAYER_OUT_NORM,
     LLM_TENSOR_POST_ATTN_NORM,
     LLM_TENSOR_POST_MLP_NORM,
+    LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
+    LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n
+    LLM_TENSOR_PER_LAYER_INP_GATE,   // gemma3n
+    LLM_TENSOR_PER_LAYER_PROJ,       // gemma3n
+    LLM_TENSOR_PER_LAYER_PROJ_NORM,  // gemma3n
+    LLM_TENSOR_PER_LAYER_POST_NORM,  // gemma3n
+    LLM_TENSOR_ALTUP_PROJ,           // gemma3n
+    LLM_TENSOR_ALTUP_UNEMBD_PROJ,    // gemma3n
+    LLM_TENSOR_ALTUP_CORRECT_COEF,   // gemma3n
+    LLM_TENSOR_ALTUP_CORRECT_SCALE,  // gemma3n
+    LLM_TENSOR_ALTUP_PREDICT_COEF,   // gemma3n
+    LLM_TENSOR_ALTUP_ROUTER,         // gemma3n
+    LLM_TENSOR_ALTUP_ROUTER_NORM,    // gemma3n
+    LLM_TENSOR_LAUREL_L,             // gemma3n
+    LLM_TENSOR_LAUREL_R,             // gemma3n
+    LLM_TENSOR_LAUREL_POST_NORM,     // gemma3n
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,

examples/talk-llama/llama-batch.cpp
Lines changed: 20 additions & 7 deletions

@@ -244,22 +244,35 @@ bool llama_batch_allocr::init(
             continue;
         }
 
-        if (memory) {
+        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+        if (p0 >= 0) {
+            bool ok = true;
+
             if (batch.token) {
-                if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
-                    return false;
+                if (seq_pos_min(s) != p0 + 1) {
+                    ok = false;
                 }
             } else {
                 assert(batch.embd);
 
                 // for embeddings (typically used as vision input), we allow them to have repeating positions
                 // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
-                    return false;
+                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+                    ok = false;
                 }
             }
+
+            if (!ok) {
+                LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                return false;
+            }
         }
 
         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
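
Note: the restructured check enforces the same rule as before but funnels both branches through a single ok flag so that one detailed error message can report the mismatch. A standalone sketch of the rule itself, with illustrative names rather than llama.cpp's API:

// Illustrative sketch of the position-consistency rule enforced above.
// p0        : last position already stored in memory for this sequence (-1 if none)
// batch_min : smallest position of this sequence in the incoming batch
// is_token  : true for token batches, false for embedding batches
static bool positions_consistent(int p0, int batch_min, bool is_token) {
    if (p0 < 0) {
        return true; // nothing stored yet (or no memory module): no constraint
    }
    if (is_token) {
        return batch_min == p0 + 1; // tokens must continue right after the cache
    }
    // embeddings (e.g. vision input) may also repeat the last stored position
    return batch_min == p0 || batch_min == p0 + 1;
}
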

examples/talk-llama/llama-chat.cpp
Lines changed: 11 additions & 6 deletions

@@ -528,12 +528,17 @@ int32_t llm_chat_apply_template(
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
         // this template requires the model to have "\n\n" as EOT token
-        for (auto message : chat) {
-            std::string role(message->role);
-            if (role == "user") {
-                ss << "User: " << message->content << "\n\nAssistant:";
-            } else {
-                ss << message->content << "\n\n";
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "system") {
+                ss << "System: " << trim(chat[i]->content) << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << trim(chat[i]->content) << "\n\n";
+                if (i == chat.size() - 1) {
+                    ss << "Assistant:";
+                }
+            } else if (role == "assistant") {
+                ss << "Assistant: " << trim(chat[i]->content) << "\n\n";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
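
Note: with this change the RWKV World template labels system and assistant turns explicitly, trims each message, and appends the "Assistant:" generation prompt only after the final user turn. For a hypothetical system/user/assistant/user conversation, the rendered prompt would look roughly like this (illustrative expected output, not taken from a test in this commit):

System: You are a helpful assistant.

User: Hello!

Assistant: Hi, how can I help?

User: What is RWKV?

Assistant:
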
