
Commit ff4c1a5

talk-llama : sync llama.cpp
1 parent ed6a306 commit ff4c1a5

19 files changed: +565, -104 lines

examples/talk-llama/llama-arch.cpp (62 additions, 0 deletions)

@@ -93,12 +93,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
+    { LLM_ARCH_LFM2MOE, "lfm2moe" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA, "llada" },
     { LLM_ARCH_LLADA_MOE, "llada-moe" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_GROVEMOE, "grovemoe" },
+    { LLM_ARCH_APERTUS, "apertus" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -217,6 +219,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+    // sentence-transformers dense modules feature dims
+    { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
+    { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
+    { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
+    { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },

@@ -256,6 +263,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
     { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },

+    { LLM_KV_XIELU_ALPHA_N, "xielu.alpha_n" },
+    { LLM_KV_XIELU_ALPHA_P, "xielu.alpha_p" },
+    { LLM_KV_XIELU_BETA, "xielu.beta" },
+    { LLM_KV_XIELU_EPS, "xielu.eps" },
+
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },

@@ -1064,6 +1076,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
     { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
     { LLM_TENSOR_OUTPUT, "output" },
+    { LLM_TENSOR_DENSE_2_OUT, "dense_2" },
+    { LLM_TENSOR_DENSE_3_OUT, "dense_3" },
     { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },

@@ -2098,6 +2112,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_OUTPUT, "output" },
         }
     },
+    {
+        LLM_ARCH_LFM2MOE,
+        {
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+            { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+            { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        }
+    },
     {
         LLM_ARCH_SMALLTHINKER,
         {

@@ -2119,6 +2159,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
         },
     },
+    {
+        LLM_ARCH_APERTUS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {

@@ -2229,6 +2288,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+    {LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
     {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},

@@ -2468,6 +2529,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_LFM2:
+        case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
             return true;
         default:
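
The per-architecture tables above map each llm_tensor enum to a printf-style name pattern such as "blk.%d.attn_q", which is expanded with the block index when tensors are looked up in the GGUF file. A minimal sketch of that expansion follows; the tensor_name helper is hypothetical, for illustration only, and is not llama.cpp's own API:

#include <cstdio>
#include <string>

// Hypothetical helper: expand a per-layer tensor-name pattern, e.g.
// "blk.%d.ffn_gate_exps" from the LFM2MOE table, for a given block index.
static std::string tensor_name(const char * pattern, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, il);
    return std::string(buf);
}

int main() {
    // prints "blk.3.ffn_gate_exps", the expert gate tensor of block 3
    std::printf("%s\n", tensor_name("blk.%d.ffn_gate_exps", 3).c_str());
    return 0;
}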

examples/talk-llama/llama-arch.h (15 additions, 0 deletions)

@@ -97,12 +97,14 @@ enum llm_arch {
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
+    LLM_ARCH_LFM2MOE,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
     LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_GROVEMOE,
+    LLM_ARCH_APERTUS,
     LLM_ARCH_UNKNOWN,
 };

@@ -260,17 +262,30 @@ enum llm_kv {

     LLM_KV_SHORTCONV_L_CACHE,

+    LLM_KV_XIELU_ALPHA_N,
+    LLM_KV_XIELU_ALPHA_P,
+    LLM_KV_XIELU_BETA,
+    LLM_KV_XIELU_EPS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
+
+    // sentence-transformers dense layers in and out features
+    LLM_KV_DENSE_2_FEAT_IN,
+    LLM_KV_DENSE_2_FEAT_OUT,
+    LLM_KV_DENSE_3_FEAT_IN,
+    LLM_KV_DENSE_3_FEAT_OUT,
 };

 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_DENSE_2_OUT,
+    LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,

examples/talk-llama/llama-chat.cpp (1 addition, 1 deletion)

@@ -590,7 +590,7 @@ int32_t llm_chat_apply_template(
             ss << message->content << "<|end_of_text|>\n";
         }
         if (add_ass) {
-            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+            ss << "<|start_of_role|>assistant<|end_of_role|>";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
         // GigaChat template
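
The Granite change above drops the trailing newline after the assistant role header when add_ass is set. Below is a hedged sketch of exercising a built-in template through the public llama_chat_apply_template API; it assumes the short template name "granite" is accepted by the built-in template detection:

#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    llama_chat_message msgs[] = {
        { "user",      "Hello"                          },
        { "assistant", "Hi there."                      },
        { "user",      "Summarize this commit, please." },
    };

    std::vector<char> buf(4096);
    // add_ass = true appends the assistant role header; after this change it
    // ends with "<|end_of_role|>" and no trailing newline
    const int32_t n = llama_chat_apply_template("granite", msgs, 3, /*add_ass=*/true,
                                                buf.data(), (int32_t) buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        std::printf("%.*s\n", n, buf.data());
    }
    return 0;
}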

examples/talk-llama/llama-context.cpp (6 additions, 0 deletions)

@@ -2346,6 +2346,12 @@ llama_context * llama_init_from_model(
         return nullptr;
     }

+    if (params.pooling_type != model->hparams.pooling_type) {
+        //user-specified pooling-type is different from the model default
+        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+                       model->hparams.pooling_type, params.pooling_type);
+    }
+
     try {
         auto * ctx = new llama_context(*model, params);
         return ctx;
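
The new check in llama_init_from_model only warns; the user-specified pooling_type still takes effect. A small usage sketch that would trigger the warning when the requested pooling differs from the model default (the model path is a placeholder):

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    // placeholder path, replace with a real GGUF embedding model
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;
    // explicitly request mean pooling; if this differs from the model's
    // default, llama_init_from_model now logs a warning instead of staying silent
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;

    llama_context * ctx = llama_init_from_model(model, cparams);

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}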

examples/talk-llama/llama-graph.cpp (17 additions, 0 deletions)

@@ -1853,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }

+void llm_graph_context::build_dense_out(
+        ggml_tensor * dense_2,
+        ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
+
+
 void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,
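
build_dense_out applies two chained ggml_mul_mat projections to the pooled embedding, mirroring the 2_Dense and 3_Dense modules of a sentence-transformers stack. A standalone ggml sketch of the same shape flow, with made-up toy dimensions not taken from any particular model:

#include "ggml.h"
#include <cstdio>

int main() {
    // toy dimensions: n_embd -> dense_2_feat_out -> dense_3_feat_out
    const int64_t n_embd   = 8;
    const int64_t n_tokens = 2;
    const int64_t d2_out   = 6;
    const int64_t d3_out   = 4;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // pooled embeddings: one n_embd-sized column per sequence
    struct ggml_tensor * embd    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    // dense_2 maps n_embd -> d2_out, dense_3 maps d2_out -> d3_out
    // (ggml_mul_mat contracts over the first dimension of both operands)
    struct ggml_tensor * dense_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, d2_out);
    struct ggml_tensor * dense_3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d2_out, d3_out);

    struct ggml_tensor * cur = ggml_mul_mat(ctx, dense_2, embd); // [d2_out, n_tokens]
    cur = ggml_mul_mat(ctx, dense_3, cur);                       // [d3_out, n_tokens]

    std::printf("projected embedding shape: %lld x %lld\n",
                (long long) cur->ne[0], (long long) cur->ne[1]);

    ggml_free(ctx);
    return 0;
}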

examples/talk-llama/llama-graph.h (8 additions, 0 deletions)

@@ -814,6 +814,14 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };

 // TODO: better name

examples/talk-llama/llama-hparams.cpp (5 additions, 1 deletion)

@@ -140,7 +140,11 @@ uint32_t llama_hparams::n_embd_s() const {
 }

 bool llama_hparams::is_recurrent(uint32_t il) const {
-    return recurrent_layer_arr[il];
+    if (il < n_layer) {
+        return recurrent_layer_arr[il];
+    }
+
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
 }

 uint32_t llama_hparams::n_pos_per_embd() const {

examples/talk-llama/llama-hparams.h (13 additions, 1 deletion)

@@ -42,7 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
-    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head

@@ -169,6 +169,18 @@ struct llama_hparams {
     uint32_t laurel_rank = 64;
     uint32_t n_embd_altup = 256;

+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+    uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+
+    // xIELU
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
+
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;

examples/talk-llama/llama-kv-cache-iswa.cpp (2 additions, 2 deletions)

@@ -220,15 +220,15 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }

 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }

     kv_swa->state_write(io, seq_id, flags);
 }

 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }
examples/talk-llama/llama-kv-cache.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
123123
throw std::runtime_error("failed to create ggml context for kv cache");
124124
}
125125

126-
ggml_tensor * k;
127-
ggml_tensor * v;
128-
129-
k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
130-
v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
126+
ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
127+
ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
131128

132129
ggml_format_name(k, "cache_k_l%d", il);
133130
ggml_format_name(v, "cache_v_l%d", il);
