
Commit a750e53

Fix formatting of attn / ffn / ffn_moe calls
1 parent ec3278c commit a750e53

25 files changed: +345 -131 lines changed
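Every hunk below applies the same layout to build_attn / build_ffn / build_moe_ffn call sites: the input tensor stays on the opening line, each related argument group (weight/bias pairs, expert tensors, scalar options) moves to its own continuation line, and the layer index il closes the call. As a stand-alone sketch of that shape (a hypothetical toy function, not a llama.cpp symbol):

// Hypothetical illustration of the argument-grouping style -- not llama.cpp code.
#include <cstdio>

// Many positional arguments, grouped as (input), (up w/b), (gate w/b),
// (down w/b), (options, layer index) -- mirroring build_ffn-style call sites.
static float toy_ffn(float input,
                     float up_w,   float up_b,
                     float gate_w, float gate_b,
                     float down_w, float down_b,
                     float scale,  int   il) {
    const float up   = input * up_w   + up_b;
    const float gate = input * gate_w + gate_b;
    return (up * gate) * down_w * scale + down_b + il * 0.0f;
}

int main() {
    // old style: arguments packed onto wrapped lines
    const float a = toy_ffn(1.0f, 0.5f, 0.0f, 0.25f, 0.0f, 2.0f, 0.0f, 0.125f, 0);

    // new style used in this commit: input first, one argument group per line,
    // trailing options and the layer index on the last line
    const float b = toy_ffn(1.0f,
            0.5f,  0.0f,
            0.25f, 0.0f,
            2.0f,  0.0f,
            0.125f, 0);

    std::printf("%f %f\n", a, b);
    return 0;
}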

src/llama-model.cpp

Lines changed: 86 additions & 1 deletion
@@ -13,7 +13,92 @@

 #include "ggml-cpp.h"

-#include "models/models.h"
+#include "models/llm_graph_context_mamba.h"
+#include "models/llm_build_arcee.h"
+#include "models/llm_build_arctic.h"
+#include "models/llm_build_baichuan.h"
+#include "models/llm_build_bailingmoe.h"
+#include "models/llm_build_bert.h"
+#include "models/llm_build_bitnet.h"
+#include "models/llm_build_bloom.h"
+#include "models/llm_build_chameleon.h"
+#include "models/llm_build_chatglm.h"
+#include "models/llm_build_codeshell.h"
+#include "models/llm_build_cohere2_iswa.h"
+#include "models/llm_build_command_r.h"
+#include "models/llm_build_dbrx.h"
+#include "models/llm_build_deci.h"
+#include "models/llm_build_deepseek.h"
+#include "models/llm_build_deepseek2.h"
+#include "models/llm_build_dots1.h"
+#include "models/llm_build_dream.h"
+#include "models/llm_build_ernie4_5.h"
+#include "models/llm_build_ernie4_5_moe.h"
+#include "models/llm_build_exaone.h"
+#include "models/llm_build_falcon.h"
+#include "models/llm_build_falcon_h1.h"
+#include "models/llm_build_gemma.h"
+#include "models/llm_build_gemma2_iswa.h"
+#include "models/llm_build_gemma3_iswa.h"
+#include "models/llm_build_gemma3n_iswa.h"
+#include "models/llm_build_gemma_embedding_iswa.h"
+#include "models/llm_build_glm4.h"
+#include "models/llm_build_glm4_moe.h"
+#include "models/llm_build_gpt2.h"
+#include "models/llm_build_gptneox.h"
+#include "models/llm_build_granite.h"
+#include "models/llm_build_granite_hybrid.h"
+#include "models/llm_build_grok.h"
+#include "models/llm_build_hunyuan_dense.h"
+#include "models/llm_build_hunyuan_moe.h"
+#include "models/llm_build_internlm2.h"
+#include "models/llm_build_jais.h"
+#include "models/llm_build_jamba.h"
+#include "models/llm_build_lfm2.h"
+#include "models/llm_build_llada.h"
+#include "models/llm_build_llada_moe.h"
+#include "models/llm_build_llama.h"
+#include "models/llm_build_llama_iswa.h"
+#include "models/llm_build_mamba.h"
+#include "models/llm_build_minicpm3.h"
+#include "models/llm_build_mpt.h"
+#include "models/llm_build_nemotron.h"
+#include "models/llm_build_nemotron_h.h"
+#include "models/llm_build_neo_bert.h"
+#include "models/llm_build_olmo.h"
+#include "models/llm_build_olmoe.h"
+#include "models/llm_build_openai_moe_iswa.h"
+#include "models/llm_build_openelm.h"
+#include "models/llm_build_orion.h"
+#include "models/llm_build_phi2.h"
+#include "models/llm_build_plamo.h"
+#include "models/llm_build_plamo2.h"
+#include "models/llm_build_plm.h"
+#include "models/llm_build_qwen.h"
+#include "models/llm_build_qwen2.h"
+#include "models/llm_build_qwen2moe.h"
+#include "models/llm_build_qwen2vl.h"
+#include "models/llm_build_qwen3.h"
+#include "models/llm_build_qwen3moe.h"
+#include "models/llm_build_refact.h"
+#include "models/llm_build_rwkv_base.h"
+#include "models/llm_build_rwkv6.h"
+#include "models/llm_build_rwkv6qwen2.h"
+#include "models/llm_build_rwkv7.h"
+#include "models/llm_build_arwkv7.h"
+#include "models/llm_build_seed_oss.h"
+#include "models/llm_build_smollm3.h"
+#include "models/llm_build_stablelm.h"
+#include "models/llm_build_starcoder.h"
+#include "models/llm_build_starcoder2.h"
+#include "models/llm_build_t5_dec.h"
+#include "models/llm_build_t5_enc.h"
+#include "models/llm_build_wavtokenizer_dec.h"
+#include "models/llm_build_xverse.h"
+#include "models/llm_build_exaone4.h"
+#include "models/llm_build_olmo2.h"
+#include "models/llm_build_smallthinker.h"
+#include "models/llm_build_phi3.h"

 #include <algorithm>
 #include <cassert>

src/models/apertus.cpp

Lines changed: 3 additions & 2 deletions
@@ -61,8 +61,9 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
             cb(Kcur, "Kcur_pos", il);
             cb(Vcur, "Vcur_pos", il);

-            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
-                             nullptr, kq_scale, il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             cb(cur, "attn_out", il);
         }

src/models/bailingmoe2.cpp

Lines changed: 3 additions & 2 deletions
@@ -57,8 +57,9 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
-                             nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
         }

         if (il == n_transformer_layers - 1 && inp_out_ids) {

src/models/bert.cpp

Lines changed: 18 additions & 10 deletions
@@ -101,8 +101,9 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
-                             nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             cb(cur, "kqv_out", il);
         }

@@ -134,18 +135,25 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
             cb(cur, "ffn_moe_out", il);
         } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
                    model.arch == LLM_ARCH_JINA_BERT_V3) {
-            cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL,
-                            model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU,
-                            LLM_FFN_SEQ, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
             cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
-            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-                            model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
-            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         }

src/models/cohere2-iswa.cpp

Lines changed: 5 additions & 3 deletions
@@ -91,9 +91,11 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const

         // feed-forward network
         {
-            cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
-                            NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
-                            il);
+            cur = build_ffn(ffn_inp,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         }

src/models/command-r.cpp

Lines changed: 8 additions & 4 deletions
@@ -72,8 +72,9 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
-                             nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
         };
         if (il == n_layer - 1 && inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);

@@ -84,8 +85,11 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr

         // feed-forward network
         {
-            cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(ffn_inp,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         };
         // add together residual + FFN + self-attention

src/models/deci.cpp

Lines changed: 8 additions & 5 deletions
@@ -79,8 +79,9 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
-                             nullptr, kq_scale, il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         };
         if (il == n_layer - 1 && inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);

@@ -101,9 +102,11 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
             cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);

-            cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate,
-                            model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                            NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         };
         cur = ggml_add(ctx0, cur, ffn_inp);

src/models/deepseek.cpp

Lines changed: 24 additions & 10 deletions
@@ -69,8 +69,9 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
-                             nullptr, kq_scale, il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         };
         if (il == n_layer - 1 && inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);

@@ -83,22 +84,35 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
         cb(cur, "ffn_norm", il);

         if ((uint32_t) il < hparams.n_layer_dense_lead) {
-            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
             // MoE branch
-            ggml_tensor * moe_out = build_moe_ffn(
-                cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps,
-                model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, false,
-                hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, hparams.expert_weights_scale,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
             cb(moe_out, "ffn_moe_out", il);

             // FFN shared expert
             {
                 ggml_tensor * ffn_shexp =
-                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL,
-                              NULL, model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(ffn_shexp, "ffn_shexp", il);

                 cur = ggml_add(ctx0, moe_out, ffn_shexp);

src/models/deepseek2.cpp

Lines changed: 26 additions & 13 deletions
@@ -128,8 +128,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 cb(Vcur, "Vcur", il);

                 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
-                cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
-                                 model.layers[il].wv_b, kq_scale, il);
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
             } else {
                 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                 cb(kv, "kv", il);

@@ -159,8 +160,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 cb(Kcur, "Kcur", il);

                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
-                cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
-                                 kq_scale, il);
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
         };
         if (il == n_layer - 1 && inp_out_ids) {

@@ -174,23 +176,34 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
         cb(cur, "ffn_norm", il);

         if ((uint32_t) il < hparams.n_layer_dense_lead) {
-            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
             // MoE branch
-            ggml_tensor * moe_out = build_moe_ffn(
-                cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps,
-                model.layers[il].ffn_down_exps, model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU,
-                hparams.expert_weights_norm, true, hparams.expert_weights_scale,
-                (llama_expert_gating_func_type) hparams.expert_gating_func, il);
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
             cb(moe_out, "ffn_moe_out", il);

             // FFN shared expert
             {
                 ggml_tensor * ffn_shexp =
-                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL,
-                              NULL, model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(ffn_shexp, "ffn_shexp", il);

                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
