Commit dd41d9e

Merge with master, add new models, cleanup
1 parent 743c618 commit dd41d9e

20 files changed: +636 −226 lines

src/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -35,11 +35,13 @@ add_library(llama
             unicode-data.cpp
             unicode.cpp
             unicode.h
+            models/llm_build_apertus.cpp
             models/llm_build_arcee.cpp
             models/llm_build_arctic.cpp
             models/llm_build_arwkv7.cpp
             models/llm_build_baichuan.cpp
             models/llm_build_bailingmoe.cpp
+            models/llm_build_bailingmoe2.cpp
             models/llm_build_bert.cpp
             models/llm_build_bitnet.cpp
             models/llm_build_bloom.cpp
@@ -60,7 +62,7 @@ add_library(llama
             models/llm_build_exaone4.cpp
             models/llm_build_falcon_h1.cpp
             models/llm_build_falcon.cpp
-            models/llm_build_gemma_embedding_iswa.cpp
+            models/llm_build_gemma_embedding.cpp
             models/llm_build_gemma.cpp
             models/llm_build_gemma2_iswa.cpp
             models/llm_build_gemma3_iswa.cpp
@@ -72,6 +74,7 @@ add_library(llama
             models/llm_build_granite_hybrid.cpp
             models/llm_build_granite.cpp
             models/llm_build_grok.cpp
+            models/llm_build_grovemoe.cpp
             models/llm_build_hunyuan_dense.cpp
             models/llm_build_hunyuan_moe.cpp
             models/llm_build_internlm2.cpp

src/llama-model.cpp

Lines changed: 4 additions & 1 deletion
@@ -14,10 +14,12 @@
 #include "ggml-cpp.h"
 
 #include "models/llm_graph_context_mamba.h"
+#include "models/llm_build_apertus.h"
 #include "models/llm_build_arcee.h"
 #include "models/llm_build_arctic.h"
 #include "models/llm_build_baichuan.h"
 #include "models/llm_build_bailingmoe.h"
+#include "models/llm_build_bailingmoe2.h"
 #include "models/llm_build_bert.h"
 #include "models/llm_build_bitnet.h"
 #include "models/llm_build_bloom.h"
@@ -41,14 +43,15 @@
 #include "models/llm_build_gemma2_iswa.h"
 #include "models/llm_build_gemma3_iswa.h"
 #include "models/llm_build_gemma3n_iswa.h"
-#include "models/llm_build_gemma_embedding_iswa.h"
+#include "models/llm_build_gemma_embedding.h"
 #include "models/llm_build_glm4.h"
 #include "models/llm_build_glm4_moe.h"
 #include "models/llm_build_gpt2.h"
 #include "models/llm_build_gptneox.h"
 #include "models/llm_build_granite.h"
 #include "models/llm_build_granite_hybrid.h"
 #include "models/llm_build_grok.h"
+#include "models/llm_build_grovemoe.h"
 #include "models/llm_build_hunyuan_dense.h"
 #include "models/llm_build_hunyuan_moe.h"
 #include "models/llm_build_internlm2.h"

src/models/llm_build_apertus.cpp

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
#include "llm_build_apertus.h"

#include "../llama-graph.h"
#include "../llama-model.h"

#include <cmath>

llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    ggml_tensor * inp_pos = build_inp_pos();
    auto * inp_attn = build_attn_inp_kv();

    const float kq_scale =
        hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);

            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);

            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);

            cb(Qcur, "Qcur_pos", il);
            cb(Kcur, "Kcur_pos", il);
            cb(Vcur, "Vcur_pos", il);

            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
                             nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network with xIELU activation
        {
            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            // Up projection
            ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
            cb(up, "ffn_up", il);

            float alpha_n_val = hparams.xielu_alpha_n[il];
            float alpha_p_val = hparams.xielu_alpha_p[il];
            float beta_val    = hparams.xielu_beta[il];
            float eps_val     = hparams.xielu_eps[il];

            // Apply xIELU activation
            ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
            cb(activated, "ffn_xielu", il);

            // Down projection
            cur = build_lora_mm(model.layers[il].ffn_down, activated);
            cb(cur, "ffn_down", il);
        }

        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "ffn_out", il);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
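The feed-forward block in this builder is a plain up → activation → down MLP: there is no ffn_gate branch, and the activation is ggml_xielu with per-layer alpha_n, alpha_p, beta and eps taken from hparams. A scalar sketch of that shape for orientation (the activation is left as a generic stand-in; the actual xIELU formula lives inside ggml_xielu and is not reproduced here):

// Minimal scalar sketch of the Apertus FFN path (up -> activation -> down).
// `act` is a stand-in for xIELU; the real per-layer formula is implemented
// by ggml_xielu with (alpha_n, alpha_p, beta, eps). Illustration only.
#include <cstddef>
#include <functional>
#include <vector>

static std::vector<float> ffn_forward(const std::vector<float> & x,                      // [n_embd]
                                      const std::vector<std::vector<float>> & w_up,      // [n_ff][n_embd]
                                      const std::vector<std::vector<float>> & w_down,    // [n_embd][n_ff]
                                      const std::function<float(float)> & act) {
    std::vector<float> up(w_up.size(), 0.0f);
    for (std::size_t i = 0; i < w_up.size(); ++i) {
        for (std::size_t j = 0; j < x.size(); ++j) {
            up[i] += w_up[i][j] * x[j];            // up projection (ffn_up)
        }
        up[i] = act(up[i]);                        // element-wise activation (xIELU in the model)
    }
    std::vector<float> out(w_down.size(), 0.0f);
    for (std::size_t i = 0; i < w_down.size(); ++i) {
        for (std::size_t j = 0; j < up.size(); ++j) {
            out[i] += w_down[i][j] * up[j];        // down projection (ffn_down)
        }
    }
    return out;
}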

src/models/llm_build_apertus.h

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
#pragma once

#include "../llama-model.h"
#include "../llama-graph.h"

#include <cmath>

struct llm_build_apertus : public llm_graph_context {
    llm_build_apertus(const llama_model & model, const llm_graph_params & params);
};

src/models/llm_build_bailingmoe2.cpp

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
#include "../llama-model.h"
#include "../llama-graph.h"

#include "llm_build_bailingmoe2.h"

#include <cmath>


llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
    for (int il = 0; il < n_transformer_layers; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            cur = build_lora_mm(model.layers[il].wqkv, cur);
            cb(cur, "wqkv", il);

            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
                                              cur->nb[1], 0 * sizeof(float) * (n_embd));
            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                                              cur->nb[1], 1 * sizeof(float) * (n_embd));
            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                                              cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));

            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);

            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);

            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr,
                             nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
        }

        if (il == n_transformer_layers - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
        cb(sa_out, "sa_out", il);

        // MoE branch
        cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            ggml_tensor * moe_out = build_moe_ffn(
                cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps,
                model.layers[il].ffn_down_exps, model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU,
                hparams.expert_weights_norm, true, hparams.expert_weights_scale,
                (llama_expert_gating_func_type) hparams.expert_gating_func, il);
            cb(moe_out, "ffn_moe_out", il);

            {
                ggml_tensor * ffn_shexp =
                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL,
                              NULL, model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(ffn_shexp, "ffn_shexp", il);

                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                cb(cur, "ffn_out", il);
            }
        }

        cur = ggml_add(ctx0, cur, sa_out);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
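Unlike the Apertus builder, which projects Q, K and V with separate weights, this builder computes one fused wqkv projection and then carves out Q, K and V with ggml_view_3d. Reading the view offsets (0, n_embd and n_embd + n_embd_gqa floats per token), the per-token layout appears to be [Q | K | V] with widths n_embd, n_embd_gqa and n_embd_gqa. A scalar sketch of that split under this assumption (illustration only, not code from the diff):

// Scalar sketch of how one fused QKV row is split, mirroring the
// ggml_view_3d offsets above: Q takes the first n_embd floats, K the next
// n_embd_gqa, V the final n_embd_gqa. Illustration only.
#include <cstddef>
#include <vector>

struct qkv_split {
    std::vector<float> q, k, v;
};

static qkv_split split_qkv_row(const std::vector<float> & row,   // [n_embd + 2*n_embd_gqa]
                               std::size_t n_embd, std::size_t n_embd_gqa) {
    qkv_split out;
    out.q.assign(row.begin(),                       row.begin() + n_embd);
    out.k.assign(row.begin() + n_embd,              row.begin() + n_embd + n_embd_gqa);
    out.v.assign(row.begin() + n_embd + n_embd_gqa, row.end());
    return out;
}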

src/models/llm_build_bailingmoe2.h

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
#pragma once

#include "../llama-model.h"
#include "../llama-graph.h"

#include <cmath>

struct llm_build_bailingmoe2 : public llm_graph_context {
    llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params);
};
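One more detail of the bailingmoe2 feed-forward path worth noting: the layer loop stops short of the final hparams.nextn_predict_layers layers, layers below hparams.n_layer_dense_lead use an ordinary SiLU-gated dense FFN, and later layers add a shared-expert FFN on top of the routed-expert output from build_moe_ffn (cur = moe_out + ffn_shexp). A scalar sketch of that per-layer selection, with the three FFNs passed in as stand-ins since the actual gating and routing are delegated to build_moe_ffn in the real code:

// Scalar sketch of the per-layer FFN selection in the bailingmoe2 builder.
// dense_ffn, routed_moe and shared_expert are stand-ins for build_ffn,
// build_moe_ffn and the ffn_*_shexp path; they are not part of the diff.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

using ffn_fn = std::function<std::vector<float>(const std::vector<float> &)>;

static std::vector<float> ffn_for_layer(const std::vector<float> & x, uint32_t il, uint32_t n_layer_dense_lead,
                                        const ffn_fn & dense_ffn, const ffn_fn & routed_moe,
                                        const ffn_fn & shared_expert) {
    if (il < n_layer_dense_lead) {
        return dense_ffn(x);                        // leading dense layers: plain gated FFN
    }
    std::vector<float> moe  = routed_moe(x);        // expert-routed branch (top-k experts)
    std::vector<float> shex = shared_expert(x);     // shared expert, always active
    for (std::size_t i = 0; i < moe.size(); ++i) {
        moe[i] += shex[i];                          // cur = ggml_add(moe_out, ffn_shexp)
    }
    return moe;
}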
