
Commit 261556f

Add missing qwen3vl(moe) models
1 parent c8c8fc7 commit 261556f

File tree

src/models/models.h
src/models/qwen3vl-moe.cpp
src/models/qwen3vl.cpp

3 files changed: +303 -0 lines changed


src/models/models.h

Lines changed: 9 additions & 0 deletions
```diff
@@ -402,6 +402,15 @@ struct llm_build_qwen3moe : public llm_graph_context {
     llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_qwen3vl : public llm_graph_context {
+    llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params);
+};
+
+struct llm_build_qwen3vlmoe : public llm_graph_context {
+    llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
+};
+
+
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);
 };
```
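
Both declarations sit alongside the existing llm_build_qwen3moe entry and only become reachable once the corresponding architectures are routed to them during graph construction. As a rough, purely illustrative sketch of that kind of builder dispatch (the enum values, stub structs, and create_builder() helper below are hypothetical stand-ins, not the actual llama.cpp wiring, which this commit does not touch):

```cpp
// Hypothetical sketch of architecture-to-builder dispatch. All names below are
// illustrative stand-ins; the real selection logic in llama.cpp is out of scope
// for this commit, which only adds the two builder declarations/definitions.
#include <cstdio>
#include <memory>

enum class llm_arch_sketch { QWEN3, QWEN3MOE, QWEN3VL, QWEN3VLMOE };

struct graph_builder_sketch { virtual ~graph_builder_sketch() = default; };
struct qwen3vl_builder_sketch    : graph_builder_sketch {};
struct qwen3vlmoe_builder_sketch : graph_builder_sketch {};

// pick the builder that matches the model architecture
std::unique_ptr<graph_builder_sketch> create_builder(llm_arch_sketch arch) {
    switch (arch) {
        case llm_arch_sketch::QWEN3VL:    return std::make_unique<qwen3vl_builder_sketch>();
        case llm_arch_sketch::QWEN3VLMOE: return std::make_unique<qwen3vlmoe_builder_sketch>();
        default:                          return nullptr; // other architectures elided
    }
}

int main() {
    auto b = create_builder(llm_arch_sketch::QWEN3VLMOE);
    std::printf("builder created: %s\n", b ? "yes" : "no");
    return 0;
}
```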

src/models/qwen3vl-moe.cpp

Lines changed: 150 additions & 0 deletions
```cpp
#include "models.h"

llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_full        = hparams.n_embd; // main embd + deepstack embds
    const size_t  n_deepstack_layers = hparams.n_deepstack_layers;
    const int64_t n_embd             = n_embd_full / (n_deepstack_layers + 1);
    const int64_t n_embd_head        = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);

    if (ubatch.embd) {
        // Image input: split main embd and deepstack embds
        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
        for (size_t i = 0; i < n_deepstack_layers; i++) {
            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
        }
        inpL = inpL_main;
    }

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);

            Qcur = ggml_rope_multi(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);

            Kcur = ggml_rope_multi(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
        }

        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // MoE branch
        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        ggml_tensor * moe_out =
            build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,
                    model.layers[il].ffn_down_exps,
                    nullptr,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, true,
                    false, 0.0,
                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                    il);
        cb(moe_out, "ffn_moe_out", il);
        cur = moe_out;

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        if (ubatch.embd && (size_t)il < n_deepstack_layers) {
            cur = ggml_add(ctx0, cur, deepstack_features[il]);
            cb(cur, "deepstack_out", il);
        }

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
```
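
The deepstack handling is the part that distinguishes this builder from the plain Qwen3 MoE graph: when image embeddings are present (ubatch.embd), the fused per-token embedding of width n_embd_full is viewed as one main slice plus n_deepstack_layers extra slices, each at a byte offset of (i + 1) * n_embd * sizeof(float) within the row. A minimal standalone sketch of that offset arithmetic, with made-up sizes and no ggml dependency:

```cpp
// Standalone sketch of the embedding split above (no ggml). A fused row of
// width n_embd_full holds one "main" slice plus n_deepstack_layers deepstack
// slices at element offsets (i + 1) * n_embd, mirroring the byte offsets
// (i + 1) * n_embd * sizeof(float) passed to ggml_view_2d. All sizes are
// hypothetical.
#include <cstdio>
#include <vector>

int main() {
    const int n_deepstack_layers = 3;                        // hypothetical
    const int n_embd             = 4;                        // hypothetical main width
    const int n_embd_full        = n_embd * (n_deepstack_layers + 1);
    const int n_tokens           = 2;

    // fused input: n_tokens rows of n_embd_full floats, filled with a pattern
    std::vector<float> inp(n_tokens * n_embd_full);
    for (int t = 0; t < n_tokens; ++t) {
        for (int j = 0; j < n_embd_full; ++j) {
            inp[t * n_embd_full + j] = 100.0f * t + j;
        }
    }

    for (int t = 0; t < n_tokens; ++t) {
        const float * row  = inp.data() + t * n_embd_full;   // row stride in elements (ggml uses the byte stride nb[1])
        const float * main = row;                             // main slice at offset 0
        std::printf("token %d main: %.0f..%.0f\n", t, main[0], main[n_embd - 1]);
        for (int i = 0; i < n_deepstack_layers; ++i) {
            const float * ds = row + (i + 1) * n_embd;        // deepstack feature i
            std::printf("  deepstack[%d]: %.0f..%.0f\n", i, ds[0], ds[n_embd - 1]);
        }
    }
    return 0;
}
```

Each slice is then consumed one per layer: for the first n_deepstack_layers layers, deepstack_features[il] is added to the residual stream after the MoE block (the "deepstack_out" step above).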

src/models/qwen3vl.cpp

Lines changed: 144 additions & 0 deletions
```cpp
#include "models.h"

llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {

    const int64_t n_embd_full        = hparams.n_embd; // main embd + deepstack embds
    const size_t  n_deepstack_layers = hparams.n_deepstack_layers;
    const int64_t n_embd             = n_embd_full / (n_deepstack_layers + 1);
    const int64_t n_embd_head        = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);

    if (ubatch.embd) {
        // Image input: split main embd and deepstack embds
        ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
        for (size_t i = 0; i < n_deepstack_layers; i++) {
            deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
        }
        inpL = inpL_main;
    }

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);

            Qcur = ggml_rope_multi(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);

            Kcur = ggml_rope_multi(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
        }

        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                model.layers[il].ffn_gate, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        if (ubatch.embd && (size_t)il < n_deepstack_layers) {
            cur = ggml_add(ctx0, cur, deepstack_features[il]);
            cb(cur, "deepstack_out", il);
        }

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
```
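
The dense variant differs from the MoE builder only in its feed-forward block (build_ffn instead of build_moe_ffn); the deepstack injection schedule is identical: while il < n_deepstack_layers and image embeddings are present, layer il adds deepstack_features[il] after its FFN residual. A small standalone sketch of that schedule, again with made-up values and no ggml dependency:

```cpp
// Standalone sketch (no ggml) of the deepstack injection schedule above:
// after the FFN residual of layer il, one deepstack feature is added while
// il < n_deepstack_layers, so the first few layers each consume one slice.
// Layer count and values are hypothetical.
#include <cstdio>
#include <vector>

int main() {
    const int  n_layer            = 6;        // hypothetical
    const int  n_deepstack_layers = 3;        // hypothetical
    const bool has_image_embd     = true;     // analogue of ubatch.embd

    std::vector<float> deepstack_features = {0.5f, 0.25f, 0.125f};   // one scalar per slice

    float cur = 1.0f;                         // stand-in for the hidden state
    for (int il = 0; il < n_layer; ++il) {
        cur = cur + 0.1f;                     // stand-in for the attention + FFN residual update

        if (has_image_embd && il < n_deepstack_layers) {
            cur += deepstack_features[il];    // mirrors: cur = ggml_add(ctx0, cur, deepstack_features[il])
            std::printf("layer %d: injected deepstack[%d], cur = %.3f\n", il, il, cur);
        } else {
            std::printf("layer %d: cur = %.3f\n", il, cur);
        }
    }
    return 0;
}
```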
