Commit 32dcee4

Some attempts to get the convolution input right.

1 parent 397cd9f

File tree: 3 files changed, +38 -20 lines

  src/llama-model.cpp
  src/models/llm_build_qwen3next.cpp
  src/models/llm_build_qwen3next.h

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
@@ -7138,6 +7138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -7157,7 +7158,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
-        case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_LLADA_MOE:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
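For context, these two hunks move LLM_ARCH_QWEN3NEXT out of the group whose rope type rotates pairs of head values offset by n_rot/2 and into the group returning LLAMA_ROPE_TYPE_NORM, which rotates adjacent pairs. A minimal sketch of the two pairing schemes, with illustrative names and a fixed angle (a real RoPE varies the angle per pair and per position):

// Illustrative sketch, not llama.cpp code: how the two rope types pair the
// dimensions of one head. theta is held fixed only to keep the example small.
#include <cmath>
#include <cstddef>

void rope_norm(float * x, size_t n_rot, float theta) {
    // LLAMA_ROPE_TYPE_NORM: rotate adjacent pairs (x[2i], x[2i+1])
    for (size_t i = 0; i + 1 < n_rot; i += 2) {
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = x[i], x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;
        x[i + 1] = x0 * s + x1 * c;
    }
}

void rope_neox(float * x, size_t n_rot, float theta) {
    // "the pairs of head values are offset by n_rot/2": rotate (x[i], x[i + n_rot/2])
    for (size_t i = 0; i < n_rot / 2; ++i) {
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = x[i], x1 = x[i + n_rot / 2];
        x[i]             = x0 * c - x1 * s;
        x[i + n_rot / 2] = x0 * s + x1 * c;
    }
}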

src/models/llm_build_qwen3next.cpp

Lines changed: 34 additions & 19 deletions
@@ -21,9 +21,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr

     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
-
-        // Pre-norm for attention/linear attention
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(inpL, model.layers[il].attn_norm, il);
         cb(cur, "attn_norm", il);

         // Determine layer type and build appropriate attention mechanism
@@ -35,7 +33,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
             cur = build_qwen3next_attention_layer(cur, inp_pos, inp->get_attn(), model, n_embd_head, il);
         }
         // Post-attention norm
-        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(cur, model.layers[il].attn_post_norm, il);
         cb(cur, "attn_post_norm", il);

         if (il == n_layer - 1 && inp_out_ids) {
@@ -48,14 +46,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr

         // FFN layer (MoE or dense)
         cur = build_layer_ffn(cur, model, il);
+        cb(cur, "post_moe", il);

         // Input for next layer
         inpL = cur;
     }
     cur = inpL;

     // Final norm
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cur = build_q3n_norm(cur, model.output_norm, -1);

     cb(cur, "result_norm", -1);
     res->t_embd = cur;
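The added cb(cur, "post_moe", il) only attaches a name to the tensor for the graph callback; the computation is unchanged. Named tensors like this can then be inspected while debugging, for instance through ggml-backend's scheduler eval callback. A sketch of that pattern, assuming the bool(ggml_tensor *, bool ask, void *) callback signature; the filter string and setup here are illustrative, not part of the commit:

// Sketch of inspecting named tensors via a ggml-backend eval callback.
#include <cstdio>
#include <cstring>
#include "ggml-backend.h"

static bool debug_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // first pass: tell the scheduler which tensors we want to observe
        return strncmp(t->name, "post_moe", 8) == 0;
    }
    // second pass, after compute: print the shape for verification
    fprintf(stderr, "%s: [%lld, %lld, %lld, %lld]\n",
            t->name, (long long) t->ne[0], (long long) t->ne[1],
            (long long) t->ne[2], (long long) t->ne[3]);
    return true;
}

// usage (hypothetical): ggml_backend_sched_set_eval_callback(sched, debug_cb, nullptr);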
@@ -70,6 +69,11 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }

+struct ggml_tensor * llm_build_qwen3next::build_q3n_norm(struct ggml_tensor * input, struct ggml_tensor * weights, int layer) {
+    ggml_tensor * input_norm = ggml_scale_bias(ctx0, weights, 1.0f, 1.0f);
+    return build_norm(input, input_norm, nullptr, LLM_NORM_RMS, layer);
+}
+
 // ggml_delta_net
 struct ggml_tensor * llm_build_qwen3next::ggml_delta_net(struct ggml_tensor * k,
                                                          struct ggml_tensor * v,
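About the new helper: ggml_scale_bias(ctx0, weights, 1.0f, 1.0f) builds weights * 1.0f + 1.0f, so build_q3n_norm runs RMSNorm with an effective gamma of (w + 1), i.e. the norm weights are treated as zero-centered. A scalar sketch of the per-row math the graph assembles; the function name and eps are assumptions, not the commit's code:

// Scalar equivalent of what build_q3n_norm assembles in the graph (illustrative).
#include <cmath>

void q3n_rms_norm_row(const float * x, const float * w, float * y, int n, float eps) {
    float sum_sq = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum_sq += x[i] * x[i];
    }
    const float inv_rms = 1.0f / std::sqrt(sum_sq / n + eps);
    for (int i = 0; i < n; ++i) {
        // ggml_scale_bias(weights, 1.0f, 1.0f) shifts the stored gamma by +1
        y[i] = x[i] * inv_rms * (w[i] + 1.0f);
    }
}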
@@ -386,27 +390,38 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
     ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
     cb(conv_states, "conv_states", il);

-    // Combine query, key, value for convolution input
-    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query, key, 1);
-    qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_reshaped, 1);
+    // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
+    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(query_flat, "query_flat", il);
+
+    // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(key_flat, "key_flat", il);
+
+    // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+    ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value_reshaped, head_v_dim * num_v_heads, n_tokens, n_seqs);
+    cb(value_flat, "value_flat", il);

+    // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+    qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+    cb(qkv_mixed, "qkv_mixed_concatenated", il);
+
+    // Calculate the total conv dimension
     int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;

+    // Reshape to [n_tokens, qkv_dim, n_seqs] for proper convolution input format
+    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, n_tokens, qkv_dim, n_seqs);
+    cb(qkv_mixed, "qkv_mixed_for_conv", il);
+
     // Calculate convolution kernel size
     const int64_t conv_kernel_size = model.layers[il].ssm_conv1d->ne[0];
-    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1,
-                                  d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
+    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
     cb(conv_states, "conv_states_reshaped", il);

-    // Reshape to [input_dim, n_seq_tokens, n_seqs] for concatenation
-    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_dim, n_seq_tokens, n_seqs);
-    cb(qkv_mixed, "qkv_mixed_for_conv", il);
-
-    // Concatenate cached conv states with current input
-    // conv_states: [conv_kernel_size - 1, input_dim, n_seqs]
-    // qkv_mixed: [input_dim, n_seq_tokens, n_seqs]
-    // After transpose: [n_seq_tokens, input_dim, n_seqs]
-    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, ggml_transpose(ctx0, qkv_mixed), 0);
+    // Now concatenate along the sequence dimension (dim 0 in Llama.cpp)
+    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
     cb(conv_input, "conv_input", il);

     // Apply convolution
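For reference on the layout this hunk targets: in ggml, ne[0] is the fastest-varying (contiguous) dimension, and the Mamba-style conv path feeds the convolution an input whose dim 0 carries tokens, which is why the cached states of shape [conv_kernel_size - 1, dim, n_seqs] are concatenated with the current tokens along dim 0. Worth keeping in mind when reading the new ggml_reshape_3d(ctx0, qkv_mixed, n_tokens, qkv_dim, n_seqs): a reshape reinterprets the contiguous buffer without moving data, whereas actually swapping two dimensions is ggml_transpose (usually followed by ggml_cont), the route the removed code took. A minimal self-contained sketch of the difference, using plain arrays rather than ggml:

// Illustrative only, not ggml code: a contiguous buffer holding a
// [dim = 3, tokens = 2] tensor, feature-fastest like ggml's ne[0].
#include <cstdio>

int main() {
    //               token 0          token 1
    float buf[6] = { 1.f, 2.f, 3.f,   4.f, 5.f, 6.f };

    // Reinterpreting it as [tokens = 2, dim = 3] (what a reshape does) keeps
    // memory order: the rows of length 2 become {1,2}, {3,4}, {5,6},
    // mixing features from different tokens.
    printf("reshape row 0:   %g %g\n", buf[0], buf[1]);

    // A true transpose gathers with swapped strides: row 0 of the transposed
    // view is feature 0 of each token, {1, 4}.
    printf("transpose row 0: %g %g\n", buf[0], buf[3]);
    return 0;
}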

src/models/llm_build_qwen3next.h

Lines changed: 3 additions & 0 deletions
@@ -47,4 +47,7 @@ struct llm_build_qwen3next : public llm_graph_context_mamba {
     ggml_tensor * build_layer_ffn(ggml_tensor * cur, const llama_model & model, const int il);

     ggml_tensor * softplus(ggml_tensor * alpha, ggml_tensor * dt_bias);
+
+    ggml_tensor * build_q3n_norm(struct ggml_tensor * input, struct ggml_tensor * weights, int layer);
+
 };
