@@ -21,9 +21,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
-
-        // Pre-norm for attention/linear attention
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(inpL, model.layers[il].attn_norm, il);
         cb(cur, "attn_norm", il);
 
         // Determine layer type and build appropriate attention mechanism
@@ -35,7 +33,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
             cur = build_qwen3next_attention_layer(cur, inp_pos, inp->get_attn(), model, n_embd_head, il);
         }
         // Post-attention norm
-        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(cur, model.layers[il].attn_post_norm, il);
         cb(cur, "attn_post_norm", il);
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -48,14 +46,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
 
         // FFN layer (MoE or dense)
         cur = build_layer_ffn(cur, model, il);
+        cb(cur, "post_moe", il);
 
         // Input for next layer
         inpL = cur;
     }
     cur = inpL;
 
     // Final norm
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cur = build_q3n_norm(cur, model.output_norm, -1);
 
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
@@ -70,6 +69,11 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
+struct ggml_tensor * llm_build_qwen3next::build_q3n_norm(struct ggml_tensor * input, struct ggml_tensor * weights, int layer) {
+    ggml_tensor * input_norm = ggml_scale_bias(ctx0, weights, 1.0f, 1.0f);
+    return build_norm(input, input_norm, nullptr, LLM_NORM_RMS, layer);
+}
+
 // ggml_delta_net
 struct ggml_tensor * llm_build_qwen3next::ggml_delta_net(struct ggml_tensor * k,
                                                          struct ggml_tensor * v,
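The new build_q3n_norm helper appears to fold a "+1" into the RMSNorm weight before calling the regular build_norm path. Below is a minimal standalone sketch of the arithmetic this expresses, assuming ggml_scale_bias(ctx0, weights, 1.0f, 1.0f) yields (weights + 1) elementwise and LLM_NORM_RMS is a standard RMSNorm; the function name rms_norm_plus_one and the numbers are illustrative only, not llama.cpp code.

// Sketch: RMSNorm with a zero-centered gamma, i.e. output = rmsnorm(x) * (1 + gamma).
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> rms_norm_plus_one(const std::vector<float> & x,
                                            const std::vector<float> & gamma,
                                            float eps = 1e-6f) {
    float ss = 0.0f;
    for (float v : x) ss += v * v;
    const float inv_rms = 1.0f / std::sqrt(ss / x.size() + eps);

    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * inv_rms * (1.0f + gamma[i]);   // gamma == 0 reduces to plain RMSNorm
    }
    return y;
}

int main() {
    const std::vector<float> x     = { 1.0f, -2.0f, 3.0f, -4.0f };
    const std::vector<float> gamma = { 0.0f,  0.0f, 0.5f, -0.5f };   // zero-centered weights
    for (float v : rms_norm_plus_one(x, gamma)) printf("% .4f\n", v);
    return 0;
}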
@@ -386,27 +390,38 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
     ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
     cb(conv_states, "conv_states", il);
 
-    // Combine query, key, value for convolution input
-    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query, key, 1);
-    qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_reshaped, 1);
+    // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
+    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(query_flat, "query_flat", il);
+
+    // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(key_flat, "key_flat", il);
+
+    // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+    ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value_reshaped, head_v_dim * num_v_heads, n_tokens, n_seqs);
+    cb(value_flat, "value_flat", il);
 
+    // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+    qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+    cb(qkv_mixed, "qkv_mixed_concatenated", il);
+
+    // Calculate the total conv dimension
     int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
 
+    // Reshape to [n_tokens, qkv_dim, n_seqs] for proper convolution input format
+    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, n_tokens, qkv_dim, n_seqs);
+    cb(qkv_mixed, "qkv_mixed_for_conv", il);
+
     // Calculate convolution kernel size
     const int64_t conv_kernel_size = model.layers[il].ssm_conv1d->ne[0];
-    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1,
-            d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
+    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
     cb(conv_states, "conv_states_reshaped", il);
 
-    // Reshape to [input_dim, n_seq_tokens, n_seqs] for concatenation
-    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_dim, n_seq_tokens, n_seqs);
-    cb(qkv_mixed, "qkv_mixed_for_conv", il);
-
-    // Concatenate cached conv states with current input
-    // conv_states: [conv_kernel_size - 1, input_dim, n_seqs]
-    // qkv_mixed: [input_dim, n_seq_tokens, n_seqs]
-    // After transpose: [n_seq_tokens, input_dim, n_seqs]
-    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, ggml_transpose(ctx0, qkv_mixed), 0);
+    // Now concatenate along the sequence dimension (dim 0 in Llama.cpp)
+    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
     cb(conv_input, "conv_input", il);
 
     // Apply convolution
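The rewritten path flattens the head dimensions of q, k, and v, concatenates them along ggml's dim 0 into a per-token feature vector of size qkv_dim, and then prepends the cached conv state (the last conv_kernel_size - 1 positions) before the causal conv1d is applied. The sketch below only walks through the resulting sizes; the dimension values are illustrative placeholders, not Qwen3-Next's actual hyperparameters, and it is not llama.cpp code.

// Sketch: shape bookkeeping for the qkv_mixed / conv_input construction above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t head_k_dim = 64, num_k_heads = 8;   // hypothetical values
    const int64_t head_v_dim = 64, num_v_heads = 16;  // hypothetical values
    const int64_t n_tokens   = 5,  n_seqs      = 1;
    const int64_t conv_kernel_size = 4;

    // q/k/v flattened per token, then concatenated along the feature axis (ggml dim 0):
    const int64_t q_feat  = head_k_dim * num_k_heads;
    const int64_t k_feat  = head_k_dim * num_k_heads;
    const int64_t v_feat  = head_v_dim * num_v_heads;
    const int64_t qkv_dim = q_feat + k_feat + v_feat; // == head_k_dim*num_k_heads*2 + head_v_dim*num_v_heads
    printf("qkv_mixed after concat: [%lld, %lld, %lld]\n",
           (long long) qkv_dim, (long long) n_tokens, (long long) n_seqs);

    // The cached conv state stores the previous conv_kernel_size - 1 positions; prepending it to the
    // current tokens gives the window the causal convolution slides over:
    const int64_t conv_len = (conv_kernel_size - 1) + n_tokens;
    printf("conv_input length along the concatenated axis: %lld\n", (long long) conv_len);
    return 0;
}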