@@ -21,9 +21,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
-
-        // Pre-norm for attention/linear attention
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(inpL, model.layers[il].attn_norm, il);
         cb(cur, "attn_norm", il);
 
         // Determine layer type and build appropriate attention mechanism
@@ -35,7 +33,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
             cur = build_qwen3next_attention_layer(cur, inp_pos, inp->get_attn(), model, n_embd_head, il);
         }
         // Post-attention norm
-        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(cur, model.layers[il].attn_post_norm, il);
         cb(cur, "attn_post_norm", il);
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -48,14 +46,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
 
         // FFN layer (MoE or dense)
         cur = build_layer_ffn(cur, model, il);
+        cb(cur, "post_moe", il);
 
         // Input for next layer
         inpL = cur;
     }
     cur = inpL;
 
     // Final norm
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cur = build_q3n_norm(cur, model.output_norm, -1);
 
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
@@ -70,6 +69,11 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
+struct ggml_tensor * llm_build_qwen3next::build_q3n_norm(struct ggml_tensor * input, struct ggml_tensor * weights, int layer) {
+    ggml_tensor * input_norm = ggml_scale_bias(ctx0, weights, 1.0f, 1.0f);
+    return build_norm(input, input_norm, nullptr, LLM_NORM_RMS, layer);
+}
+
 // ggml_delta_net
 struct ggml_tensor * llm_build_qwen3next::ggml_delta_net(struct ggml_tensor * k,
                                                          struct ggml_tensor * v,
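The new build_q3n_norm helper appears to fold a "+1" into the RMSNorm weight before calling the regular build_norm path. Below is a minimal standalone sketch of the arithmetic this expresses, assuming ggml_scale_bias(ctx0, weights, 1.0f, 1.0f) yields (weights + 1) elementwise and LLM_NORM_RMS is a standard RMSNorm; the function name rms_norm_plus_one and the numbers are illustrative only, not llama.cpp code.

// Sketch: RMSNorm with a zero-centered gamma, i.e. output = rmsnorm(x) * (1 + gamma).
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> rms_norm_plus_one(const std::vector<float> & x,
                                            const std::vector<float> & gamma,
                                            float eps = 1e-6f) {
    float ss = 0.0f;
    for (float v : x) ss += v * v;
    const float inv_rms = 1.0f / std::sqrt(ss / x.size() + eps);

    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * inv_rms * (1.0f + gamma[i]);   // gamma == 0 reduces to plain RMSNorm
    }
    return y;
}

int main() {
    const std::vector<float> x     = { 1.0f, -2.0f, 3.0f, -4.0f };
    const std::vector<float> gamma = { 0.0f,  0.0f, 0.5f, -0.5f };   // zero-centered weights
    for (float v : rms_norm_plus_one(x, gamma)) printf("% .4f\n", v);
    return 0;
}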
@@ -386,27 +390,38 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
     ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
     cb(conv_states, "conv_states", il);
 
-    // Combine query, key, value for convolution input
-    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query, key, 1);
-    qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_reshaped, 1);
+    // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
+    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(query_flat, "query_flat", il);
+
+    // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(key_flat, "key_flat", il);
+
+    // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+    ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value_reshaped, head_v_dim * num_v_heads, n_tokens, n_seqs);
+    cb(value_flat, "value_flat", il);
 
+    // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+    qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+    cb(qkv_mixed, "qkv_mixed_concatenated", il);
+
+    // Calculate the total conv dimension
     int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
 
+    // Reshape to [n_tokens, qkv_dim, n_seqs] for proper convolution input format
+    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, n_tokens, qkv_dim, n_seqs);
+    cb(qkv_mixed, "qkv_mixed_for_conv", il);
+
     // Calculate convolution kernel size
     const int64_t conv_kernel_size = model.layers[il].ssm_conv1d->ne[0];
-    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1,
-            d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
+    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
     cb(conv_states, "conv_states_reshaped", il);
 
-    // Reshape to [input_dim, n_seq_tokens, n_seqs] for concatenation
-    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_dim, n_seq_tokens, n_seqs);
-    cb(qkv_mixed, "qkv_mixed_for_conv", il);
-
-    // Concatenate cached conv states with current input
-    // conv_states: [conv_kernel_size - 1, input_dim, n_seqs]
-    // qkv_mixed: [input_dim, n_seq_tokens, n_seqs]
-    // After transpose: [n_seq_tokens, input_dim, n_seqs]
-    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, ggml_transpose(ctx0, qkv_mixed), 0);
+    // Now concatenate along the sequence dimension (dim 0 in Llama.cpp)
+    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
     cb(conv_input, "conv_input", il);
 
     // Apply convolution
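The rewritten path flattens the head dimensions of q, k, and v, concatenates them along ggml's dim 0 into a per-token feature vector of size qkv_dim, and then prepends the cached conv state (the last conv_kernel_size - 1 positions) before the causal conv1d is applied. The sketch below only walks through the resulting sizes; the dimension values are illustrative placeholders, not Qwen3-Next's actual hyperparameters, and it is not llama.cpp code.

// Sketch: shape bookkeeping for the qkv_mixed / conv_input construction above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t head_k_dim = 64, num_k_heads = 8;   // hypothetical values
    const int64_t head_v_dim = 64, num_v_heads = 16;  // hypothetical values
    const int64_t n_tokens   = 5,  n_seqs      = 1;
    const int64_t conv_kernel_size = 4;

    // q/k/v flattened per token, then concatenated along the feature axis (ggml dim 0):
    const int64_t q_feat  = head_k_dim * num_k_heads;
    const int64_t k_feat  = head_k_dim * num_k_heads;
    const int64_t v_feat  = head_v_dim * num_v_heads;
    const int64_t qkv_dim = q_feat + k_feat + v_feat; // == head_k_dim*num_k_heads*2 + head_v_dim*num_v_heads
    printf("qkv_mixed after concat: [%lld, %lld, %lld]\n",
           (long long) qkv_dim, (long long) n_tokens, (long long) n_seqs);

    // The cached conv state stores the previous conv_kernel_size - 1 positions; prepending it to the
    // current tokens gives the window the causal convolution slides over:
    const int64_t conv_len = (conv_kernel_size - 1) + n_tokens;
    printf("conv_input length along the concatenated axis: %lld\n", (long long) conv_len);
    return 0;
}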