@@ -28,6 +28,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
 
     virtual size_t get_ctx_size() const override { return ctx_size; }
 
+    bool merge_qkv(const LLM_TN & tn, int i, int bias);
+
     bool create_tensors() override;
 
     bool create_llama_tensors(const LLM_TN & tn);
@@ -284,15 +286,11 @@ bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) {
 
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+        use_mmap_buffer &= !merge_qkv(tn, i, 1);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
         // optional bias tensors
-        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
         layer.bo = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
         layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
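Every call site follows the same pattern: use_mmap_buffer &= !merge_qkv(tn, i, bias). The helper returns true only when it actually allocated a fused wqkv tensor, in which case the caller clears use_mmap_buffer, presumably because the concatenated weights then have to be materialized in a fresh buffer instead of being mapped straight out of the model file. The third argument encodes how the architecture handles Q/K/V biases: 0 for archs without them (the LLaMA-4, Qwen3 and Qwen3-MoE hunks below), 1 when they are optional (the LLaMA path above), and 2 when they are required (GLM-4-MoE and the OpenAI-MoE path further down). The patch passes bare integers; a hypothetical named spelling of the same convention, for reference only:

    // Hypothetical names for the 0/1/2 values passed to merge_qkv(); not part of the patch.
    enum qkv_bias_mode : int {
        QKV_BIAS_NONE     = 0, // arch has no Q/K/V bias tensors
        QKV_BIAS_OPTIONAL = 1, // biases may be absent (unfused fallback loads them with TENSOR_NOT_REQUIRED)
        QKV_BIAS_REQUIRED = 2, // biases must be present and are merged into bqkv as well
    };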
@@ -418,9 +416,8 @@ bool create_tensors_helper::create_llama4_tensors(const LLM_TN & tn) {
 
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        use_mmap_buffer &= !merge_qkv(tn, i, 0);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
         layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -1018,9 +1015,8 @@ bool create_tensors_helper::create_qwen3_tensors(const LLM_TN & tn) {
 
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+        use_mmap_buffer &= !merge_qkv(tn, i, 0);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
         layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
@@ -1044,9 +1040,8 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) {
 
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+        use_mmap_buffer &= !merge_qkv(tn, i, 0);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
         layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
@@ -1700,12 +1695,16 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
 
         // GLM-style attention with bias terms
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
-        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
-        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+        if (!flags) {
+            use_mmap_buffer &= !merge_qkv(tn, i, 2);
+        } else {
+            layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+            layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+            layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+        }
 
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 
@@ -2380,10 +2379,10 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
         layer.attn_norm      = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
         layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+        use_mmap_buffer &= !merge_qkv(tn, i, 2);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+        layer.bo = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
 
         layer.attn_sinks = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
 
@@ -2394,11 +2393,6 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
         layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_up);
 
         // bias
-        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
-        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
-        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
-        layer.bo = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
         ggml_context *ctx_ffn_gate_b, *ctx_ffn_up_b, *ctx_ffn_down_b;
         layer.ffn_gate_inp_b  = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
         layer.ffn_gate_exps_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_gate_b);
@@ -2421,6 +2415,88 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
     return use_mmap_buffer;
 }
 
+bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias) {
+    auto & hparams = model.hparams;
+    const int64_t n_head        = hparams.n_head();
+    const int64_t n_head_kv     = hparams.n_head_kv();
+    const int64_t n_embd        = hparams.n_embd;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_gqa    = n_embd_v_gqa;
+
+    ggml_context * ctx_layer = ctx_for_layer(i);
+    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+    auto & layer = model.layers[i];
+
+    auto wq_name = tn(LLM_TENSOR_ATTN_Q, "weight", i);
+    auto wk_name = tn(LLM_TENSOR_ATTN_K, "weight", i);
+    auto wv_name = tn(LLM_TENSOR_ATTN_V, "weight", i);
+    auto wq = ml.require_tensor_meta(wq_name.c_str());
+    auto wk = ml.require_tensor_meta(wk_name.c_str());
+    auto wv = ml.require_tensor_meta(wv_name.c_str());
+    GGML_ASSERT(wq && wk && wv);
+
+    bool fused_qkv = false;
+    if (ml.merge_qkv && wq->type == wk->type && wq->type == wv->type && hparams.f_attention_scale == 0.0f) {
+        GGML_ASSERT(wq->ne[0] == n_embd && wq->ne[1] == n_head * n_embd_head_k);
+        GGML_ASSERT(wk->ne[0] == n_embd && wk->ne[1] == n_embd_gqa);
+        GGML_ASSERT(wv->ne[0] == n_embd && wv->ne[1] == n_embd_gqa);
+        layer.wqkv = ggml_new_tensor_2d(ctx_split, wq->type, n_embd, n_embd_head_k * (n_head + n_head_kv + n_head_kv));
+        snprintf(layer.wqkv->name, GGML_MAX_NAME, "blk.%d.attn_qkv.weight", i);
+        // This does not work. If we are doing this merge manually, it basically means that the arch does not have
+        // an LLM_TENSOR_ATTN_QKV entry, so we will get __missing__ as the tensor name.
+        //ggml_set_name(layer.wqkv, tn(LLM_TENSOR_ATTN_QKV, "weight", i).c_str());
+        layer.wq = ml.create_tensor_as_view(ctx_split, layer.wqkv, wq_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
+        layer.wk = ml.create_tensor_as_view(ctx_split, layer.wqkv, wk_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
+        layer.wv = ml.create_tensor_as_view(ctx_split, layer.wqkv, wv_name.c_str(), { wv->ne[0], wv->ne[1] }, wq->ne[1]*wq->nb[1] + wk->ne[1]*wk->nb[1]);
+        fused_qkv = true;
+        printf("================================== Created merged qkv %s\n", layer.wqkv->name);
+        if (bias) {
+            auto bq_name = tn(LLM_TENSOR_ATTN_Q, "bias", i);
+            auto bk_name = tn(LLM_TENSOR_ATTN_K, "bias", i);
+            auto bv_name = tn(LLM_TENSOR_ATTN_V, "bias", i);
+            auto bq = ml.get_tensor_meta(bq_name.c_str());
+            auto bk = ml.get_tensor_meta(bk_name.c_str());
+            auto bv = ml.get_tensor_meta(bv_name.c_str());
+            if (bias == 2) {
+                GGML_ASSERT(bq && bk && bv);
+            } else {
+                GGML_ASSERT(!bq && !bk && !bv);
+            }
+            if (bq && bk && bv) {
+                GGML_ASSERT(bq->type == GGML_TYPE_F32 && bk->type == GGML_TYPE_F32 && bv->type == GGML_TYPE_F32);
+                GGML_ASSERT(ggml_nrows(bq) == 1 && bq->ne[0] == wq->ne[1]);
+                GGML_ASSERT(ggml_nrows(bk) == 1 && bk->ne[0] == wk->ne[1]);
+                GGML_ASSERT(ggml_nrows(bv) == 1 && bv->ne[0] == wv->ne[1]);
+                layer.bqkv = ggml_new_tensor_1d(ctx_layer, bq->type, n_embd_head_k * (n_head + n_head_kv + n_head_kv));
+                snprintf(layer.bqkv->name, GGML_MAX_NAME, "blk.%d.attn_qkv.bias", i);
+                layer.bq = ml.create_tensor_as_view(ctx_layer, layer.bqkv, bq_name.c_str(), { bq->ne[0] }, 0);
+                layer.bk = ml.create_tensor_as_view(ctx_layer, layer.bqkv, bk_name.c_str(), { bk->ne[0] }, bq->ne[0]*bq->nb[0]);
+                layer.bv = ml.create_tensor_as_view(ctx_layer, layer.bqkv, bv_name.c_str(), { bv->ne[0] }, bq->ne[0]*bq->nb[0] + bk->ne[0]*bk->nb[0]);
+            }
+        }
+    }
+
+    if (!fused_qkv) {
+        if (ml.merge_qkv) {
+            printf("%s: did not merge Q, K, V in layer %d because %d, %d, %d\n", __func__, i,
+                    wq->type == wk->type, wq->type == wv->type, hparams.f_attention_scale == 0.0f);
+        }
+        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+        if (bias) {
+            auto flags = bias == 1 ? llama_model_loader::TENSOR_NOT_REQUIRED : 0;
+            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags);
+            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags);
+            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags);
+        }
+    }
+
+    return fused_qkv;
+}
+
 bool create_tensors_helper::create_tensors() {
     const auto tn = LLM_TN(model.arch);
     bool use_mmap_buffer = true;
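merge_qkv() lays the fused weight out as all Q output rows, then K, then V along dimension 1 of one 2D tensor, and the per-tensor views are carved out purely by byte offset: 0 for Q, wq->ne[1]*wq->nb[1] for K, and that plus wk->ne[1]*wk->nb[1] for V (nb[1] is ggml's row stride in bytes). The 1D bias views work the same way with nb[0], the element stride. A minimal standalone sketch of that offset arithmetic, using made-up F16 dimensions rather than anything read from a model:

    // Sketch only: mirrors the view-offset arithmetic in merge_qkv() for an
    // assumed F16, GQA-style configuration; none of these numbers come from the patch.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd        = 4096;              // assumed
        const int64_t n_head        = 32;                // assumed
        const int64_t n_head_kv     = 8;                 // assumed (grouped-query attention)
        const int64_t n_embd_head_k = 128;               // assumed
        const size_t  nb1           = n_embd * 2;        // bytes per row of F16 weights

        const int64_t q_rows = n_embd_head_k * n_head;    // rows contributed by wq
        const int64_t k_rows = n_embd_head_k * n_head_kv; // rows contributed by wk
        const int64_t v_rows = n_embd_head_k * n_head_kv; // rows contributed by wv

        // wqkv has ne = {n_embd, q_rows + k_rows + v_rows}; the three views start at:
        const size_t off_q = 0;
        const size_t off_k = (size_t) q_rows * nb1;          // wq->ne[1]*wq->nb[1]
        const size_t off_v = off_k + (size_t) k_rows * nb1;  // + wk->ne[1]*wk->nb[1]

        printf("wqkv rows: %lld, Q at byte %zu, K at byte %zu, V at byte %zu\n",
               (long long) (q_rows + k_rows + v_rows), off_q, off_k, off_v);
        return 0;
    }

This row layout is what makes it possible to run a single matmul against wqkv and split the result back into Q/K/V activations with ggml_view_2d at the analogous element offsets; that consumer side is not shown in this hunk.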