@@ -1656,10 +1656,10 @@ struct llm_build_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head    = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_i    = hparams.n_head(il);

-            if (n_head == 0) {
+            if (n_head_i == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;
             } else {
@@ -1670,11 +1670,11 @@ struct llm_build_context {
                 cb(cur, "attn_norm", il);
             }

-            if (n_head > 0 && n_head_kv == 0) {
+            if (n_head_i > 0 && n_head_kv_i == 0) {
                 // "linear attention" of Llama-3_1-Nemotron-51B
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "wo", il);
-            } else if (n_head > 0) {
+            } else if (n_head_i > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
                 struct ggml_tensor * rope_factors = build_rope_factors(il);
@@ -1702,14 +1702,14 @@ struct llm_build_context {
                 }

                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -1734,7 +1734,7 @@ struct llm_build_context {

             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             struct ggml_tensor * ffn_inp = cur;
-            if (n_head > 0) {
+            if (n_head_i > 0) {
                 ffn_inp = ggml_add(ctx0, cur, inpSA);
                 cb(ffn_inp, "ffn_inp", il);
             }
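Taken together, these first hunks rename the per-layer head counts so they no longer shadow the model-wide n_head / n_head_kv values held by the surrounding build context, and the renamed counts drive a three-way dispatch for Llama-3_1-Nemotron-51B: a layer with no heads passes the input straight through, a layer with heads but no KV heads only applies the wo projection ("linear attention"), and any other layer runs regular self-attention. The following is a minimal standalone sketch of that dispatch, not llama.cpp code; classify_layer and the sample head counts are hypothetical stand-ins for hparams.n_head(il) / hparams.n_head_kv(il).

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical classification mirroring the branch structure in the hunks above.
enum class layer_kind { attention_free, linear_attention, self_attention };

static layer_kind classify_layer(int64_t n_head_i, int64_t n_head_kv_i) {
    if (n_head_i == 0) {
        return layer_kind::attention_free;    // cur = inpL; no attention block at all
    }
    if (n_head_kv_i == 0) {
        return layer_kind::linear_attention;  // only the wo projection is applied
    }
    return layer_kind::self_attention;        // full QKV + RoPE + KV-cache path
}

int main() {
    // Made-up per-layer head counts; the real values come from the model hparams.
    const std::vector<std::pair<int64_t, int64_t>> heads = {{8, 2}, {0, 0}, {8, 0}};
    for (size_t il = 0; il < heads.size(); ++il) {
        const char * kind = "self-attention";
        switch (classify_layer(heads[il].first, heads[il].second)) {
            case layer_kind::attention_free:   kind = "attention-free";   break;
            case layer_kind::linear_attention: kind = "linear attention"; break;
            case layer_kind::self_attention:                              break;
        }
        std::printf("layer %zu: %s\n", il, kind);
    }
    return 0;
}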
@@ -2643,7 +2643,7 @@ struct llm_build_context {

         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
+            cur = inpL;

             struct ggml_tensor * Qcur;
             struct ggml_tensor * Kcur;
@@ -4717,8 +4717,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4825,8 +4823,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4962,6 +4958,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -5800,9 +5797,9 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head    = hparams.n_head(il);
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+            const int64_t n_head_i    = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i;

             cur = inpL;
             struct ggml_tensor * residual = cur;
@@ -5818,15 +5815,15 @@ struct llm_build_context {
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

-            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens);

-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0));
             cb(Qcur, "Qcur", il);

-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i));
             cb(Kcur, "Kcur", il);

-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i)));
             cb(Vcur, "Vcur", il);

             Qcur = llm_build_norm(ctx0, Qcur, hparams,
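For orientation, this hunk slices one fused QKV activation of shape [n_embd_head_k, n_head_qkv_i, n_tokens] into Q, K and V views by head-row offset: Q starts at row 0, K at row n_head_i, and V at row n_head_i + n_head_kv_i, which is where the byte offsets 0, nb[1]*n_head_i and nb[1]*(n_head_i+n_head_kv_i) come from. Below is a minimal standalone sketch of that offset arithmetic on a plain float buffer with made-up sizes; it is not llama.cpp / ggml code.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Made-up per-layer sizes; in the hunk above these come from hparams.
    const int64_t n_embd_head  = 64;
    const int64_t n_head_i     = 12;
    const int64_t n_head_kv_i  = 3;
    const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i; // fused Q+K+V head count
    const int64_t n_tokens     = 2;

    // Fused QKV activations laid out as [n_embd_head, n_head_qkv_i, n_tokens],
    // innermost dimension contiguous (the layout the views above slice into).
    std::vector<float> qkv(n_embd_head * n_head_qkv_i * n_tokens, 0.0f);

    const int64_t nb1 = n_embd_head;                // elements per head row
    const int64_t nb2 = n_embd_head * n_head_qkv_i; // elements per token

    // Element offsets of the Q, K and V slices inside one token's fused block;
    // they mirror the byte offsets 0, nb[1]*n_head_i and nb[1]*(n_head_i+n_head_kv_i).
    const int64_t q_off = 0;
    const int64_t k_off = nb1 * n_head_i;
    const int64_t v_off = nb1 * (n_head_i + n_head_kv_i);

    std::printf("per-token stride: %lld, Q at %lld, K at %lld, V at %lld\n",
                (long long) nb2, (long long) q_off, (long long) k_off, (long long) v_off);

    // e.g. the first element of V for token t lives at qkv[t*nb2 + v_off]
    (void) qkv;
    return 0;
}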
@@ -5851,7 +5848,7 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens);
             cb(Qcur, "Vcur", il);

             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
@@ -7495,9 +7492,9 @@ struct llm_build_context {
         // Token shift state dimensions should be 2 * n_emb
         GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);

-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs       = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
+
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
@@ -7608,9 +7605,9 @@ struct llm_build_context {

         GGML_ASSERT(n_embd == hparams.n_embd_k_s());

-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs       = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
+
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
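The last two hunks, like the earlier cur and n_embd_head_k removals, follow one pattern: locals that shadow or merely duplicate values the surrounding build context already provides are dropped, so the asserts now read the outer n_tokens directly and the relation n_tokens == n_seq_tokens * n_seqs still holds. Below is a minimal illustration of the kind of shadowing being removed, not llama.cpp code; the types and sizes are made up.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for ubatch and the build context; only the shadowing
// pattern is the point here.
struct ubatch_like {
    int64_t n_tokens     = 8;
    int64_t n_seq_tokens = 4;
    int64_t n_seqs       = 2;
};

struct builder_like {
    ubatch_like ubatch;
    int64_t     n_tokens = ubatch.n_tokens;   // outer value, like llm_build_context::n_tokens

    void build() const {
        const int64_t n_seqs       = ubatch.n_seqs;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        // const int64_t n_tokens = ubatch.n_tokens;  // <- the redundant local the diff
        //                                            //    deletes: it shadows the member
        //                                            //    (flagged by e.g. GCC's -Wshadow)
        std::printf("%lld tokens == %lld per-seq tokens * %lld seqs\n",
                    (long long) n_tokens, (long long) n_seq_tokens, (long long) n_seqs);
    }
};

int main() {
    builder_like b;
    b.build();
    return 0;
}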