@@ -1476,9 +1476,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -1553,7 +1550,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
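Note: the same two-part deletion repeats for every build_* graph below. The local
n_tokens shadow was only reassigned so later code could see the reduced batch size,
but after ggml_get_rows the row count already lives in the tensor shape itself, so
the scalar copy is dead. A minimal standalone sketch (illustrative only, not part
of this commit; assumes the ggml headers and a throwaway context):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        const int n_embd = 4, n_tokens = 8, n_outputs = 2;

        // hidden state for the full batch: [n_embd, n_tokens]
        struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
        // row indices of the tokens whose outputs are actually requested
        struct ggml_tensor * inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);

        // ggml_get_rows builds a gather node whose shape metadata is fixed at
        // creation time: the second dimension is already n_outputs here, with
        // no separate scalar bookkeeping needed
        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
        printf("rows after gather: %lld\n", (long long) cur->ne[1]); // prints 2

        ggml_free(ctx0);
        return 0;
    }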
@@ -1642,9 +1638,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deci() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -1730,7 +1723,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -2141,9 +2133,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_grok() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -2218,7 +2207,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -2300,9 +2288,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_dbrx() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -2370,7 +2355,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -3553,9 +3537,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -3620,7 +3601,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5440,9 +5420,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmo() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5513,7 +5490,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5564,9 +5540,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5637,7 +5610,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5692,9 +5664,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmoe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5764,7 +5733,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6085,9 +6053,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_arctic() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6146,7 +6111,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6219,9 +6183,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deepseek() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6295,7 +6256,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6376,9 +6336,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         bool is_lite = (hparams.n_layer == 27);
 
         // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
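The comment above refers to YaRN's attention-magnitude correction: attention
logits are multiplied by mscale^2, where the YaRN paper sets
mscale = 0.1 * ln(s) + 1 for a context-extension factor s, and the pre-scaling
folds that factor into kq_scale and attn_factor up front rather than applying
it inside the RoPE op. A hedged sketch of the correction term (helper name
hypothetical, not from this diff):

    #include <math.h>

    // YaRN magnitude scale: 1 for s <= 1, otherwise 0.1*ln(s) + 1
    static float yarn_mscale(float s /* context-extension factor */) {
        return s <= 1.0f ? 1.0f : 0.1f * logf(s) + 1.0f;
    }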
@@ -6527,7 +6484,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6757,9 +6713,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_t5_enc() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6833,7 +6786,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6889,9 +6841,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_t5_dec() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7033,7 +6982,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
@@ -7421,9 +7369,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_exaone() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -7497,7 +7442,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -7779,9 +7723,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_chameleon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -7878,7 +7819,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }