@@ -3997,9 +3997,6 @@ struct llm_build_context {
39973997 struct ggml_cgraph * build_llama() {
39983998 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
39993999
4000- // mutable variable, needed during the last layer of the computation to skip unused tokens
4001- int32_t n_tokens = this->n_tokens;
4002-
40034000 const int64_t n_embd_head = hparams.n_embd_head_v;
40044001 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
40054002 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -4074,7 +4071,6 @@ struct llm_build_context {
40744071 if (il == n_layer - 1) {
40754072 // skip computing output for unused tokens
40764073 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
4077- n_tokens = n_outputs;
40784074 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
40794075 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
40804076 }
@@ -4163,9 +4159,6 @@ struct llm_build_context {
41634159 struct ggml_cgraph * build_deci() {
41644160 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
41654161
4166- // mutable variable, needed during the last layer of the computation to skip unused tokens
4167- int32_t n_tokens = this->n_tokens;
4168-
41694162 const int64_t n_embd_head = hparams.n_embd_head_v;
41704163 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
41714164 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -4251,7 +4244,6 @@ struct llm_build_context {
42514244 if (il == n_layer - 1) {
42524245 // skip computing output for unused tokens
42534246 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
4254- n_tokens = n_outputs;
42554247 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
42564248 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
42574249 }
@@ -4662,9 +4654,6 @@ struct llm_build_context {
46624654 struct ggml_cgraph * build_grok() {
46634655 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
46644656
4665- // mutable variable, needed during the last layer of the computation to skip unused tokens
4666- int32_t n_tokens = this->n_tokens;
4667-
46684657 const int64_t n_embd_head = hparams.n_embd_head_v;
46694658 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
46704659 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -4739,7 +4728,6 @@ struct llm_build_context {
47394728 if (il == n_layer - 1) {
47404729 // skip computing output for unused tokens
47414730 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
4742- n_tokens = n_outputs;
47434731 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
47444732 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
47454733 }
@@ -4821,9 +4809,6 @@ struct llm_build_context {
48214809 struct ggml_cgraph * build_dbrx() {
48224810 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
48234811
4824- // mutable variable, needed during the last layer of the computation to skip unused tokens
4825- int32_t n_tokens = this->n_tokens;
4826-
48274812 const int64_t n_embd_head = hparams.n_embd_head_v;
48284813 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
48294814 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4891,7 +4876,6 @@ struct llm_build_context {
48914876 if (il == n_layer - 1) {
48924877 // skip computing output for unused tokens
48934878 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
4894- n_tokens = n_outputs;
48954879 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
48964880 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
48974881 }
@@ -6074,9 +6058,6 @@ struct llm_build_context {
60746058 struct ggml_cgraph * build_qwen2moe() {
60756059 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
60766060
6077- // mutable variable, needed during the last layer of the computation to skip unused tokens
6078- int32_t n_tokens = this->n_tokens;
6079-
60806061 const int64_t n_embd_head = hparams.n_embd_head_v;
60816062 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
60826063 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6141,7 +6122,6 @@ struct llm_build_context {
61416122 if (il == n_layer - 1) {
61426123 // skip computing output for unused tokens
61436124 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6144- n_tokens = n_outputs;
61456125 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
61466126 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
61476127 }
@@ -7961,9 +7941,6 @@ struct llm_build_context {
79617941 struct ggml_cgraph * build_olmo() {
79627942 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
79637943
7964- // mutable variable, needed during the last layer of the computation to skip unused tokens
7965- int32_t n_tokens = this->n_tokens;
7966-
79677944 const int64_t n_embd_head = hparams.n_embd_head_v;
79687945 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
79697946 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8034,7 +8011,6 @@ struct llm_build_context {
80348011 if (il == n_layer - 1) {
80358012 // skip computing output for unused tokens
80368013 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8037- n_tokens = n_outputs;
80388014 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
80398015 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
80408016 }
@@ -8085,9 +8061,6 @@ struct llm_build_context {
80858061 struct ggml_cgraph * build_olmo2() {
80868062 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
80878063
8088- // mutable variable, needed during the last layer of the computation to skip unused tokens
8089- int32_t n_tokens = this->n_tokens;
8090-
80918064 const int64_t n_embd_head = hparams.n_embd_head_v;
80928065 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
80938066 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8158,7 +8131,6 @@ struct llm_build_context {
81588131 if (il == n_layer - 1) {
81598132 // skip computing output for unused tokens
81608133 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8161- n_tokens = n_outputs;
81628134 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
81638135 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
81648136 }
@@ -8213,9 +8185,6 @@ struct llm_build_context {
82138185 struct ggml_cgraph * build_olmoe() {
82148186 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
82158187
8216- // mutable variable, needed during the last layer of the computation to skip unused tokens
8217- int32_t n_tokens = this->n_tokens;
8218-
82198188 const int64_t n_embd_head = hparams.n_embd_head_v;
82208189 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
82218190 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8285,7 +8254,6 @@ struct llm_build_context {
82858254 if (il == n_layer - 1) {
82868255 // skip computing output for unused tokens
82878256 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8288- n_tokens = n_outputs;
82898257 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
82908258 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
82918259 }
@@ -8606,9 +8574,6 @@ struct llm_build_context {
86068574 struct ggml_cgraph * build_arctic() {
86078575 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
86088576
8609- // mutable variable, needed during the last layer of the computation to skip unused tokens
8610- int32_t n_tokens = this->n_tokens;
8611-
86128577 const int64_t n_embd_head = hparams.n_embd_head_v;
86138578 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
86148579 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8667,7 +8632,6 @@ struct llm_build_context {
86678632 if (il == n_layer - 1) {
86688633 // skip computing output for unused tokens
86698634 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8670- n_tokens = n_outputs;
86718635 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
86728636 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
86738637 }
@@ -8740,9 +8704,6 @@ struct llm_build_context {
87408704 struct ggml_cgraph * build_deepseek() {
87418705 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
87428706
8743- // mutable variable, needed during the last layer of the computation to skip unused tokens
8744- int32_t n_tokens = this->n_tokens;
8745-
87468707 const int64_t n_embd_head = hparams.n_embd_head_v;
87478708 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
87488709 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8816,7 +8777,6 @@ struct llm_build_context {
88168777 if (il == n_layer - 1) {
88178778 // skip computing output for unused tokens
88188779 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8819- n_tokens = n_outputs;
88208780 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
88218781 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
88228782 }
@@ -8897,9 +8857,6 @@ struct llm_build_context {
88978857 struct ggml_cgraph * build_deepseek2() {
88988858 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
88998859
8900- // mutable variable, needed during the last layer of the computation to skip unused tokens
8901- int32_t n_tokens = this->n_tokens;
8902-
89038860 bool is_lite = (hparams.n_layer == 27);
89048861
89058862 // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
@@ -9048,7 +9005,6 @@ struct llm_build_context {
90489005 if (il == n_layer - 1) {
90499006 // skip computing output for unused tokens
90509007 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9051- n_tokens = n_outputs;
90529008 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
90539009 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
90549010 }
@@ -9278,9 +9234,6 @@ struct llm_build_context {
92789234 struct ggml_cgraph * build_t5_enc() {
92799235 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
92809236
9281- // mutable variable, needed during the last layer of the computation to skip unused tokens
9282- int32_t n_tokens = this->n_tokens;
9283-
92849237 const int64_t n_embd_head = hparams.n_embd_head_v;
92859238 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
92869239 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9354,7 +9307,6 @@ struct llm_build_context {
93549307 if (il == n_layer - 1) {
93559308 // skip computing output for unused tokens
93569309 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9357- n_tokens = n_outputs;
93589310 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
93599311 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
93609312 }
@@ -9410,9 +9362,6 @@ struct llm_build_context {
94109362 struct ggml_cgraph * build_t5_dec() {
94119363 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
94129364
9413- // mutable variable, needed during the last layer of the computation to skip unused tokens
9414- int32_t n_tokens = this->n_tokens;
9415-
94169365 const int64_t n_embd_head = hparams.n_embd_head_v;
94179366 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
94189367 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9554,7 +9503,6 @@ struct llm_build_context {
95549503 if (il == n_layer - 1) {
95559504 // skip computing output for unused tokens
95569505 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9557- n_tokens = n_outputs;
95589506 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
95599507 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
95609508 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
@@ -9942,9 +9890,6 @@ struct llm_build_context {
99429890 struct ggml_cgraph * build_exaone() {
99439891 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
99449892
9945- // mutable variable, needed during the last layer of the computation to skip unused tokens
9946- int32_t n_tokens = this->n_tokens;
9947-
99489893 const int64_t n_embd_head = hparams.n_embd_head_v;
99499894 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
99509895 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -10018,7 +9963,6 @@ struct llm_build_context {
100189963 if (il == n_layer - 1) {
100199964 // skip computing output for unused tokens
100209965 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10021- n_tokens = n_outputs;
100229966 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
100239967 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
100249968 }
@@ -10300,9 +10244,6 @@ struct llm_build_context {
1030010244 struct ggml_cgraph * build_chameleon() {
1030110245 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1030210246
10303- // mutable variable, needed during the last layer of the computation to skip unused tokens
10304- int32_t n_tokens = this->n_tokens;
10305-
1030610247 const int64_t n_embd_head = hparams.n_embd_head_v;
1030710248 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
1030810249 GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -10399,7 +10340,6 @@ struct llm_build_context {
1039910340 if (il == n_layer - 1) {
1040010341 // skip computing output for unused tokens
1040110342 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10402- n_tokens = n_outputs;
1040310343 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
1040410344 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
1040510345 }
0 commit comments