
Commit 02be875

llama : remove unused mutable n_tokens [no ci]
1 parent 7447ad3 commit 02be875

1 file changed: +0 -60 lines changed


src/llama.cpp

Lines changed: 0 additions & 60 deletions
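All 15 removed hunks are the same change: each graph builder declared a mutable local copy of n_tokens and overwrote it with n_outputs on the last layer, but nothing read the copy after ggml_get_rows(ctx0, ..., inp_out_ids) narrowed the tensors, so both the declaration and the assignment are dead (4 lines per builder, 60 lines total). A minimal, self-contained sketch of the pattern, with hypothetical stand-in names rather than the real llama.cpp/ggml API:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for the pattern removed in every llm_build_context
    // builder (hypothetical types and names; not the real llama.cpp code).
    struct build_context_sketch {
        int32_t n_tokens  = 32;  // tokens in the batch
        int32_t n_outputs = 1;   // tokens whose outputs are actually needed
        int32_t n_layer   = 2;

        std::vector<float> build() const {
            // Removed by the commit: a mutable shadowing copy of n_tokens,
            //     int32_t n_tokens = this->n_tokens;
            std::vector<float> cur(n_tokens, 1.0f);
            for (int32_t il = 0; il < n_layer; ++il) {
                if (il == n_layer - 1) {
                    // Removed by the commit: n_tokens = n_outputs;  (dead store)
                    // Kept: the narrowing itself -- in llama.cpp this is
                    //     cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                    cur.resize(n_outputs);
                }
            }
            return cur;  // only the rows for the n_outputs tokens remain
        }
    };

    int main() {
        const auto out = build_context_sketch{}.build();
        std::printf("rows kept: %zu\n", out.size());
    }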
@@ -3997,9 +3997,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -4074,7 +4071,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -4163,9 +4159,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deci() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -4251,7 +4244,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -4662,9 +4654,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_grok() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -4739,7 +4728,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -4821,9 +4809,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_dbrx() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4891,7 +4876,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6074,9 +6058,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6141,7 +6122,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -7961,9 +7941,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmo() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8034,7 +8011,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8085,9 +8061,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8158,7 +8131,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8213,9 +8185,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmoe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8285,7 +8254,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8606,9 +8574,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_arctic() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8667,7 +8632,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8740,9 +8704,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deepseek() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -8816,7 +8777,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8897,9 +8857,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         bool is_lite = (hparams.n_layer == 27);
 
         // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
@@ -9048,7 +9005,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -9278,9 +9234,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_t5_enc() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9354,7 +9307,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -9410,9 +9362,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_t5_dec() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9554,7 +9503,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
@@ -9942,9 +9890,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_exaone() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -10018,7 +9963,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -10300,9 +10244,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_chameleon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -10399,7 +10340,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

Comments (0)