Commit 0127774 (1 parent: 0bebe45)

llama : remove unused mutable n_tokens [no ci]
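
Context for the change: each affected build_* function opened by shadowing the member n_tokens with a mutable local, and the only later write to it was `n_tokens = n_outputs;` on the last layer. Nothing read the local after that store, because the actual skipping of unused tokens is done by gathering rows with ggml_get_rows via inp_out_ids, so both lines were dead code. A minimal self-contained sketch of the removed pattern (build_context_sketch and its members are hypothetical stand-ins, for illustration only):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for llm_build_context members; illustration only.
    struct build_context_sketch {
        int32_t n_tokens  = 512; // tokens in the current batch
        int32_t n_outputs = 1;   // tokens whose logits are actually needed
        int     n_layer   = 32;

        void build() {
            // Pattern deleted by this commit: a mutable local shadowing the member,
            //     int32_t n_tokens = this->n_tokens;
            // whose only later use was a dead store on the last layer,
            //     n_tokens = n_outputs;
            // Nothing read the local afterwards, so both lines could be removed.
            for (int il = 0; il < n_layer; ++il) {
                if (il == n_layer - 1) {
                    // the real token skipping is ggml_get_rows(ctx0, cur, inp_out_ids),
                    // which never consults the local n_tokens
                }
            }
        }
    };

    int main() {
        build_context_sketch ctx;
        ctx.build();
        std::printf("member n_tokens unchanged: %d\n", ctx.n_tokens);
        return 0;
    }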

1 file changed, 0 additions(+), 60 deletions(-)

src/llama.cpp

Lines changed: 0 additions & 60 deletions
@@ -1476,9 +1476,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -1553,7 +1550,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -1642,9 +1638,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deci() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -1730,7 +1723,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -2141,9 +2133,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_grok() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -2218,7 +2207,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -2300,9 +2288,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_dbrx() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -2370,7 +2355,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -3553,9 +3537,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -3620,7 +3601,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5440,9 +5420,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmo() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5513,7 +5490,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5564,9 +5540,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5637,7 +5610,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5692,9 +5664,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_olmoe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5764,7 +5733,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6085,9 +6053,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_arctic() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6146,7 +6111,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6219,9 +6183,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deepseek() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6295,7 +6256,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6376,9 +6336,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         bool is_lite = (hparams.n_layer == 27);
 
         // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
@@ -6527,7 +6484,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6757,9 +6713,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_t5_enc() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6833,7 +6786,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -6889,9 +6841,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_t5_dec() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7033,7 +6982,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
@@ -7421,9 +7369,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_exaone() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -7497,7 +7442,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -7779,9 +7723,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_chameleon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -7878,7 +7819,6 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
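
For reference, the mechanism that actually skips unused tokens is untouched by this commit: on the last layer, ggml_get_rows gathers only the rows listed in inp_out_ids, shrinking the activations from n_tokens rows to n_outputs rows before the output head. A rough sketch of that gather semantics in plain C++ (get_rows here is a hypothetical stand-in, not the real ggml API):

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for ggml_get_rows on a 2-D tensor: gather the rows
    // listed in ids out of an (n_rows x n_cols) row-major activation matrix.
    static std::vector<float> get_rows(const std::vector<float> & src,
                                       int64_t n_cols,
                                       const std::vector<int32_t> & ids) {
        std::vector<float> dst(ids.size() * n_cols);
        for (size_t i = 0; i < ids.size(); ++i) {
            for (int64_t c = 0; c < n_cols; ++c) {
                dst[i * n_cols + c] = src[ids[i] * n_cols + c];
            }
        }
        return dst;
    }

    int main() {
        const int64_t n_embd = 4, n_tokens = 8;
        std::vector<float> cur(n_tokens * n_embd, 1.0f); // activations for all tokens
        std::vector<int32_t> inp_out_ids = {7};          // e.g. only the last token is needed
        std::vector<float> out = get_rows(cur, n_embd, inp_out_ids);
        // out now has n_outputs (= 1) rows; no separate n_tokens bookkeeping required
        return out.size() == static_cast<size_t>(n_embd) ? 0 : 1;
    }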
