
Commit 32e7b9d

llama : de-shadow (cont) [no ci]
1 parent 0127774 commit 32e7b9d
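
"De-shadowing" here means renaming (or removing) locals and parameters that reuse the name of a member or of an enclosing-scope variable, so the code compiles cleanly under shadowing diagnostics such as -Wshadow. A minimal sketch of the first pattern fixed below, using hypothetical names rather than repository code:

    #include <cstdint>

    struct node {
        int32_t value = 0;

        void set_bad(int32_t value) {   // -Wshadow: parameter shadows node::value,
            this->value = value;        // forcing the explicit this-> disambiguation
        }

        void set_good(int32_t val) {    // renamed parameter: no shadow, no this-> needed
            value = val;
        }
    };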

File tree

src/llama-vocab.cpp
src/llama.cpp

2 files changed: +34 -30 lines changed

src/llama-vocab.cpp

Lines changed: 12 additions & 5 deletions
@@ -24,25 +24,30 @@
 struct naive_trie {
     naive_trie() : has_value(false), value(0) {
     }
-    void insert(const char * key, size_t len, int32_t value = 0) {
+
+    void insert(const char * key, size_t len, int32_t val = 0) {
         if (len == 0) {
-            this->has_value = true;
-            this->value = value;
+            has_value = true;
+            value = val;
+
             return;
         }
+
         char c = key[0];
         auto res = children.find(c);
         if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, value);
+            res->second.insert(key + 1, len - 1, val);
         } else {
             auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, value);
+            res.first->second.insert(key + 1, len - 1, val);
         }
     }
+
     std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
         if (len == 0 || offset == len) {
             return std::make_pair(key, offset);
         }
+
         char c = key[offset];
         auto res = children.find(c);
         if (res != children.end()) {
@@ -51,6 +56,7 @@ struct naive_trie {

         return std::make_pair(key, offset);
     }
+
     const struct naive_trie * traverse(const char c) const {
         auto res = children.find(c);
         if (res != children.end()) {
@@ -59,6 +65,7 @@ struct naive_trie {

         return NULL;
     }
+
     std::map<char, struct naive_trie> children;
     bool has_value;
     llama_token value;
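
The src/llama.cpp changes below target a second pattern: per-layer locals such as n_head and n_head_kv that shadow the model-wide values of the same name held by the build context, renamed with an _i suffix to mark the per-layer (indexed) quantity. A minimal sketch of that pattern, again with hypothetical names:

    #include <cstdint>
    #include <vector>

    struct builder {
        int64_t n_head = 32;                         // model-wide value
        std::vector<int64_t> n_head_per_layer{32, 32, 0};

        void build() {
            for (size_t il = 0; il < n_head_per_layer.size(); ++il) {
                // BAD:  const int64_t n_head = n_head_per_layer[il];
                //       shadows builder::n_head for the rest of the loop body.
                // GOOD: the _i suffix keeps both values distinct and reachable.
                const int64_t n_head_i = n_head_per_layer[il];
                if (n_head_i == 0) {
                    // attention-free layer; builder::n_head is still usable here
                }
            }
        }
    };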

src/llama.cpp

Lines changed: 22 additions & 25 deletions
@@ -1656,10 +1656,10 @@ struct llm_build_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_i    = hparams.n_head(il);

-            if (n_head == 0) {
+            if (n_head_i == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;
             } else {
@@ -1670,11 +1670,11 @@ struct llm_build_context {
                 cb(cur, "attn_norm", il);
             }

-            if (n_head > 0 && n_head_kv == 0) {
+            if (n_head_i > 0 && n_head_kv_i == 0) {
                 // "linear attention" of Llama-3_1-Nemotron-51B
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "wo", il);
-            } else if (n_head > 0) {
+            } else if (n_head_i > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
                 struct ggml_tensor * rope_factors = build_rope_factors(il);
@@ -1702,14 +1702,14 @@ struct llm_build_context {
                }

                Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Qcur, "Qcur", il);

                Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                );
@@ -1734,7 +1734,7 @@ struct llm_build_context {

             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             struct ggml_tensor * ffn_inp = cur;
-            if (n_head > 0) {
+            if (n_head_i > 0) {
                 ffn_inp = ggml_add(ctx0, cur, inpSA);
                 cb(ffn_inp, "ffn_inp", il);
             }
@@ -2643,7 +2643,7 @@ struct llm_build_context {

         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
+            cur = inpL;

             struct ggml_tensor * Qcur;
             struct ggml_tensor * Kcur;
@@ -4717,8 +4717,6 @@ struct llm_build_context {
    struct ggml_cgraph * build_gemma() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;

@@ -4825,8 +4823,6 @@ struct llm_build_context {
    struct ggml_cgraph * build_gemma2() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;

@@ -4962,6 +4958,7 @@ struct llm_build_context {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

        const int64_t n_embd_head = hparams.n_embd_head_v;
+
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -5800,9 +5797,9 @@ struct llm_build_context {
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

        for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head = hparams.n_head(il);
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+            const int64_t n_head_i     = hparams.n_head(il);
+            const int64_t n_head_kv_i  = hparams.n_head_kv(il);
+            const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i;

            cur = inpL;
            struct ggml_tensor * residual = cur;
@@ -5818,15 +5815,15 @@ struct llm_build_context {
            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
            cb(cur, "wqkv", il);

-            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens);

-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0));
            cb(Qcur, "Qcur", il);

-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i));
            cb(Kcur, "Kcur", il);

-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i)));
            cb(Vcur, "Vcur", il);

            Qcur = llm_build_norm(ctx0, Qcur, hparams,
@@ -5851,7 +5848,7 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);

-            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens);
            cb(Qcur, "Vcur", il);

            cur = llm_build_kv(ctx0, lctx, kv_self, gf,
@@ -7495,9 +7492,9 @@ struct llm_build_context {
        // Token shift state dimensions should be 2 * n_emb
        GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);

-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs       = ubatch.n_seqs;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
+
        GGML_ASSERT(n_seqs != 0);
        GGML_ASSERT(ubatch.equal_seqs);
        GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
@@ -7608,9 +7605,9 @@ struct llm_build_context {

        GGML_ASSERT(n_embd == hparams.n_embd_k_s());

-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs       = ubatch.n_seqs;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
+
        GGML_ASSERT(n_seqs != 0);
        GGML_ASSERT(ubatch.equal_seqs);
        GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
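
The last two hunks take a third route: instead of renaming, the shadowing local n_tokens is deleted outright, so the asserts that follow read the n_tokens already provided by the enclosing build context. A sketch of the same idea under assumed names:

    #include <cassert>
    #include <cstdint>

    struct ubatch_t { int64_t n_seqs, n_seq_tokens, n_tokens; };

    struct build_ctx {
        int64_t n_tokens = 8;   // context-wide token count

        void check(const ubatch_t & ubatch) {
            const int64_t n_seqs       = ubatch.n_seqs;
            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
            // removed: const int64_t n_tokens = ubatch.n_tokens;  // shadowed the member
            assert(n_tokens == n_seq_tokens * n_seqs);  // uses build_ctx::n_tokens
        }
    };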

0 commit comments