@@ -1656,10 +1656,10 @@ struct llm_build_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head    = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_i    = hparams.n_head(il);

-            if (n_head == 0) {
+            if (n_head_i == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;
             } else {
@@ -1670,11 +1670,11 @@ struct llm_build_context {
                 cb(cur, "attn_norm", il);
             }

-            if (n_head > 0 && n_head_kv == 0) {
+            if (n_head_i > 0 && n_head_kv_i == 0) {
                 // "linear attention" of Llama-3_1-Nemotron-51B
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "wo", il);
-            } else if (n_head > 0) {
+            } else if (n_head_i > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
                 struct ggml_tensor * rope_factors = build_rope_factors(il);
@@ -1702,14 +1702,14 @@ struct llm_build_context {
                 }

                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -1734,7 +1734,7 @@ struct llm_build_context {

             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             struct ggml_tensor * ffn_inp = cur;
-            if (n_head > 0) {
+            if (n_head_i > 0) {
                 ffn_inp = ggml_add(ctx0, cur, inpSA);
                 cb(ffn_inp, "ffn_inp", il);
             }
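Taken together, these first hunks rename the per-layer head counts so they no longer shadow the model-wide n_head / n_head_kv values held by the surrounding build context, and the renamed counts drive a three-way dispatch for Llama-3_1-Nemotron-51B: a layer with no heads passes the input straight through, a layer with heads but no KV heads only applies the wo projection ("linear attention"), and any other layer runs regular self-attention. The following is a minimal standalone sketch of that dispatch, not llama.cpp code; classify_layer and the sample head counts are hypothetical stand-ins for hparams.n_head(il) / hparams.n_head_kv(il).

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical classification mirroring the branch structure in the hunks above.
enum class layer_kind { attention_free, linear_attention, self_attention };

static layer_kind classify_layer(int64_t n_head_i, int64_t n_head_kv_i) {
    if (n_head_i == 0) {
        return layer_kind::attention_free;    // cur = inpL; no attention block at all
    }
    if (n_head_kv_i == 0) {
        return layer_kind::linear_attention;  // only the wo projection is applied
    }
    return layer_kind::self_attention;        // full QKV + RoPE + KV-cache path
}

int main() {
    // Made-up per-layer head counts; the real values come from the model hparams.
    const std::vector<std::pair<int64_t, int64_t>> heads = {{8, 2}, {0, 0}, {8, 0}};
    for (size_t il = 0; il < heads.size(); ++il) {
        const char * kind = "self-attention";
        switch (classify_layer(heads[il].first, heads[il].second)) {
            case layer_kind::attention_free:   kind = "attention-free";   break;
            case layer_kind::linear_attention: kind = "linear attention"; break;
            case layer_kind::self_attention:                              break;
        }
        std::printf("layer %zu: %s\n", il, kind);
    }
    return 0;
}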
@@ -2643,7 +2643,7 @@ struct llm_build_context {

         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
+            cur = inpL;

             struct ggml_tensor * Qcur;
             struct ggml_tensor * Kcur;
@@ -4717,8 +4717,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4825,8 +4823,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4962,6 +4958,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -5800,9 +5797,9 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head    = hparams.n_head(il);
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+            const int64_t n_head_i    = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i;

             cur = inpL;
             struct ggml_tensor * residual = cur;
@@ -5818,15 +5815,15 @@ struct llm_build_context {
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

-            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens);

-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0));
             cb(Qcur, "Qcur", il);

-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i));
             cb(Kcur, "Kcur", il);

-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i)));
             cb(Vcur, "Vcur", il);

             Qcur = llm_build_norm(ctx0, Qcur, hparams,
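For orientation, this hunk slices one fused QKV activation of shape [n_embd_head_k, n_head_qkv_i, n_tokens] into Q, K and V views by head-row offset: Q starts at row 0, K at row n_head_i, and V at row n_head_i + n_head_kv_i, which is where the byte offsets 0, nb[1]*n_head_i and nb[1]*(n_head_i+n_head_kv_i) come from. Below is a minimal standalone sketch of that offset arithmetic on a plain float buffer with made-up sizes; it is not llama.cpp / ggml code.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Made-up per-layer sizes; in the hunk above these come from hparams.
    const int64_t n_embd_head  = 64;
    const int64_t n_head_i     = 12;
    const int64_t n_head_kv_i  = 3;
    const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i; // fused Q+K+V head count
    const int64_t n_tokens     = 2;

    // Fused QKV activations laid out as [n_embd_head, n_head_qkv_i, n_tokens],
    // innermost dimension contiguous (the layout the views above slice into).
    std::vector<float> qkv(n_embd_head * n_head_qkv_i * n_tokens, 0.0f);

    const int64_t nb1 = n_embd_head;                // elements per head row
    const int64_t nb2 = n_embd_head * n_head_qkv_i; // elements per token

    // Element offsets of the Q, K and V slices inside one token's fused block;
    // they mirror the byte offsets 0, nb[1]*n_head_i and nb[1]*(n_head_i+n_head_kv_i).
    const int64_t q_off = 0;
    const int64_t k_off = nb1 * n_head_i;
    const int64_t v_off = nb1 * (n_head_i + n_head_kv_i);

    std::printf("per-token stride: %lld, Q at %lld, K at %lld, V at %lld\n",
                (long long) nb2, (long long) q_off, (long long) k_off, (long long) v_off);

    // e.g. the first element of V for token t lives at qkv[t*nb2 + v_off]
    (void) qkv;
    return 0;
}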
@@ -5851,7 +5848,7 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens);
             cb(Qcur, "Vcur", il);

             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
@@ -7495,9 +7492,9 @@ struct llm_build_context {
         // Token shift state dimensions should be 2 * n_emb
         GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);

-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs       = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
+
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
@@ -7608,9 +7605,9 @@ struct llm_build_context {

         GGML_ASSERT(n_embd == hparams.n_embd_k_s());

-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs       = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
+
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
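The last two hunks, like the earlier cur and n_embd_head_k removals, follow one pattern: locals that shadow or merely duplicate values the surrounding build context already provides are dropped, so the asserts now read the outer n_tokens directly and the relation n_tokens == n_seq_tokens * n_seqs still holds. Below is a minimal illustration of the kind of shadowing being removed, not llama.cpp code; the types and sizes are made up.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for ubatch and the build context; only the shadowing
// pattern is the point here.
struct ubatch_like {
    int64_t n_tokens     = 8;
    int64_t n_seq_tokens = 4;
    int64_t n_seqs       = 2;
};

struct builder_like {
    ubatch_like ubatch;
    int64_t     n_tokens = ubatch.n_tokens;   // outer value, like llm_build_context::n_tokens

    void build() const {
        const int64_t n_seqs       = ubatch.n_seqs;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        // const int64_t n_tokens = ubatch.n_tokens;  // <- the redundant local the diff
        //                                            //    deletes: it shadows the member
        //                                            //    (flagged by e.g. GCC's -Wshadow)
        std::printf("%lld tokens == %lld per-seq tokens * %lld seqs\n",
                    (long long) n_tokens, (long long) n_seq_tokens, (long long) n_seqs);
    }
};

int main() {
    builder_like b;
    b.build();
    return 0;
}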