Commit 63485bd

Merge branch 'master' into qkv

2 parents: 36874c0 + c8c07d6

39 files changed: +9203 / -4878 lines

CMakeLists.txt

Lines changed: 14 additions & 0 deletions

```diff
@@ -441,6 +441,7 @@ endif()
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 
+
     if (BUILD_SHARED_LIBS)
         set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
     endif()
@@ -485,6 +486,19 @@ if (LLAMA_CCACHE)
     endif ()
 endif()
 
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE_DEFAULT ON)
+endif()
+
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+    set(GGML_CUDA_GRAPHS_DEFAULT ON)
+endif()
+
 # this version of Apple ld64 is buggy
 execute_process(
     COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
```
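These guards only adjust defaults: passing a value explicitly at configure time (for example `cmake -B build -DGGML_LLAMAFILE=OFF`) defines the variable, so the corresponding `NOT DEFINED` block is skipped and the user's choice wins.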

common/arg.cpp

Lines changed: 3 additions & 3 deletions

```diff
@@ -1098,7 +1098,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
-        {"--attention"}, "{causal,non,causal}",
+        {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
@@ -1696,7 +1696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
         {"--embd-normalize"}, "N",
-        string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
@@ -1710,7 +1710,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--embd-separator"}, "STRING",
-        "separator of embendings (default \\n) for example \"<#sep#>\"",
+        "separator of embeddings (default \\n) for example \"<#sep#>\"",
         [](common_params & params, const std::string & value) {
             params.embd_sep = value;
         }
```

common/common.cpp

Lines changed: 4 additions & 4 deletions

```diff
@@ -957,7 +957,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -966,7 +966,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
@@ -1040,7 +1040,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
         return GGML_TYPE_Q6_0;
     }
 
-    throw std::runtime_error("Invalid cache type: " + s);
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
@@ -1052,7 +1052,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
-                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
```
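The recurring change across this commit is the slimmed-down `llama_batch_get_one()`: the trailing start-position and sequence-id arguments are gone, and the context now handles that bookkeeping itself. A minimal sketch of the updated call pattern, assuming an already-initialized `llama_context`; the `eval_prompt` helper name is illustrative, not part of the API:

```cpp
#include "llama.h"

#include <cstdio>
#include <vector>

// Evaluate a tokenized prompt with the two-argument llama_batch_get_one().
// Before this commit the call carried two extra arguments for the start
// position and sequence id, e.g. llama_batch_get_one(data, n, 0, 0).
static bool eval_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}
```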

common/common.h

Lines changed: 2 additions & 2 deletions

```diff
@@ -270,9 +270,9 @@ struct common_params {
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of embendings
+    std::string embd_sep = "\n"; // separator of embeddings
     bool reranking = false; // enable reranking support on server
 
     // server params
```

common/sampling.cpp

Lines changed: 37 additions & 51 deletions

```diff
@@ -171,60 +171,46 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.penalize_nl,
                 params.ignore_eos));
 
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+    if (params.mirostat == 0) {
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TFS_Z:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
             }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
         }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
     } else {
-        if (params.n_probs > 0) {
-            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
-            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
-            //
-            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
-            // it is much faster, since we avoid sorting all tokens and should give a good approximation
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        GGML_ASSERT(false && "unknown mirostat version");
     }
 
     return result;
```
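The net effect: the outer `params.temp > 0.0f` split is removed, the explicit softmax stage before `dist` is dropped (the `dist` sampler normalizes the candidates internally), and the separate greedy fallback goes away; the chain now always terminates in a single `dist` sampler that draws the token. A minimal sketch of the resulting chain shape for the `mirostat == 0` path; the top-k and temperature values are illustrative only:

```cpp
#include "llama.h"

// Sketch: the simplified sampler chain after this change. No softmax or
// greedy stage is appended; dist normalizes and samples in one step.
static llama_sampler * make_chain(uint32_t seed) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));   // illustrative value
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));  // temp <= 0 selects greedily
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));  // terminal: draws the token
    return chain;
}
```

Since the temperature sampler already degenerates to greedy selection when `temp <= 0`, a dedicated greedy branch is presumably redundant, which is what lets the whole `else` arm disappear.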

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -2864,6 +2864,9 @@ def set_vocab(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
```

convert_lora_to_gguf.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -348,6 +348,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 if ".base_layer.weight" in name:
                     continue
                 logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+                    logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+                    logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
                 sys.exit(1)
 
             if base_name in tensor_map:
```

examples/batched-bench/batched-bench.cpp

Lines changed: 0 additions & 1 deletion

```diff
@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
-            0, 0, 0, // unused
         };
 
         const int ret = llama_decode(ctx, batch_view);
```
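With the three placeholder fields gone, the aggregate initializer again matches `llama_batch` member-for-member. A sketch of the full view construction as it stands after this change; the leading fields (`n_tokens` through `pos`) are taken from the struct definition in `llama.h` and are an assumption here, since only the tail appears in the hunk above:

```cpp
// Sketch: a llama_batch view over tokens [i, i + n_tokens) of an existing
// batch. batch, i and n_tokens are assumed to come from the surrounding
// loop in batched-bench.cpp.
llama_batch batch_view = {
    n_tokens,
    batch.token    + i,
    nullptr,            // embd: unused when token ids are provided
    batch.pos      + i,
    batch.n_seq_id + i,
    batch.seq_id   + i,
    batch.logits   + i,
};
```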

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
     }
```

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
```
