@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
3737
3838// build info
3939extern int LLAMA_BUILD_NUMBER;
40- extern char const * LLAMA_COMMIT;
41- extern char const * LLAMA_COMPILER;
42- extern char const * LLAMA_BUILD_TARGET;
40+ extern const char * LLAMA_COMMIT;
41+ extern const char * LLAMA_COMPILER;
42+ extern const char * LLAMA_BUILD_TARGET;
4343
4444struct common_control_vector_load_info ;
4545
@@ -95,6 +95,7 @@ enum common_sampler_type {
9595 COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
9696 COMMON_SAMPLER_TYPE_XTC = 8,
9797 COMMON_SAMPLER_TYPE_INFILL = 9,
98+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
9899};
99100
100101// dimensionality reduction methods, used by cvector-generator
@@ -130,7 +131,6 @@ struct common_params_sampling {
130131 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
131132 float mirostat_tau = 5.00f; // target entropy
132133 float mirostat_eta = 0.10f; // learning rate
133- bool penalize_nl = false ; // consider newlines as a repeatable token
134134 bool ignore_eos = false ;
135135 bool no_perf = false ; // disable performance metrics
136136 bool timing_per_token = false ;
@@ -139,6 +139,7 @@ struct common_params_sampling {
139139
140140
141141 std::vector<enum common_sampler_type> samplers = {
142+ COMMON_SAMPLER_TYPE_PENALTIES,
142143 COMMON_SAMPLER_TYPE_DRY,
143144 COMMON_SAMPLER_TYPE_TOP_K,
144145 COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -193,11 +194,13 @@ struct common_params {
193194 float defrag_thold = 0.1f; // KV cache defragmentation threshold
194195
195196 // offload params
196- std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
197- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
198- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
199- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
200- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
197+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
198+
199+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
200+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
201+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
202+
203+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
201204
202205 struct cpu_params cpuparams;
203206 struct cpu_params cpuparams_batch;
@@ -437,6 +440,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
437440 return parts;
438441}
439442
443+ static bool string_starts_with(const std::string & str,
444+ const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
445+ return str.rfind(prefix, 0) == 0;
446+ }
447+
440448bool string_parse_kv_override (const char * data, std::vector<llama_model_kv_override> & overrides);
441449void string_process_escapes (std::string & input);
442450
@@ -588,7 +596,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
588596// Embedding utils
589597//
590598
591- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
599+ // TODO: replace embd_norm with an enum
600+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
592601
593602float common_embd_similarity_cos (const float * embd1, const float * embd2, int n);
594603
0 commit comments