@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
3737
3838// build info
3939extern int LLAMA_BUILD_NUMBER;
40- extern char  const * LLAMA_COMMIT;
41- extern char  const * LLAMA_COMPILER;
42- extern char  const * LLAMA_BUILD_TARGET;
40+ extern const char  * LLAMA_COMMIT;
41+ extern const char  * LLAMA_COMPILER;
42+ extern const char  * LLAMA_BUILD_TARGET;
4343
4444struct common_control_vector_load_info;
4545
@@ -95,6 +95,7 @@ enum common_sampler_type {
9595    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
9696    COMMON_SAMPLER_TYPE_XTC         = 8,
9797    COMMON_SAMPLER_TYPE_INFILL      = 9,
98+     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
9899};
99100
100101// dimensionality reduction methods, used by cvector-generator
@@ -130,7 +131,6 @@ struct common_params_sampling {
130131    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
131132    float   mirostat_tau       = 5.00f; // target entropy
132133    float   mirostat_eta       = 0.10f; // learning rate
133-     bool    penalize_nl        = false; // consider newlines as a repeatable token
134134    bool    ignore_eos         = false;
135135    bool    no_perf            = false; // disable performance metrics
136136    bool    timing_per_token   = false;
@@ -140,6 +140,7 @@ struct common_params_sampling {
140140
141141
142142    std::vector<enum common_sampler_type> samplers = {
143+         COMMON_SAMPLER_TYPE_PENALTIES,
143144        COMMON_SAMPLER_TYPE_DRY,
144145        COMMON_SAMPLER_TYPE_TOP_K,
145146        COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -194,11 +195,13 @@ struct common_params {
194195    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
195196
196197    // offload params
197-     std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
198-     int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
199-     int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
200-     float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
201-     enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
198+     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
199+ 
200+     int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
201+     int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
202+     float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
203+ 
204+     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
202205
203206    struct cpu_params cpuparams;
204207    struct cpu_params cpuparams_batch;
@@ -438,6 +441,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
438441    return parts;
439442}
440443
// Prefix test for std::string. Stand-in until C++20's std::string::starts_with is available.
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
static bool string_starts_with(const std::string & str,
                               const std::string & prefix) {
    // compare the leading prefix.size() characters of str against prefix;
    // if str is shorter, the counted compare is nonzero and we return false
    return str.compare(0, prefix.size(), prefix) == 0;
}
448+ 
441449bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
442450void string_process_escapes(std::string & input);
443451
@@ -589,7 +597,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
589597// Embedding utils
590598//
591599
592- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
600+ // TODO: replace embd_norm with an enum
601+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
593602
594603float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
595604
0 commit comments