Skip to content

Commit d6e5b75

Browse files
authored
Merge branch 'ggerganov:master' into master
2 parents 69ceca8 + cc2983d commit d6e5b75

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+3659
-1365
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
9393
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
9494
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
9595
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
96+
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
9697

9798
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
9899

@@ -122,6 +123,7 @@ Typically finetunes of the base models below are supported as well.
122123
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
123124
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
124125
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
126+
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
125127
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
126128
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
127129
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
@@ -172,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
172174
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
173175
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
174176
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
177+
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
175178

176179
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
177180

@@ -187,6 +190,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
187190

188191
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
189192
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
193+
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
190194

191195
**Games:**
192196
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

ci/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
5353
exit 1
5454
fi
5555

56-
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
56+
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
5757
fi
5858

5959
if [ ! -z ${GG_BUILD_VULKAN} ]; then

common/arg.cpp

Lines changed: 69 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,13 +128,13 @@ static void common_params_handle_model_default(common_params & params) {
128128
}
129129
params.hf_file = params.model;
130130
} else if (params.model.empty()) {
131-
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
131+
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
132132
}
133133
} else if (!params.model_url.empty()) {
134134
if (params.model.empty()) {
135-
auto f = string_split(params.model_url, '#').front();
136-
f = string_split(f, '?').front();
137-
params.model = fs_get_cache_file(string_split(f, '/').back());
135+
auto f = string_split<std::string>(params.model_url, '#').front();
136+
f = string_split<std::string>(f, '?').front();
137+
params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
138138
}
139139
} else if (params.model.empty()) {
140140
params.model = DEFAULT_MODEL_PATH;
@@ -251,6 +251,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
251251
for (auto & antiprompt : params.antiprompt) {
252252
string_process_escapes(antiprompt);
253253
}
254+
for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
255+
string_process_escapes(seq_breaker);
256+
}
254257
}
255258

256259
if (!params.kv_overrides.empty()) {
@@ -879,7 +882,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
879882
{"--samplers"}, "SAMPLERS",
880883
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
881884
[](common_params & params, const std::string & value) {
882-
const auto sampler_names = string_split(value, ';');
885+
const auto sampler_names = string_split<std::string>(value, ';');
883886
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
884887
}
885888
).set_sparam());
@@ -997,6 +1000,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
9971000
params.sparams.penalty_freq = std::stof(value);
9981001
}
9991002
).set_sparam());
1003+
add_opt(common_arg(
1004+
{"--dry-multiplier"}, "N",
1005+
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
1006+
[](common_params & params, const std::string & value) {
1007+
params.sparams.dry_multiplier = std::stof(value);
1008+
}
1009+
).set_sparam());
1010+
add_opt(common_arg(
1011+
{"--dry-base"}, "N",
1012+
string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
1013+
[](common_params & params, const std::string & value) {
1014+
float potential_base = std::stof(value);
1015+
if (potential_base >= 1.0f)
1016+
{
1017+
params.sparams.dry_base = potential_base;
1018+
}
1019+
}
1020+
).set_sparam());
1021+
add_opt(common_arg(
1022+
{"--dry-allowed-length"}, "N",
1023+
string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
1024+
[](common_params & params, int value) {
1025+
params.sparams.dry_allowed_length = value;
1026+
}
1027+
).set_sparam());
1028+
add_opt(common_arg(
1029+
{"--dry-penalty-last-n"}, "N",
1030+
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
1031+
[](common_params & params, int value) {
1032+
params.sparams.dry_penalty_last_n = value;
1033+
}
1034+
).set_sparam());
1035+
add_opt(common_arg(
1036+
{"--dry-sequence-breaker"}, "STRING",
1037+
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
1038+
params.sparams.dry_sequence_breakers.empty() ? "none" :
1039+
std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
1040+
params.sparams.dry_sequence_breakers.end(),
1041+
std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
1042+
[](const std::string& a, const std::string& b) {
1043+
std::string formatted_b = (b == "\n") ? "\\n" : b;
1044+
return a + ", '" + formatted_b + "'";
1045+
}).c_str()),
1046+
[](common_params & params, const std::string & value) {
1047+
static bool defaults_cleared = false;
1048+
1049+
if (!defaults_cleared) {
1050+
params.sparams.dry_sequence_breakers.clear();
1051+
defaults_cleared = true;
1052+
}
1053+
1054+
if (value == "none") {
1055+
params.sparams.dry_sequence_breakers.clear();
1056+
} else {
1057+
params.sparams.dry_sequence_breakers.emplace_back(value);
1058+
}
1059+
}
1060+
).set_sparam());
10001061
add_opt(common_arg(
10011062
{"--dynatemp-range"}, "N",
10021063
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
@@ -1097,7 +1158,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
10971158
}
10981159
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
10991160
add_opt(common_arg(
1100-
{"--attention"}, "{causal,non,causal}",
1161+
{"--attention"}, "{causal,non-causal}",
11011162
"attention type for embeddings, use model default if unspecified",
11021163
[](common_params & params, const std::string & value) {
11031164
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
@@ -1695,7 +1756,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
16951756
).set_examples({LLAMA_EXAMPLE_BENCH}));
16961757
add_opt(common_arg(
16971758
{"--embd-normalize"}, "N",
1698-
string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
1759+
string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
16991760
[](common_params & params, int value) {
17001761
params.embd_normalize = value;
17011762
}
@@ -1709,7 +1770,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
17091770
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
17101771
add_opt(common_arg(
17111772
{"--embd-separator"}, "STRING",
1712-
"separator of embendings (default \\n) for example \"<#sep#>\"",
1773+
"separator of embeddings (default \\n) for example \"<#sep#>\"",
17131774
[](common_params & params, const std::string & value) {
17141775
params.embd_sep = value;
17151776
}

common/common.cpp

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -416,19 +416,6 @@ std::string string_format(const char * fmt, ...) {
416416
return std::string(buf.data(), size);
417417
}
418418

419-
std::vector<std::string> string_split(std::string input, char separator) {
420-
std::vector<std::string> parts;
421-
size_t separator_pos = input.find(separator);
422-
while (separator_pos != std::string::npos) {
423-
std::string part = input.substr(0, separator_pos);
424-
parts.emplace_back(part);
425-
input = input.substr(separator_pos + 1);
426-
separator_pos = input.find(separator);
427-
}
428-
parts.emplace_back(input);
429-
return parts;
430-
}
431-
432419
std::string string_strip(const std::string & str) {
433420
size_t start = 0;
434421
size_t end = str.size();
@@ -1035,7 +1022,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
10351022
return GGML_TYPE_Q5_1;
10361023
}
10371024

1038-
throw std::runtime_error("Invalid cache type: " + s);
1025+
throw std::runtime_error("Unsupported cache type: " + s);
10391026
}
10401027

10411028
struct llama_context_params common_context_params_to_llama(const common_params & params) {
@@ -1047,7 +1034,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
10471034
cparams.n_ubatch = params.n_ubatch;
10481035
cparams.n_threads = params.cpuparams.n_threads;
10491036
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1050-
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1037+
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
10511038
cparams.logits_all = params.logits_all;
10521039
cparams.embeddings = params.embedding;
10531040
cparams.rope_scaling_type = params.rope_scaling_type;
@@ -2019,6 +2006,10 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
20192006
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
20202007
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
20212008
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
2009+
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
2010+
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
2011+
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
2012+
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
20222013
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
20232014
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
20242015
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);

common/common.h

Lines changed: 58 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,15 @@ enum llama_example {
8484

8585
enum common_sampler_type {
8686
COMMON_SAMPLER_TYPE_NONE = 0,
87-
COMMON_SAMPLER_TYPE_TOP_K = 1,
88-
COMMON_SAMPLER_TYPE_TOP_P = 2,
89-
COMMON_SAMPLER_TYPE_MIN_P = 3,
90-
COMMON_SAMPLER_TYPE_TFS_Z = 4,
91-
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
92-
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
93-
COMMON_SAMPLER_TYPE_XTC = 7,
94-
COMMON_SAMPLER_TYPE_INFILL = 8,
87+
COMMON_SAMPLER_TYPE_DRY = 1,
88+
COMMON_SAMPLER_TYPE_TOP_K = 2,
89+
COMMON_SAMPLER_TYPE_TOP_P = 3,
90+
COMMON_SAMPLER_TYPE_MIN_P = 4,
91+
COMMON_SAMPLER_TYPE_TFS_Z = 5,
92+
COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
93+
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
94+
COMMON_SAMPLER_TYPE_XTC = 8,
95+
COMMON_SAMPLER_TYPE_INFILL = 9,
9596
};
9697

9798
// dimensionality reduction methods, used by cvector-generator
@@ -104,32 +105,39 @@ enum dimre_method {
104105
struct common_sampler_params {
105106
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
106107

107-
int32_t n_prev = 64; // number of previous tokens to remember
108-
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
109-
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
110-
int32_t top_k = 40; // <= 0 to use vocab size
111-
float top_p = 0.95f; // 1.0 = disabled
112-
float min_p = 0.05f; // 0.0 = disabled
113-
float xtc_probability = 0.00f; // 0.0 = disabled
114-
float xtc_threshold = 0.10f; // > 0.5 disables XTC
115-
float tfs_z = 1.00f; // 1.0 = disabled
116-
float typ_p = 1.00f; // typical_p, 1.0 = disabled
117-
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
118-
float dynatemp_range = 0.00f; // 0.0 = disabled
119-
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
120-
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
121-
float penalty_repeat = 1.00f; // 1.0 = disabled
122-
float penalty_freq = 0.00f; // 0.0 = disabled
123-
float penalty_present = 0.00f; // 0.0 = disabled
124-
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
125-
float mirostat_tau = 5.00f; // target entropy
126-
float mirostat_eta = 0.10f; // learning rate
127-
bool penalize_nl = false; // consider newlines as a repeatable token
128-
bool ignore_eos = false;
129-
bool no_perf = false; // disable performance metrics
108+
int32_t n_prev = 64; // number of previous tokens to remember
109+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
110+
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
111+
int32_t top_k = 40; // <= 0 to use vocab size
112+
float top_p = 0.95f; // 1.0 = disabled
113+
float min_p = 0.05f; // 0.0 = disabled
114+
float xtc_probability = 0.00f; // 0.0 = disabled
115+
float xtc_threshold = 0.10f; // > 0.5 disables XTC
116+
float tfs_z = 1.00f; // 1.0 = disabled
117+
float typ_p = 1.00f; // typical_p, 1.0 = disabled
118+
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
119+
float dynatemp_range = 0.00f; // 0.0 = disabled
120+
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
121+
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
122+
float penalty_repeat = 1.00f; // 1.0 = disabled
123+
float penalty_freq = 0.00f; // 0.0 = disabled
124+
float penalty_present = 0.00f; // 0.0 = disabled
125+
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
126+
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
127+
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
128+
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
129+
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
130+
float mirostat_tau = 5.00f; // target entropy
131+
float mirostat_eta = 0.10f; // learning rate
132+
bool penalize_nl = false; // consider newlines as a repeatable token
133+
bool ignore_eos = false;
134+
bool no_perf = false; // disable performance metrics
135+
136+
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
130137

131138

132139
std::vector<enum common_sampler_type> samplers = {
140+
COMMON_SAMPLER_TYPE_DRY,
133141
COMMON_SAMPLER_TYPE_TOP_K,
134142
COMMON_SAMPLER_TYPE_TFS_Z,
135143
COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -274,9 +282,9 @@ struct common_params {
274282

275283
// embedding
276284
bool embedding = false; // get only sentence embedding
277-
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
285+
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
278286
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
279-
std::string embd_sep = "\n"; // separator of embendings
287+
std::string embd_sep = "\n"; // separator of embeddings
280288
bool reranking = false; // enable reranking support on server
281289

282290
// server params
@@ -380,15 +388,14 @@ bool set_process_priority(enum ggml_sched_priority prio);
380388
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
381389
std::string string_format(const char * fmt, ...);
382390

383-
std::vector<std::string> string_split(std::string input, char separator);
384-
385391
std::string string_strip(const std::string & str);
386392
std::string string_get_sortable_timestamp();
387393

388394
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
389395

390396
template<class T>
391397
static std::vector<T> string_split(const std::string & str, char delim) {
398+
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
392399
std::vector<T> values;
393400
std::istringstream str_stream(str);
394401
std::string token;
@@ -401,6 +408,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
401408
return values;
402409
}
403410

411+
template<>
412+
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
413+
{
414+
std::vector<std::string> parts;
415+
size_t begin_pos = 0;
416+
size_t separator_pos = input.find(separator);
417+
while (separator_pos != std::string::npos) {
418+
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
419+
parts.emplace_back(part);
420+
begin_pos = separator_pos + 1;
421+
separator_pos = input.find(separator, begin_pos);
422+
}
423+
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
424+
return parts;
425+
}
426+
404427
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
405428
void string_process_escapes(std::string & input);
406429

0 commit comments

Comments (0)