Skip to content

Commit ee486ba

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # README.md # examples/CMakeLists.txt # examples/batched/batched.cpp # examples/gritlm/gritlm.cpp # examples/llama.android/llama/build.gradle.kts # examples/main/README.md # examples/retrieval/retrieval.cpp # examples/server/CMakeLists.txt # examples/server/README.md # ggml/CMakeLists.txt # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml.c # scripts/compare-commits.sh # scripts/sync-ggml.last # tests/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-chat-template.cpp # tests/test-sampling.cpp
2 parents 6343604 + 9177484 commit ee486ba

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+20502
-13156
lines changed

common/arg.cpp

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -120,29 +120,33 @@ std::string common_arg::to_string() {
120120
// utils
121121
//
122122

123-
static void common_params_handle_model_default(common_params & params) {
124-
if (!params.hf_repo.empty()) {
123+
static void common_params_handle_model_default(
124+
std::string & model,
125+
std::string & model_url,
126+
std::string & hf_repo,
127+
std::string & hf_file) {
128+
if (!hf_repo.empty()) {
125129
// short-hand to avoid specifying --hf-file -> default it to --model
126-
if (params.hf_file.empty()) {
127-
if (params.model.empty()) {
130+
if (hf_file.empty()) {
131+
if (model.empty()) {
128132
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
129133
}
130-
params.hf_file = params.model;
131-
} else if (params.model.empty()) {
134+
hf_file = model;
135+
} else if (model.empty()) {
132136
// this is to avoid different repo having same file name, or same file name in different subdirs
133-
std::string filename = params.hf_repo + "_" + params.hf_file;
137+
std::string filename = hf_repo + "_" + hf_file;
134138
// to make sure we don't have any slashes in the filename
135139
string_replace_all(filename, "/", "_");
136-
params.model = fs_get_cache_file(filename);
140+
model = fs_get_cache_file(filename);
137141
}
138-
} else if (!params.model_url.empty()) {
139-
if (params.model.empty()) {
140-
auto f = string_split<std::string>(params.model_url, '#').front();
142+
} else if (!model_url.empty()) {
143+
if (model.empty()) {
144+
auto f = string_split<std::string>(model_url, '#').front();
141145
f = string_split<std::string>(f, '?').front();
142-
params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
146+
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
143147
}
144-
} else if (params.model.empty()) {
145-
params.model = DEFAULT_MODEL_PATH;
148+
} else if (model.empty()) {
149+
model = DEFAULT_MODEL_PATH;
146150
}
147151
}
148152

@@ -277,7 +281,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
277281
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
278282
}
279283

280-
common_params_handle_model_default(params);
284+
// TODO: refactor model params in a common struct
285+
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
286+
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
281287

282288
if (params.escape) {
283289
string_process_escapes(params.prompt);
@@ -843,7 +849,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
843849
}
844850
).set_sparam());
845851
add_opt(common_arg(
846-
{"--sampling-seq"}, "SEQUENCE",
852+
{"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
847853
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
848854
[](common_params & params, const std::string & value) {
849855
params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -856,13 +862,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
856862
params.sampling.ignore_eos = true;
857863
}
858864
).set_sparam());
859-
add_opt(common_arg(
860-
{"--penalize-nl"},
861-
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
862-
[](common_params & params) {
863-
params.sampling.penalize_nl = true;
864-
}
865-
).set_sparam());
866865
add_opt(common_arg(
867866
{"--temp"}, "N",
868867
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -917,6 +916,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
917916
{"--repeat-last-n"}, "N",
918917
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
919918
[](common_params & params, int value) {
919+
if (value < -1) {
920+
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
921+
}
920922
params.sampling.penalty_last_n = value;
921923
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
922924
}
@@ -971,6 +973,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
971973
{"--dry-penalty-last-n"}, "N",
972974
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
973975
[](common_params & params, int value) {
976+
if (value < -1) {
977+
throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
978+
}
974979
params.sampling.dry_penalty_last_n = value;
975980
}
976981
).set_sparam());
@@ -1583,6 +1588,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15831588
params.hf_file = value;
15841589
}
15851590
).set_env("LLAMA_ARG_HF_FILE"));
1591+
add_opt(common_arg(
1592+
{"-hfrv", "--hf-repo-v"}, "REPO",
1593+
"Hugging Face model repository for the vocoder model (default: unused)",
1594+
[](common_params & params, const std::string & value) {
1595+
params.vocoder.hf_repo = value;
1596+
}
1597+
).set_env("LLAMA_ARG_HF_REPO_V"));
1598+
add_opt(common_arg(
1599+
{"-hffv", "--hf-file-v"}, "FILE",
1600+
"Hugging Face model file for the vocoder model (default: unused)",
1601+
[](common_params & params, const std::string & value) {
1602+
params.vocoder.hf_file = value;
1603+
}
1604+
).set_env("LLAMA_ARG_HF_FILE_V"));
15861605
add_opt(common_arg(
15871606
{"-hft", "--hf-token"}, "TOKEN",
15881607
"Hugging Face access token (default: value from HF_TOKEN environment variable)",
@@ -2180,5 +2199,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
21802199
}
21812200
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
21822201

2202+
add_opt(common_arg(
2203+
{"-mv", "--model-vocoder"}, "FNAME",
2204+
"vocoder model for audio generation (default: unused)",
2205+
[](common_params & params, const std::string & value) {
2206+
params.vocoder.model = value;
2207+
}
2208+
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
2209+
21832210
return ctx_arg;
21842211
}

common/common.cpp

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,25 @@ struct common_init_result common_init_from_params(common_params & params) {
942942
params.sampling.ignore_eos = false;
943943
}
944944

945+
if (params.sampling.ignore_eos) {
946+
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
947+
if (llama_token_is_eog(model, i)) {
948+
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
949+
params.sampling.logit_bias.push_back({i, -INFINITY});
950+
}
951+
}
952+
}
953+
954+
if (params.sampling.penalty_last_n == -1) {
955+
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
956+
params.sampling.penalty_last_n = llama_n_ctx(lctx);
957+
}
958+
959+
if (params.sampling.dry_penalty_last_n == -1) {
960+
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
961+
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
962+
}
963+
945964
if (params.warmup) {
946965
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
947966

@@ -1078,7 +1097,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
10781097
#define CURL_MAX_RETRY 3
10791098
#define CURL_RETRY_DELAY_SECONDS 2
10801099

1081-
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
1100+
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
10821101
int remaining_attempts = max_attempts;
10831102

10841103
while (remaining_attempts > 0) {
@@ -1102,7 +1121,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
11021121
}
11031122

11041123
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
1105-
11061124
// Initialize libcurl
11071125
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
11081126
if (!curl) {
@@ -1175,11 +1193,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
11751193
std::string etag;
11761194
std::string last_modified;
11771195
};
1196+
11781197
common_load_model_from_url_headers headers;
1198+
11791199
{
11801200
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
11811201
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
1182-
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
1202+
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
11831203

11841204
static std::regex header_regex("([^:]+): (.*)\r\n");
11851205
static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1763,7 +1783,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
17631783
break;
17641784
case 0: // max absolute
17651785
for (int i = 0; i < n; i++) {
1766-
if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
1786+
if (sum < std::abs(inp[i])) {
1787+
sum = std::abs(inp[i]);
1788+
}
17671789
}
17681790
sum /= 32760.0; // make an int16 range
17691791
break;

common/common.h

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ enum llama_example {
7676
LLAMA_EXAMPLE_LLAVA,
7777
LLAMA_EXAMPLE_LOOKUP,
7878
LLAMA_EXAMPLE_PARALLEL,
79+
LLAMA_EXAMPLE_TTS,
7980

8081
LLAMA_EXAMPLE_COUNT,
8182
};
@@ -91,6 +92,7 @@ enum common_sampler_type {
9192
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
9293
COMMON_SAMPLER_TYPE_XTC = 8,
9394
COMMON_SAMPLER_TYPE_INFILL = 9,
95+
COMMON_SAMPLER_TYPE_PENALTIES = 10,
9496
};
9597

9698
// dimensionality reduction methods, used by cvector-generator
@@ -126,7 +128,6 @@ struct common_params_sampling {
126128
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
127129
float mirostat_tau = 5.00f; // target entropy
128130
float mirostat_eta = 0.10f; // learning rate
129-
bool penalize_nl = false; // consider newlines as a repeatable token
130131
bool ignore_eos = false;
131132
bool no_perf = false; // disable performance metrics
132133
bool timing_per_token = false;
@@ -135,6 +136,7 @@ struct common_params_sampling {
135136

136137

137138
std::vector<enum common_sampler_type> samplers = {
139+
COMMON_SAMPLER_TYPE_PENALTIES,
138140
COMMON_SAMPLER_TYPE_DRY,
139141
COMMON_SAMPLER_TYPE_TOP_K,
140142
COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -154,6 +156,7 @@ struct common_params_sampling {
154156

155157
struct common_params_speculative {
156158
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
159+
157160
int32_t n_ctx = 0; // draft context size
158161
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
159162
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -167,6 +170,14 @@ struct common_params_speculative {
167170
std::string model = ""; // draft model for speculative decoding // NOLINT
168171
};
169172

173+
struct common_params_vocoder {
174+
std::string hf_repo = ""; // HF repo // NOLINT
175+
std::string hf_file = ""; // HF file // NOLINT
176+
177+
std::string model = ""; // model path // NOLINT
178+
std::string model_url = ""; // model url to download // NOLINT
179+
};
180+
170181
struct common_params {
171182
int32_t n_predict = -1; // new tokens to predict
172183
int32_t n_ctx = 4096; // context size
@@ -189,11 +200,13 @@ struct common_params {
189200
float defrag_thold = 0.1f; // KV cache defragmentation threshold
190201

191202
// offload params
192-
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
193-
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
194-
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
195-
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
196-
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
203+
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
204+
205+
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
206+
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
207+
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
208+
209+
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
197210

198211
struct cpu_params cpuparams;
199212
struct cpu_params cpuparams_batch;
@@ -207,8 +220,9 @@ struct common_params {
207220
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
208221
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
209222

210-
struct common_params_sampling sampling;
223+
struct common_params_sampling sampling;
211224
struct common_params_speculative speculative;
225+
struct common_params_vocoder vocoder;
212226

213227
std::string model = ""; // model path // NOLINT
214228
std::string model_alias = ""; // model alias // NOLINT
@@ -589,7 +603,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
589603
// Embedding utils
590604
//
591605

592-
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
606+
// TODO: replace embd_norm with an enum
607+
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
593608

594609
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
595610

common/sampling.cpp

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -161,32 +161,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
161161
params.logit_bias.size(),
162162
params.logit_bias.data()));
163163

164-
llama_sampler_chain_add(result->chain,
165-
llama_sampler_init_penalties(
166-
llama_n_vocab (model),
167-
llama_token_eos(model),
168-
llama_token_nl (model),
169-
params.penalty_last_n,
170-
params.penalty_repeat,
171-
params.penalty_freq,
172-
params.penalty_present,
173-
params.penalize_nl,
174-
params.ignore_eos));
175-
176164
if (params.mirostat == 0) {
177165
for (const auto & cnstr : params.samplers) {
178166
switch (cnstr) {
179-
case COMMON_SAMPLER_TYPE_DRY:
167+
case COMMON_SAMPLER_TYPE_DRY:
180168
{
181-
std::vector<const char*> c_breakers;
169+
std::vector<const char *> c_breakers;
182170
c_breakers.reserve(params.dry_sequence_breakers.size());
183-
for (const auto& str : params.dry_sequence_breakers) {
171+
for (const auto & str : params.dry_sequence_breakers) {
184172
c_breakers.push_back(str.c_str());
185173
}
186174

187175
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
188176
}
189-
break;
177+
break;
190178
case COMMON_SAMPLER_TYPE_TOP_K:
191179
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
192180
break;
@@ -208,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
208196
case COMMON_SAMPLER_TYPE_INFILL:
209197
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
210198
break;
199+
case COMMON_SAMPLER_TYPE_PENALTIES:
200+
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
201+
break;
211202
default:
212203
GGML_ASSERT(false && "unknown sampler type");
213204
}
@@ -415,6 +406,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
415406
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
416407
case COMMON_SAMPLER_TYPE_XTC: return 'x';
417408
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
409+
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
418410
default : return '?';
419411
}
420412
}
@@ -429,6 +421,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
429421
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
430422
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
431423
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
424+
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
432425
default : return "";
433426
}
434427
}
@@ -443,6 +436,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
443436
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
444437
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
445438
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
439+
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
446440
};
447441

448442
// since samplers names are written multiple ways
@@ -489,6 +483,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
489483
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
490484
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
491485
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
486+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
492487
};
493488

494489
std::vector<common_sampler_type> samplers;

0 commit comments

Comments
 (0)