Skip to content

Commit 62dc170

Browse files
committed
merge master
2 parents c656d92 + 9177484 commit 62dc170

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

60 files changed

+4691
-959
lines changed

.github/workflows/build.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ jobs:
317317
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
318318
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
319319
sudo apt-get update -y
320-
sudo apt-get install -y build-essential vulkan-sdk
320+
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
321321
322322
- name: Build
323323
id: cmake_build
@@ -327,6 +327,12 @@ jobs:
327327
cmake -DGGML_VULKAN=ON ..
328328
cmake --build . --config Release -j $(nproc)
329329
330+
- name: Test
331+
id: cmake_test
332+
run: |
333+
cd build
334+
ctest -L main --verbose --timeout 900
335+
330336
ubuntu-22-cmake-hip:
331337
runs-on: ubuntu-22.04
332338
container: rocm/dev-ubuntu-22.04:6.0.2

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
221221
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
222222
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
223223
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
224-
| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
224+
| [HIP](docs/build.md#hip) | AMD GPU |
225225
| [Vulkan](docs/build.md#vulkan) | GPU |
226226
| [CANN](docs/build.md#cann) | Ascend NPU |
227227

@@ -414,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
414414
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
415415
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
416416

417-
## [`llama-bench`](example/bench)
417+
## [`llama-bench`](examples/llama-bench)
418418

419419
#### Benchmark the performance of the inference for various parameters.
420420

common/arg.cpp

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -119,29 +119,33 @@ std::string common_arg::to_string() {
119119
// utils
120120
//
121121

122-
static void common_params_handle_model_default(common_params & params) {
123-
if (!params.hf_repo.empty()) {
122+
static void common_params_handle_model_default(
123+
std::string & model,
124+
std::string & model_url,
125+
std::string & hf_repo,
126+
std::string & hf_file) {
127+
if (!hf_repo.empty()) {
124128
// short-hand to avoid specifying --hf-file -> default it to --model
125-
if (params.hf_file.empty()) {
126-
if (params.model.empty()) {
129+
if (hf_file.empty()) {
130+
if (model.empty()) {
127131
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
128132
}
129-
params.hf_file = params.model;
130-
} else if (params.model.empty()) {
133+
hf_file = model;
134+
} else if (model.empty()) {
131135
// this is to avoid different repo having same file name, or same file name in different subdirs
132-
std::string filename = params.hf_repo + "_" + params.hf_file;
136+
std::string filename = hf_repo + "_" + hf_file;
133137
// to make sure we don't have any slashes in the filename
134138
string_replace_all(filename, "/", "_");
135-
params.model = fs_get_cache_file(filename);
139+
model = fs_get_cache_file(filename);
136140
}
137-
} else if (!params.model_url.empty()) {
138-
if (params.model.empty()) {
139-
auto f = string_split<std::string>(params.model_url, '#').front();
141+
} else if (!model_url.empty()) {
142+
if (model.empty()) {
143+
auto f = string_split<std::string>(model_url, '#').front();
140144
f = string_split<std::string>(f, '?').front();
141-
params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
145+
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
142146
}
143-
} else if (params.model.empty()) {
144-
params.model = DEFAULT_MODEL_PATH;
147+
} else if (model.empty()) {
148+
model = DEFAULT_MODEL_PATH;
145149
}
146150
}
147151

@@ -276,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
276280
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
277281
}
278282

279-
common_params_handle_model_default(params);
283+
// TODO: refactor model params in a common struct
284+
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
285+
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
280286

281287
if (params.escape) {
282288
string_process_escapes(params.prompt);
@@ -842,7 +848,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
842848
}
843849
).set_sparam());
844850
add_opt(common_arg(
845-
{"--sampling-seq"}, "SEQUENCE",
851+
{"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
846852
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
847853
[](common_params & params, const std::string & value) {
848854
params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -855,13 +861,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
855861
params.sampling.ignore_eos = true;
856862
}
857863
).set_sparam());
858-
add_opt(common_arg(
859-
{"--penalize-nl"},
860-
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
861-
[](common_params & params) {
862-
params.sampling.penalize_nl = true;
863-
}
864-
).set_sparam());
865864
add_opt(common_arg(
866865
{"--temp"}, "N",
867866
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -916,6 +915,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
916915
{"--repeat-last-n"}, "N",
917916
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
918917
[](common_params & params, int value) {
918+
if (value < -1) {
919+
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
920+
}
919921
params.sampling.penalty_last_n = value;
920922
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
921923
}
@@ -970,6 +972,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
970972
{"--dry-penalty-last-n"}, "N",
971973
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
972974
[](common_params & params, int value) {
975+
if (value < -1) {
976+
throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
977+
}
973978
params.sampling.dry_penalty_last_n = value;
974979
}
975980
).set_sparam());
@@ -1582,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15821587
params.hf_file = value;
15831588
}
15841589
).set_env("LLAMA_ARG_HF_FILE"));
1590+
add_opt(common_arg(
1591+
{"-hfrv", "--hf-repo-v"}, "REPO",
1592+
"Hugging Face model repository for the vocoder model (default: unused)",
1593+
[](common_params & params, const std::string & value) {
1594+
params.vocoder.hf_repo = value;
1595+
}
1596+
).set_env("LLAMA_ARG_HF_REPO_V"));
1597+
add_opt(common_arg(
1598+
{"-hffv", "--hf-file-v"}, "FILE",
1599+
"Hugging Face model file for the vocoder model (default: unused)",
1600+
[](common_params & params, const std::string & value) {
1601+
params.vocoder.hf_file = value;
1602+
}
1603+
).set_env("LLAMA_ARG_HF_FILE_V"));
15851604
add_opt(common_arg(
15861605
{"-hft", "--hf-token"}, "TOKEN",
15871606
"Hugging Face access token (default: value from HF_TOKEN environment variable)",
@@ -2179,5 +2198,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
21792198
}
21802199
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
21812200

2201+
add_opt(common_arg(
2202+
{"-mv", "--model-vocoder"}, "FNAME",
2203+
"vocoder model for audio generation (default: unused)",
2204+
[](common_params & params, const std::string & value) {
2205+
params.vocoder.model = value;
2206+
}
2207+
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
2208+
21822209
return ctx_arg;
21832210
}

common/common.cpp

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
940940
params.sampling.ignore_eos = false;
941941
}
942942

943+
if (params.sampling.ignore_eos) {
944+
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
945+
if (llama_token_is_eog(model, i)) {
946+
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
947+
params.sampling.logit_bias.push_back({i, -INFINITY});
948+
}
949+
}
950+
}
951+
952+
if (params.sampling.penalty_last_n == -1) {
953+
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
954+
params.sampling.penalty_last_n = llama_n_ctx(lctx);
955+
}
956+
957+
if (params.sampling.dry_penalty_last_n == -1) {
958+
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
959+
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
960+
}
961+
943962
if (params.warmup) {
944963
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
945964

@@ -1076,7 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
10761095
#define CURL_MAX_RETRY 3
10771096
#define CURL_RETRY_DELAY_SECONDS 2
10781097

1079-
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
1098+
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
10801099
int remaining_attempts = max_attempts;
10811100

10821101
while (remaining_attempts > 0) {
@@ -1100,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
11001119
}
11011120

11021121
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
1103-
11041122
// Initialize libcurl
11051123
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
11061124
if (!curl) {
@@ -1173,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
11731191
std::string etag;
11741192
std::string last_modified;
11751193
};
1194+
11761195
common_load_model_from_url_headers headers;
1196+
11771197
{
11781198
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
11791199
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
1180-
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
1200+
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
11811201

11821202
static std::regex header_regex("([^:]+): (.*)\r\n");
11831203
static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1761,7 +1781,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
17611781
break;
17621782
case 0: // max absolute
17631783
for (int i = 0; i < n; i++) {
1764-
if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
1784+
if (sum < std::abs(inp[i])) {
1785+
sum = std::abs(inp[i]);
1786+
}
17651787
}
17661788
sum /= 32760.0; // make an int16 range
17671789
break;

common/common.h

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ enum llama_example {
8080
LLAMA_EXAMPLE_LLAVA,
8181
LLAMA_EXAMPLE_LOOKUP,
8282
LLAMA_EXAMPLE_PARALLEL,
83+
LLAMA_EXAMPLE_TTS,
8384

8485
LLAMA_EXAMPLE_COUNT,
8586
};
@@ -95,6 +96,7 @@ enum common_sampler_type {
9596
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
9697
COMMON_SAMPLER_TYPE_XTC = 8,
9798
COMMON_SAMPLER_TYPE_INFILL = 9,
99+
COMMON_SAMPLER_TYPE_PENALTIES = 10,
98100
};
99101

100102
// dimensionality reduction methods, used by cvector-generator
@@ -130,7 +132,6 @@ struct common_params_sampling {
130132
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
131133
float mirostat_tau = 5.00f; // target entropy
132134
float mirostat_eta = 0.10f; // learning rate
133-
bool penalize_nl = false; // consider newlines as a repeatable token
134135
bool ignore_eos = false;
135136
bool no_perf = false; // disable performance metrics
136137
bool timing_per_token = false;
@@ -139,6 +140,7 @@ struct common_params_sampling {
139140

140141

141142
std::vector<enum common_sampler_type> samplers = {
143+
COMMON_SAMPLER_TYPE_PENALTIES,
142144
COMMON_SAMPLER_TYPE_DRY,
143145
COMMON_SAMPLER_TYPE_TOP_K,
144146
COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -158,6 +160,7 @@ struct common_params_sampling {
158160

159161
struct common_params_speculative {
160162
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
163+
161164
int32_t n_ctx = 0; // draft context size
162165
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
163166
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -171,6 +174,14 @@ struct common_params_speculative {
171174
std::string model = ""; // draft model for speculative decoding // NOLINT
172175
};
173176

177+
struct common_params_vocoder {
178+
std::string hf_repo = ""; // HF repo // NOLINT
179+
std::string hf_file = ""; // HF file // NOLINT
180+
181+
std::string model = ""; // model path // NOLINT
182+
std::string model_url = ""; // model url to download // NOLINT
183+
};
184+
174185
struct common_params {
175186
int32_t n_predict = -1; // new tokens to predict
176187
int32_t n_ctx = 4096; // context size
@@ -193,11 +204,13 @@ struct common_params {
193204
float defrag_thold = 0.1f; // KV cache defragmentation threshold
194205

195206
// offload params
196-
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
197-
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
198-
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
199-
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
200-
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
207+
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
208+
209+
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
210+
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
211+
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
212+
213+
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
201214

202215
struct cpu_params cpuparams;
203216
struct cpu_params cpuparams_batch;
@@ -211,8 +224,9 @@ struct common_params {
211224
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
212225
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
213226

214-
struct common_params_sampling sampling;
227+
struct common_params_sampling sampling;
215228
struct common_params_speculative speculative;
229+
struct common_params_vocoder vocoder;
216230

217231
std::string model = ""; // model path // NOLINT
218232
std::string model_alias = ""; // model alias // NOLINT
@@ -593,7 +607,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
593607
// Embedding utils
594608
//
595609

596-
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
610+
// TODO: repace embd_norm with an enum
611+
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
597612

598613
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
599614

0 commit comments

Comments
 (0)