Skip to content

Commit 8273739

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/cpu.Dockerfile # .devops/cuda.Dockerfile # .devops/intel.Dockerfile # .devops/llama-cli-cann.Dockerfile # .devops/musa.Dockerfile # .devops/rocm.Dockerfile # .devops/vulkan.Dockerfile # examples/llama-bench/llama-bench.cpp # examples/rpc/rpc-server.cpp # scripts/compare-llama-bench.py # tests/test-quantize-stats.cpp
2 parents 5d38297 + 5933e6f commit 8273739

File tree

9 files changed

+210
-98
lines changed

9 files changed

+210
-98
lines changed

common/arg.cpp

Lines changed: 83 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,25 @@ std::initializer_list<enum llama_example> mmproj_examples = {
4444
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
4545
};
4646

47+
static std::string read_file(const std::string & fname) {
48+
std::ifstream file(fname);
49+
if (!file) {
50+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
51+
}
52+
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
53+
file.close();
54+
return content;
55+
}
56+
57+
static void write_file(const std::string & fname, const std::string & content) {
58+
std::ofstream file(fname);
59+
if (!file) {
60+
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
61+
}
62+
file << content;
63+
file.close();
64+
}
65+
4766
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
4867
this->examples = std::move(examples);
4968
return *this;
@@ -201,9 +220,11 @@ struct curl_slist_ptr {
201220

202221
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
203222
int remaining_attempts = max_attempts;
223+
char * method = nullptr;
224+
curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
204225

205226
while (remaining_attempts > 0) {
206-
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
227+
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
207228

208229
CURLcode res = curl_easy_perform(curl);
209230
if (res == CURLE_OK) {
@@ -214,6 +235,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
214235
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
215236

216237
remaining_attempts--;
238+
if (remaining_attempts == 0) break;
217239
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
218240
}
219241

@@ -232,8 +254,6 @@ static bool common_download_file_single(const std::string & url, const std::stri
232254
return false;
233255
}
234256

235-
bool force_download = false;
236-
237257
// Set the URL, allow to follow http redirection
238258
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
239259
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
@@ -257,7 +277,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
257277

258278
// If the file exists, check its JSON metadata companion file.
259279
std::string metadata_path = path + ".json";
260-
nlohmann::json metadata;
280+
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
261281
std::string etag;
262282
std::string last_modified;
263283

@@ -267,7 +287,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
267287
if (metadata_in.good()) {
268288
try {
269289
metadata_in >> metadata;
270-
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
290+
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
271291
if (metadata.contains("url") && metadata.at("url").is_string()) {
272292
auto previous_url = metadata.at("url").get<std::string>();
273293
if (previous_url != url) {
@@ -297,7 +317,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
297317
};
298318

299319
common_load_model_from_url_headers headers;
320+
bool head_request_ok = false;
321+
bool should_download = !file_exists; // by default, we should download if the file does not exist
300322

323+
// get ETag to see if the remote file has changed
301324
{
302325
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
303326
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
@@ -326,23 +349,28 @@ static bool common_download_file_single(const std::string & url, const std::stri
326349
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
327350
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
328351

329-
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
352+
// we only allow retrying once for HEAD requests
353+
// this is for the use case of running offline (no internet), where retrying can be annoying
354+
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
330355
if (!was_perform_successful) {
331-
return false;
356+
head_request_ok = false;
332357
}
333358

334359
long http_code = 0;
335360
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
336-
if (http_code != 200) {
337-
// HEAD not supported, we don't know if the file has changed
338-
// force trigger downloading
339-
force_download = true;
340-
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
361+
if (http_code == 200) {
362+
head_request_ok = true;
363+
} else {
364+
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
365+
head_request_ok = false;
341366
}
342367
}
343368

344-
bool should_download = !file_exists || force_download;
345-
if (!should_download) {
369+
// if head_request_ok is false, we don't have the etag or last-modified headers
370+
// we leave should_download as-is, which is true if the file does not exist
371+
if (head_request_ok) {
372+
// check if ETag or Last-Modified headers are different
373+
// if it is, we need to download the file again
346374
if (!etag.empty() && etag != headers.etag) {
347375
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
348376
should_download = true;
@@ -351,6 +379,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
351379
should_download = true;
352380
}
353381
}
382+
354383
if (should_download) {
355384
std::string path_temporary = path + ".downloadInProgress";
356385
if (file_exists) {
@@ -425,13 +454,15 @@ static bool common_download_file_single(const std::string & url, const std::stri
425454
{"etag", headers.etag},
426455
{"lastModified", headers.last_modified}
427456
});
428-
std::ofstream(metadata_path) << metadata.dump(4);
429-
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
457+
write_file(metadata_path, metadata.dump(4));
458+
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
430459

431460
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
432461
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
433462
return false;
434463
}
464+
} else {
465+
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
435466
}
436467

437468
return true;
@@ -606,16 +637,37 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
606637
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
607638
// User-Agent header is already set in common_remote_get_content, no need to set it here
608639

640+
// we use "=" to avoid clashing with other component, while still being allowed on windows
641+
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
642+
string_replace_all(cached_response_fname, "/", "_");
643+
std::string cached_response_path = fs_get_cache_file(cached_response_fname);
644+
609645
// make the request
610646
common_remote_params params;
611647
params.headers = headers;
612-
auto res = common_remote_get_content(url, params);
613-
long res_code = res.first;
614-
std::string res_str(res.second.data(), res.second.size());
648+
long res_code = 0;
649+
std::string res_str;
650+
bool use_cache = false;
651+
try {
652+
auto res = common_remote_get_content(url, params);
653+
res_code = res.first;
654+
res_str = std::string(res.second.data(), res.second.size());
655+
} catch (const std::exception & e) {
656+
LOG_WRN("error: failed to get manifest: %s\n", e.what());
657+
LOG_WRN("try reading from cache\n");
658+
// try to read from cache
659+
try {
660+
res_str = read_file(cached_response_path);
661+
res_code = 200;
662+
use_cache = true;
663+
} catch (const std::exception & e) {
664+
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
665+
}
666+
}
615667
std::string ggufFile;
616668
std::string mmprojFile;
617669

618-
if (res_code == 200) {
670+
if (res_code == 200 || res_code == 304) {
619671
// extract ggufFile.rfilename in json, using regex
620672
{
621673
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
@@ -632,6 +684,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
632684
mmprojFile = match[1].str();
633685
}
634686
}
687+
if (!use_cache) {
688+
// if not using cached response, update the cache file
689+
write_file(cached_response_path, res_str);
690+
}
635691
} else if (res_code == 401) {
636692
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
637693
} else {
@@ -1143,6 +1199,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
11431199
fprintf(stderr, "%s\n", ex.what());
11441200
ctx_arg.params = params_org;
11451201
return false;
1202+
} catch (std::exception & ex) {
1203+
fprintf(stderr, "%s\n", ex.what());
1204+
exit(1); // for other exceptions, we exit with status code 1
11461205
}
11471206

11481207
return true;
@@ -1443,13 +1502,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14431502
{"-f", "--file"}, "FNAME",
14441503
"a file containing the prompt (default: none)",
14451504
[](common_params & params, const std::string & value) {
1446-
std::ifstream file(value);
1447-
if (!file) {
1448-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1449-
}
1505+
params.prompt = read_file(value);
14501506
// store the external file name in params
14511507
params.prompt_file = value;
1452-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
14531508
if (!params.prompt.empty() && params.prompt.back() == '\n') {
14541509
params.prompt.pop_back();
14551510
}
@@ -1459,11 +1514,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14591514
{"-sysf", "--system-prompt-file"}, "FNAME",
14601515
"a file containing the system prompt (default: none)",
14611516
[](common_params & params, const std::string & value) {
1462-
std::ifstream file(value);
1463-
if (!file) {
1464-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1465-
}
1466-
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
1517+
params.system_prompt = read_file(value);
14671518
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
14681519
params.system_prompt.pop_back();
14691520
}
@@ -1888,15 +1939,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
18881939
{"--grammar-file"}, "FNAME",
18891940
"file to read grammar from",
18901941
[](common_params & params, const std::string & value) {
1891-
std::ifstream file(value);
1892-
if (!file) {
1893-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1894-
}
1895-
std::copy(
1896-
std::istreambuf_iterator<char>(file),
1897-
std::istreambuf_iterator<char>(),
1898-
std::back_inserter(params.sampling.grammar)
1899-
);
1942+
params.sampling.grammar = read_file(value);
19001943
}
19011944
).set_sparam());
19021945
add_opt(common_arg(
@@ -2816,14 +2859,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
28162859
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
28172860
),
28182861
[](common_params & params, const std::string & value) {
2819-
std::ifstream file(value);
2820-
if (!file) {
2821-
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
2822-
}
2823-
std::copy(
2824-
std::istreambuf_iterator<char>(file),
2825-
std::istreambuf_iterator<char>(),
2826-
std::back_inserter(params.chat_template));
2862+
params.chat_template = read_file(value);
28272863
}
28282864
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
28292865
add_opt(common_arg(

examples/server/utils.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse(
642642
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
643643
}
644644

645+
// if the assistant message appears at the end of list, we do not add end-of-turn token
646+
// for ex. this can be useful to modify the reasoning process in reasoning models
647+
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
648+
common_chat_msg last_message;
649+
if (prefill_assistant_message) {
650+
last_message = inputs.messages.back();
651+
inputs.messages.pop_back();
652+
653+
/* sanity check, max one assistant message at the end of the list */
654+
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
655+
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
656+
}
657+
658+
inputs.extract_reasoning = false;
659+
inputs.add_generation_prompt = true;
660+
}
661+
645662
// Apply chat template to the list of messages
646663
auto chat_params = common_chat_templates_apply(tmpls, inputs);
647664

665+
/* Append assistant prefilled message */
666+
if (prefill_assistant_message) {
667+
chat_params.prompt += last_message.content;
668+
}
669+
648670
llama_params["chat_format"] = static_cast<int>(chat_params.format);
649671
llama_params["prompt"] = chat_params.prompt;
650672
if (!chat_params.grammar.empty()) {

0 commit comments

Comments
 (0)