
Commit 7b20310

feat: sync llama.cpp (#75)
* feat: sync llama.cpp
* fix: migrate from api changes
* feat: sync llama.cpp
* feat: sync llama.cpp
1 parent 471d186 commit 7b20310

24 files changed: +2422 -1138 lines changed

android/src/main/jni.cpp

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ Java_com_rnllama_LlamaContext_initContext(
 
     const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
     if (lora_chars != nullptr && lora_chars[0] != '\0') {
-        defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
+        defaultParams.lora_adapters.push_back({lora_chars, lora_scaled});
         defaultParams.use_mmap = false;
     }
 
cpp/common.cpp

Lines changed: 158 additions & 43 deletions
@@ -83,6 +83,41 @@ char const *LLAMA_BUILD_TARGET = "unknown";
 
 using json = nlohmann::ordered_json;
 
+//
+// Environment variable utils
+//
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::string(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stoi(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_floating_point<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stof(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, bool>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    if (value) {
+        std::string val(value);
+        target = val == "1" || val == "true";
+    }
+}
+
 //
 // CPU utils
 //
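Note: the get_env overloads added above are file-static helpers in common.cpp, so the sketch below only illustrates how the overload set resolves for different target types; the variable names are illustrative and not part of this diff.

    std::string model_path = "default.gguf"; // each target keeps its value if the variable is unset
    int32_t     n_threads  = 4;
    bool        flash_attn = false;

    get_env("LLAMA_ARG_MODEL",      model_path); // std::string overload: copies the raw value
    get_env("LLAMA_ARG_THREADS",    n_threads);  // integral overload: parsed with std::stoi
    get_env("LLAMA_ARG_FLASH_ATTN", flash_attn); // bool overload: "1" or "true" enables it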
@@ -116,8 +151,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -200,12 +261,6 @@ int32_t cpu_get_num_math() {
 // CLI argument parsing
 //
 
-void gpt_params_handle_hf_token(gpt_params & params) {
-    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
-        params.hf_token = std::getenv("HF_TOKEN");
-    }
-}
-
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -253,7 +308,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 
     gpt_params_handle_model_default(params);
 
-    gpt_params_handle_hf_token(params);
+    if (params.hf_token.empty()) {
+        get_env("HF_TOKEN", params.hf_token);
+    }
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -273,6 +330,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+void gpt_params_parse_from_env(gpt_params & params) {
+    // we only care about server-related params for now
+    get_env("LLAMA_ARG_MODEL", params.model);
+    get_env("LLAMA_ARG_THREADS", params.n_threads);
+    get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
+    get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
+    get_env("LLAMA_ARG_BATCH", params.n_batch);
+    get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
+    get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
+    get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
+    get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
+    get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
+    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
+    get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
+    get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
+    get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
+    get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     const auto params_org = params; // the example can modify the default params
 
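Note: a hypothetical caller sketch for the new gpt_params_parse_from_env; the caller and the CLI-vs-environment precedence implied by this call order are assumptions, not part of this diff.

    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    // pick up server-related settings such as LLAMA_ARG_MODEL and LLAMA_ARG_CTX_SIZE
    gpt_params_parse_from_env(params);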
@@ -690,14 +766,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -821,7 +907,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1660,6 +1746,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -1722,7 +1809,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
 
     return os.str();
 }
@@ -1772,6 +1865,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
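Note: a minimal usage sketch for the new string_replace_all helper; the example strings are illustrative.

    std::string prompt = "Hello, {name}! Goodbye, {name}.";
    string_replace_all(prompt, "{name}", "world");
    // prompt == "Hello, world! Goodbye, world."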
@@ -2045,8 +2149,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2061,7 +2165,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2070,7 +2174,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2081,7 +2185,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2093,21 +2197,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2135,13 +2244,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
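Note: since llama_init_from_gpt_params now returns a llama_init_result instead of a tuple, existing callers need a small migration. The sketch below assumes a former caller that unpacked the tuple; that caller code is not part of this diff.

    // before (assumed): std::tie(model, ctx) = llama_init_from_gpt_params(params);
    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        return 1; // on failure both pointers remain null in the returned struct
    }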
@@ -2668,12 +2790,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
@@ -3166,19 +3282,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, " - %s\n", la.path.c_str());
         }
-        fprintf(stream, " - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
