@@ -83,6 +83,41 @@ char const *LLAMA_BUILD_TARGET = "unknown";
 
 using json = nlohmann::ordered_json;
 
+//
+// Environment variable utils
+//
+
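+// the get_env overloads below are dispatched via SFINAE on the target type;
+// when the variable is unset, every overload leaves `target` unchanged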
+template <typename T>
+static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::string(value) : target;
+}
+
+template <typename T>
+static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stoi(value) : target;
+}
+
+template <typename T>
+static typename std::enable_if<std::is_floating_point<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stof(value) : target;
+}
+
+template <typename T>
+static typename std::enable_if<std::is_same<T, bool>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    if (value) {
+        std::string val(value);
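+        // only "1" and "true" are treated as true; any other value yields false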
+        target = val == "1" || val == "true";
+    }
+}
+
 //
 // CPU utils
 //
@@ -116,8 +151,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    // TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
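+    // first call with a null buffer only queries the required buffer size;
+    // failing with ERROR_INSUFFICIENT_BUFFER is the expected outcome here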
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
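+    // the buffer holds variable-length records; advance by info->Size each step
+    // and count one entry per physical core (GroupCount is 1 on typical systems)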
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char *>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -200,12 +261,6 @@ int32_t cpu_get_num_math() {
 // CLI argument parsing
 //
 
-void gpt_params_handle_hf_token(gpt_params & params) {
-    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
-        params.hf_token = std::getenv("HF_TOKEN");
-    }
-}
-
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -253,7 +308,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 
     gpt_params_handle_model_default(params);
 
-    gpt_params_handle_hf_token(params);
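+    // a token given via --hf-token takes precedence over the HF_TOKEN environment variable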
+    if (params.hf_token.empty()) {
+        get_env("HF_TOKEN", params.hf_token);
+    }
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -273,6 +330,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+void gpt_params_parse_from_env(gpt_params & params) {
+    // we only care about server-related params for now
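+    // the matching get_env overload is selected from each field's type (string, integral, float, or bool),
+    // e.g. (hypothetical invocation): LLAMA_ARG_MODEL=model.gguf LLAMA_ARG_N_GPU_LAYERS=99 ./llama-server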
+    get_env("LLAMA_ARG_MODEL", params.model);
+    get_env("LLAMA_ARG_THREADS", params.n_threads);
+    get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
+    get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
+    get_env("LLAMA_ARG_BATCH", params.n_batch);
+    get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
+    get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
+    get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
+    get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
+    get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
+    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
+    get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
+    get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
+    get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
+    get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     const auto params_org = params; // the example can modify the default params
 
@@ -690,14 +766,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char * lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
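+    // load adapters into memory but defer activation until requested later (see POST /lora-adapters)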
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -821,7 +907,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1660,6 +1746,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
16601746 " https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
16611747 options.push_back ({ " server" , " -sps, --slot-prompt-similarity SIMILARITY" ,
16621748 " how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n " , params.slot_prompt_similarity });
1749+ options.push_back ({ " server" , " --lora-init-without-apply" , " load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)" , params.lora_init_without_apply ? " enabled" : " disabled" });
16631750
16641751#ifndef LOG_DISABLE_LOGS
16651752 options.push_back ({ " logging" });
@@ -1722,7 +1809,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
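+    // unlike std::thread::hardware_concurrency(), this counts logical processors across all processor groups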
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
 
     return os.str();
 }
@@ -1772,6 +1865,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
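+        // resume searching after the inserted text so a 'replace' that contains 'search' cannot loop forever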
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -2045,8 +2149,8 @@ std::string fs_get_cache_file(const std::string & filename) {
20452149//
20462150// Model utils
20472151//
2048-
2049- std::tuple< struct llama_model *, struct llama_context *> llama_init_from_gpt_params (gpt_params & params) {
2152+ struct llama_init_result llama_init_from_gpt_params (gpt_params & params) {
2153+ llama_init_result iparams;
20502154 auto mparams = llama_model_params_from_gpt_params (params);
20512155
20522156 llama_model * model = nullptr ;
@@ -2061,7 +2165,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2070,7 +2174,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2081,7 +2185,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2093,21 +2197,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
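+    // activate the loaded adapters now unless application is deferred via --lora-init-without-apply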
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2135,13 +2244,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
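+        // guard the warmup decode: encoder-only architectures provide no decoder to run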
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model   = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
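+        // a scale of 0 keeps the adapter loaded but leaves it detached from the context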
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -2668,12 +2790,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
@@ -3166,19 +3282,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);