@@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1102,7 +1102,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(llama_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1121,77 +1121,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(llama_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(llama_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
     add_opt(llama_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(llama_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](gpt_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(llama_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(llama_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(llama_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
         [](gpt_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(llama_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](gpt_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
     add_opt(llama_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1205,23 +1205,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.no_kv_offload = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(llama_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_k = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(llama_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
         [](gpt_params & params, const std::string & value) {
             // TODO: get the type right here
             params.cache_type_v = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(llama_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
@@ -1355,22 +1355,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_RPC"));
 #endif
     add_opt(llama_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
         [](gpt_params & params) {
             params.use_mlock = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(llama_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
         [](gpt_params & params) {
             params.use_mmap = false;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1385,7 +1385,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_NUMA"));
     add_opt(llama_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1433,7 +1433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(llama_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
@@ -1460,7 +1460,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
@@ -1470,7 +1470,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(llama_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -1533,7 +1533,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1741,7 +1741,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.public_path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(llama_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1779,22 +1779,22 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
     add_opt(llama_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(llama_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
         [](gpt_params & params, int value) {
             params.timeout_read = value;
             params.timeout_write = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(llama_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
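
The diff itself does not show what `set_env()` does; it presumably just tags each option with an environment-variable name so the same setting can be picked up from the environment when the flag is absent. Below is a minimal standalone sketch of that flag-or-env-var fallback pattern, assuming a simplified `arg_option` struct and `resolve()` helper that are illustrative stand-ins, not the real `llama_arg` API; the actual precedence between flag and variable in llama.cpp's parser is not visible in these hunks.

```cpp
// Illustrative sketch only -- not the llama_arg implementation from the diff.
#include <cstdio>
#include <cstdlib>
#include <string>

struct arg_option {
    std::string flag;   // command-line flag, e.g. "--rope-freq-base"
    std::string env;    // optional env var name, e.g. "LLAMA_ARG_ROPE_FREQ_BASE"
    std::string value;  // resolved textual value, if any
    bool        found = false;
};

// Resolve one option: an explicit command-line flag wins; otherwise the
// registered environment variable (if any) supplies the value.
static void resolve(arg_option & opt, int argc, char ** argv) {
    for (int i = 1; i + 1 < argc; i++) {
        if (opt.flag == argv[i]) {
            opt.value = argv[i + 1];
            opt.found = true;
            return;
        }
    }
    if (!opt.env.empty()) {
        if (const char * v = std::getenv(opt.env.c_str())) {
            opt.value = v;
            opt.found = true;
        }
    }
}

int main(int argc, char ** argv) {
    arg_option rope_base;
    rope_base.flag = "--rope-freq-base";
    rope_base.env  = "LLAMA_ARG_ROPE_FREQ_BASE";  // pairing mirrors the diff above

    resolve(rope_base, argc, argv);
    if (rope_base.found) {
        std::printf("rope-freq-base = %f\n", std::stof(rope_base.value));
    } else {
        std::printf("rope-freq-base not set\n");
    }
    return 0;
}
```

With this sketch, `LLAMA_ARG_ROPE_FREQ_BASE=10000 ./demo` and `./demo --rope-freq-base 10000` resolve to the same value, the explicit flag taking priority when both are present.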