@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
977977 for (auto & seq_breaker : params.sampling .dry_sequence_breakers ) {
978978 string_process_escapes (seq_breaker);
979979 }
980+ for (auto & pair : params.speculative .replacements ) {
981+ string_process_escapes (pair.first );
982+ string_process_escapes (pair.second );
983+ }
980984 }
981985
982986 if (!params.kv_overrides .empty ()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20912095 params.no_kv_offload = true ;
20922096 }
20932097 ).set_env (" LLAMA_ARG_NO_KV_OFFLOAD" ));
2098+ add_opt (common_arg (
2099+ {" -nr" , " --no-repack" },
2100+ " disable weight repacking" ,
2101+ [](common_params & params) {
2102+ params.no_extra_bufts = true ;
2103+ }
2104+ ).set_env (" LLAMA_ARG_NO_REPACK" ));
20942105 add_opt (common_arg (
20952106 {" -ctk" , " --cache-type-k" }, " TYPE" ,
20962107 string_format (
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23692380 }
23702381 }
23712382 ));
2383+ add_opt (common_arg (
2384+ {" --cpu-moe" },
2385+ " use CPU for Mixture of Experts (MoE) weights" ,
2386+ [](common_params & params) {
2387+ params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2388+ params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2389+ params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
2390+ }
2391+ ).set_env (" LLAMA_ARG_CPU_MOE" ));
23722392 add_opt (common_arg (
23732393 {" -ngl" , " --gpu-layers" , " --n-gpu-layers" }, " N" ,
23742394 " number of layers to store in VRAM" ,
@@ -2627,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26272647 params.n_out_freq = value;
26282648 }
26292649 ).set_examples ({LLAMA_EXAMPLE_IMATRIX}));
2650+ add_opt (common_arg (
2651+ {" --output-format" }, " {gguf,dat}" ,
2652+ string_format (" output format for imatrix file (default: %s)" , params.imat_dat ? " dat" : " gguf" ),
2653+ [](common_params & params, const std::string & value) {
2654+ /* */ if (value == " gguf" ) { params.imat_dat = false ; }
2655+ else if (value == " dat" ) { params.imat_dat = true ; }
2656+ else { throw std::invalid_argument (" invalid output format" ); }
2657+ }
2658+ ).set_examples ({LLAMA_EXAMPLE_IMATRIX}));
26302659 add_opt (common_arg (
26312660 {" --save-frequency" }, " N" ,
26322661 string_format (" save an imatrix copy every N iterations (default: %d)" , params.n_save_freq ),
@@ -3249,6 +3278,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
32493278 params.speculative .model .path = value;
32503279 }
32513280 ).set_examples ({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env (" LLAMA_ARG_MODEL_DRAFT" ));
3281+ add_opt (common_arg (
3282+ {" --spec-replace" }, " TARGET" , " DRAFT" ,
3283+ " translate the string in TARGET into DRAFT if the draft model and main model are not compatible" ,
3284+ [](common_params & params, const std::string & tgt, const std::string & dft) {
3285+ params.speculative .replacements .push_back ({ tgt, dft });
3286+ }
3287+ ).set_examples ({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
32523288 add_opt (common_arg (
32533289 {" -ctkd" , " --cache-type-k-draft" }, " TYPE" ,
32543290 string_format (
@@ -3438,34 +3474,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34383474 }
34393475 ).set_examples ({LLAMA_EXAMPLE_SERVER}));
34403476
3441- // diffusion parameters
34423477 add_opt (common_arg (
34433478 { " --diffusion-steps" }, " N" ,
34443479 string_format (" number of diffusion steps (default: %d)" , params.diffusion .steps ),
34453480 [](common_params & params, int value) { params.diffusion .steps = value; }
34463481 ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3482+ add_opt (common_arg (
3483+ { " --diffusion-visual" },
3484+ string_format (" enable visual diffusion mode (show progressive generation) (default: %s)" ,
3485+ params.diffusion .visual_mode ? " true" : " false" ),
3486+ [](common_params & params) { params.diffusion .visual_mode = true ; }
3487+ ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3488+
34473489 add_opt (common_arg (
34483490 { " --diffusion-eps" }, " F" ,
34493491 string_format (" epsilon for timesteps (default: %.6f)" , (double ) params.diffusion .eps ),
34503492 [](common_params & params, const std::string & value) { params.diffusion .eps = std::stof (value); }
34513493 ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
34523494 add_opt (common_arg (
34533495 { " --diffusion-algorithm" }, " N" ,
3454- string_format (" diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS , 2=TOPK_MARGIN , 3=ENTROPY (default: %d)" ,
3496+ string_format (" diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED , 2=MARGIN_BASED , 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)" ,
34553497 params.diffusion .algorithm ),
34563498 [](common_params & params, int value) { params.diffusion .algorithm = value; }
34573499 ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
34583500 add_opt (common_arg (
34593501 { " --diffusion-alg-temp" }, " F" ,
3460- string_format (" algorithm temperature (default: %.3f)" , (double ) params.diffusion .alg_temp ),
3502+ string_format (" dream algorithm temperature (default: %.3f)" , (double ) params.diffusion .alg_temp ),
34613503 [](common_params & params, const std::string & value) { params.diffusion .alg_temp = std::stof (value); }
34623504 ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3505+
34633506 add_opt (common_arg (
3464- { " --diffusion-visual" },
3465- string_format (" enable visual diffusion mode (show progressive generation) (default: %s)" ,
3466- params.diffusion .visual_mode ? " true" : " false" ),
3467- [](common_params & params) { params.diffusion .visual_mode = true ; }
3507+ { " --diffusion-block-length" }, " N" ,
3508+ string_format (" llada block length for generation (default: %d)" , params.diffusion .block_length ),
3509+ [](common_params & params, int value) { params.diffusion .block_length = value; }
3510+ ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3511+ add_opt (common_arg (
3512+ { " --diffusion-cfg-scale" }, " F" ,
3513+ string_format (" llada classifier-free guidance scale (default: %.3f)" , (double ) params.diffusion .cfg_scale ),
3514+ [](common_params & params, const std::string & value) { params.diffusion .cfg_scale = std::stof (value); }
34683515 ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3516+ add_opt (common_arg (
3517+ { " --diffusion-add-gumbel-noise" }, " F" ,
3518+ string_format (" add gumbel noise to the logits if temp > 0.0 (default: %s)" , params.diffusion .add_gumbel_noise ? " true" : " false" ),
3519+ [](common_params & params, const std::string & value) { params.diffusion .add_gumbel_noise = std::stof (value); }
3520+ ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3521+
34693522
34703523 return ctx_arg;
34713524}
0 commit comments