@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -3249,6 +3269,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3438,34 +3465,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
         [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 
     return ctx_arg;
 }
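
Not part of the diff above: a minimal standalone sketch of how the three regex patterns registered by --cpu-moe behave when tested against MoE tensor names. The sample tensor names and the use of std::regex_search here are illustrative assumptions, not the actual override-matching code in llama.cpp.

// cpu_moe_patterns_demo.cpp -- standalone illustration, not llama.cpp source.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Same pattern strings as the --cpu-moe handler; "\\." in the C++ literal
    // is an escaped '.' in the regex, and '$' anchors the match at the end.
    const std::vector<std::regex> cpu_moe_patterns = {
        std::regex("\\.ffn_up_exps\\.weight$"),
        std::regex("\\.ffn_down_exps\\.weight$"),
        std::regex("\\.ffn_gate_exps\\.weight$"),
    };

    // Example tensor names (hypothetical, chosen for illustration only).
    const std::vector<std::string> tensor_names = {
        "blk.0.ffn_up_exps.weight",
        "blk.0.ffn_gate_exps.weight",
        "blk.0.attn_q.weight",
    };

    for (const auto & name : tensor_names) {
        bool on_cpu = false;
        for (const auto & re : cpu_moe_patterns) {
            if (std::regex_search(name, re)) { on_cpu = true; break; }
        }
        std::cout << name << " -> " << (on_cpu ? "CPU buffer type" : "default placement") << "\n";
    }
    return 0;
}

Compiled as-is, the sketch reports that the two expert FFN tensors match while the attention tensor does not, which is the intended effect of keeping only MoE expert weights on the CPU buffer type.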