@@ -980,6 +980,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -2094,6 +2098,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2372,6 +2383,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -3252,6 +3272,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3441,34 +3468,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
         [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 
     return ctx_arg;
 }