Skip to content

Commit 2dd5641

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 603cfb4 + 4b2dae3 commit 2dd5641

File tree

4 files changed

+173
-140
lines changed

4 files changed

+173
-140
lines changed

common/arg.cpp

Lines changed: 162 additions & 134 deletions
Original file line number | Diff line number | Diff line change
@@ -3358,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
33583358
add_opt(common_arg(
33593359
{"--chat-template-kwargs"}, "STRING",
33603360
string_format("sets additional params for the json template parser"),
3361-
[](common_params & params, const std::string & value) {
3361+
[](common_params & params, const std::string & value) {
33623362
auto parsed = json::parse(value);
33633363
for (const auto & item : parsed.items()) {
33643364
params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3570,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35703570
common_log_set_file(common_log_main(), value.c_str());
35713571
}
35723572
));
3573-
add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
3574-
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
3575-
"'auto' enables colors when output is to a terminal",
3576-
[](common_params &, const std::string & value) {
3577-
if (is_truthy(value)) {
3578-
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
3579-
} else if (is_falsey(value)) {
3580-
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
3581-
} else if (is_autoy(value)) {
3582-
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
3583-
} else {
3584-
throw std::invalid_argument(
3585-
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
3586-
}
3587-
}).set_env("LLAMA_LOG_COLORS"));
3573+
add_opt(common_arg(
3574+
{"--log-colors"}, "[on|off|auto]",
3575+
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
3576+
"'auto' enables colors when output is to a terminal",
3577+
[](common_params &, const std::string & value) {
3578+
if (is_truthy(value)) {
3579+
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
3580+
} else if (is_falsey(value)) {
3581+
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
3582+
} else if (is_autoy(value)) {
3583+
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
3584+
} else {
3585+
throw std::invalid_argument(
3586+
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
3587+
}
3588+
}
3589+
).set_env("LLAMA_LOG_COLORS"));
35883590
add_opt(common_arg(
35893591
{"-v", "--verbose", "--log-verbose"},
35903592
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3850,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
38503852
}
38513853
).set_examples({LLAMA_EXAMPLE_TTS}));
38523854

3853-
// model-specific
3855+
add_opt(common_arg(
3856+
{"--diffusion-steps"}, "N",
3857+
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3858+
[](common_params & params, int value) { params.diffusion.steps = value; }
3859+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3860+
add_opt(common_arg(
3861+
{"--diffusion-visual"},
3862+
string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
3863+
[](common_params & params) { params.diffusion.visual_mode = true; }
3864+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3865+
add_opt(common_arg(
3866+
{"--diffusion-eps"}, "F",
3867+
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
3868+
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
3869+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3870+
add_opt(common_arg(
3871+
{"--diffusion-algorithm"}, "N",
3872+
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
3873+
[](common_params & params, int value) { params.diffusion.algorithm = value; }
3874+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3875+
add_opt(common_arg(
3876+
{"--diffusion-alg-temp"}, "F",
3877+
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3878+
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
3879+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3880+
add_opt(common_arg(
3881+
{"--diffusion-block-length"}, "N",
3882+
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
3883+
[](common_params & params, int value) { params.diffusion.block_length = value; }
3884+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3885+
add_opt(common_arg(
3886+
{"--diffusion-cfg-scale"}, "F",
3887+
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
3888+
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
3889+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3890+
add_opt(common_arg(
3891+
{"--diffusion-add-gumbel-noise"}, "F",
3892+
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
3893+
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
3894+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3895+
add_opt(common_arg(
3896+
{ "-lr", "--learning-rate" }, "ALPHA",
3897+
string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
3898+
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
3899+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3900+
add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
3901+
string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
3902+
(double) params.lr.lr_min),
3903+
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
3904+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3905+
add_opt(common_arg(
3906+
{"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
3907+
string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
3908+
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
3909+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3910+
add_opt(common_arg(
3911+
{"-wd", "--weight-decay"}, "WD",
3912+
string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
3913+
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
3914+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3915+
add_opt(common_arg(
3916+
{"-val-split", "--val-split"}, "FRACTION",
3917+
string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
3918+
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
3919+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3920+
add_opt(common_arg(
3921+
{"-epochs", "--epochs"}, "N",
3922+
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
3923+
[](common_params & params, int epochs) { params.lr.epochs = epochs; }
3924+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3925+
add_opt(common_arg(
3926+
{"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
3927+
[](common_params & params, const std::string & name) {
3928+
params.optimizer = common_opt_get_optimizer(name.c_str());
3929+
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
3930+
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
3931+
}
3932+
}
3933+
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3934+
3935+
// presets
38543936
add_opt(common_arg(
38553937
{"--tts-oute-default"},
38563938
string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3863,39 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
38633945
).set_examples({LLAMA_EXAMPLE_TTS}));
38643946

38653947
add_opt(common_arg(
3866-
{"--embd-bge-small-en-default"},
3867-
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
3868-
[](common_params & params) {
3869-
params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
3870-
params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
3871-
params.embd_normalize = 2;
3872-
params.n_ctx = 512;
3873-
params.verbose_prompt = true;
3874-
params.embedding = true;
3875-
}
3876-
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
3877-
3878-
add_opt(common_arg(
3879-
{"--embd-e5-small-en-default"},
3880-
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
3881-
[](common_params & params) {
3882-
params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
3883-
params.model.hf_file = "e5-small-v2-q8_0.gguf";
3884-
params.embd_normalize = 2;
3885-
params.n_ctx = 512;
3886-
params.verbose_prompt = true;
3887-
params.embedding = true;
3888-
}
3889-
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
3890-
3891-
add_opt(common_arg(
3892-
{"--embd-gte-small-default"},
3893-
string_format("use default gte-small model (note: can download weights from the internet)"),
3948+
{"--embd-gemma-default"},
3949+
string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
38943950
[](common_params & params) {
3895-
params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
3896-
params.model.hf_file = "gte-small-q8_0.gguf";
3897-
params.embd_normalize = 2;
3898-
params.n_ctx = 512;
3951+
params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
3952+
params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
3953+
params.port = 8011;
3954+
params.n_ubatch = 2048;
3955+
params.n_batch = 2048;
3956+
params.n_parallel = 32;
3957+
params.n_ctx = 2048*params.n_parallel;
38993958
params.verbose_prompt = true;
39003959
params.embedding = true;
39013960
}
@@ -3990,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
39904049
).set_examples({LLAMA_EXAMPLE_SERVER}));
39914050

39924051
add_opt(common_arg(
3993-
{ "--diffusion-steps" }, "N",
3994-
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3995-
[](common_params & params, int value) { params.diffusion.steps = value; }
3996-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3997-
add_opt(common_arg(
3998-
{ "--diffusion-visual" },
3999-
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
4000-
params.diffusion.visual_mode ? "true" : "false"),
4001-
[](common_params & params) { params.diffusion.visual_mode = true; }
4002-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4052+
{"--gpt-oss-20b-default"},
4053+
string_format("use gpt-oss-20b (note: can download weights from the internet)"),
4054+
[](common_params & params) {
4055+
params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
4056+
params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
4057+
params.port = 8013;
4058+
params.n_ubatch = 2048;
4059+
params.n_batch = 32768;
4060+
params.n_parallel = 2;
4061+
params.n_ctx = 131072*params.n_parallel;
4062+
params.sampling.temp = 1.0f;
4063+
params.sampling.top_p = 1.0f;
4064+
params.sampling.top_k = 0;
4065+
params.sampling.min_p = 0.01f;
4066+
params.use_jinja = true;
4067+
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
4068+
}
4069+
).set_examples({LLAMA_EXAMPLE_SERVER}));
40034070

40044071
add_opt(common_arg(
4005-
{ "--diffusion-eps" }, "F",
4006-
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
4007-
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
4008-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4009-
add_opt(common_arg(
4010-
{ "--diffusion-algorithm" }, "N",
4011-
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
4012-
params.diffusion.algorithm),
4013-
[](common_params & params, int value) { params.diffusion.algorithm = value; }
4014-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4015-
add_opt(common_arg(
4016-
{ "--diffusion-alg-temp" }, "F",
4017-
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
4018-
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
4019-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4072+
{"--gpt-oss-120b-default"},
4073+
string_format("use gpt-oss-120b (note: can download weights from the internet)"),
4074+
[](common_params & params) {
4075+
params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
4076+
params.port = 8013;
4077+
params.n_ubatch = 2048;
4078+
params.n_batch = 32768;
4079+
params.n_parallel = 2;
4080+
params.n_ctx = 131072*params.n_parallel;
4081+
params.sampling.temp = 1.0f;
4082+
params.sampling.top_p = 1.0f;
4083+
params.sampling.top_k = 0;
4084+
params.sampling.min_p = 0.01f;
4085+
params.use_jinja = true;
4086+
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
4087+
}
4088+
).set_examples({LLAMA_EXAMPLE_SERVER}));
40204089

40214090
add_opt(common_arg(
4022-
{ "--diffusion-block-length" }, "N",
4023-
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
4024-
[](common_params & params, int value) { params.diffusion.block_length = value; }
4025-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4026-
add_opt(common_arg(
4027-
{ "--diffusion-cfg-scale" }, "F",
4028-
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
4029-
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
4030-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4031-
add_opt(common_arg(
4032-
{ "--diffusion-add-gumbel-noise" }, "F",
4033-
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
4034-
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
4035-
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
4036-
4091+
{"--vision-gemma-4b-default"},
4092+
string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
4093+
[](common_params & params) {
4094+
params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
4095+
params.port = 8014;
4096+
params.n_ctx = 0;
4097+
params.use_jinja = true;
4098+
}
4099+
).set_examples({LLAMA_EXAMPLE_SERVER}));
40374100

4038-
add_opt(
4039-
common_arg({ "-lr", "--learning-rate" }, "ALPHA",
4040-
string_format(
4041-
"adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
4042-
(double) params.lr.lr0),
4043-
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
4044-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4045-
add_opt(
4046-
common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
4047-
string_format(
4048-
"(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
4049-
(double) params.lr.lr_min),
4050-
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
4051-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4052-
add_opt(
4053-
common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
4054-
string_format(
4055-
"(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
4056-
(double) params.lr.decay_epochs),
4057-
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
4058-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4059-
add_opt(common_arg(
4060-
{ "-wd", "--weight-decay" }, "WD",
4061-
string_format(
4062-
"adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
4063-
(double) params.lr.wd),
4064-
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
4065-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4066-
add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
4067-
string_format("fraction of data to use as validation set for training (default: %.2g).",
4068-
(double) params.val_split),
4069-
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
4070-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4071-
add_opt(common_arg({ "-epochs", "--epochs" }, "N",
4072-
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
4073-
[](common_params & params, int epochs) { params.lr.epochs = epochs; })
4074-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4075-
add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
4076-
[](common_params & params, const std::string & name) {
4077-
params.optimizer = common_opt_get_optimizer(name.c_str());
4078-
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
4079-
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
4080-
}
4081-
})
4082-
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
4101+
add_opt(common_arg(
4102+
{"--vision-gemma-12b-default"},
4103+
string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
4104+
[](common_params & params) {
4105+
params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
4106+
params.port = 8014;
4107+
params.n_ctx = 0;
4108+
params.use_jinja = true;
4109+
}
4110+
).set_examples({LLAMA_EXAMPLE_SERVER}));
40834111

40844112
return ctx_arg;
40854113
}

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -426,7 +426,7 @@ struct common_params {
426426
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
427427
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
428428
int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
429-
int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
429+
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
430430

431431
std::string hostname = "127.0.0.1";
432432
std::string public_path = ""; // NOLINT

0 commit comments

Comments
 (0)