Commit 185cc3f

Merge pull request #107 from ggml-org/master
Sync with upstream

2 parents abd7a3c + 7845240, commit 185cc3f

39 files changed: +1845 -628 lines

common/arg.cpp

Lines changed: 42 additions & 7 deletions
@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }

     if (!params.kv_overrides.empty()) {
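
Note on the hunk above: each `--spec-replace` TARGET/DRAFT pair now goes through `string_process_escapes`, the same pass already applied to the DRY sequence breakers, so C-style escapes typed on the command line become real characters before the pair is used. Below is a minimal sketch of what such an in-place escape pass does; it is illustrative only and not a reproduction of the actual `string_process_escapes` helper in common/common.cpp.

```cpp
// Illustrative stand-in for an in-place escape pass over --spec-replace strings.
// Not the real string_process_escapes from common/common.cpp.
#include <string>
#include <utility>
#include <vector>

static void process_escapes_sketch(std::string & s) {
    std::string out;
    out.reserve(s.size());
    for (size_t i = 0; i < s.size(); ++i) {
        if (s[i] == '\\' && i + 1 < s.size()) {
            switch (s[++i]) {
                case 'n':  out += '\n'; break;  // "\n" typed on the CLI -> newline
                case 't':  out += '\t'; break;
                case '\\': out += '\\'; break;
                default:   out += '\\'; out += s[i]; break; // leave unknown escapes untouched
            }
        } else {
            out += s[i];
        }
    }
    s = std::move(out);
}

int main() {
    // mirrors the shape of params.speculative.replacements
    std::vector<std::pair<std::string, std::string>> replacements = {
        { "<target-marker>\\n", "<draft-marker>\\n" }, // marker strings are placeholders
    };
    for (auto & pair : replacements) {
        process_escapes_sketch(pair.first);
        process_escapes_sketch(pair.second);
    }
}
```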
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -3249,6 +3260,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
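
The new `--spec-replace TARGET DRAFT` option takes two values and appends the pair to `params.speculative.replacements`; it is meant for cases where the draft model's prompt or template strings differ from the main model's. How the pairs are consumed lives in the speculative-decoding code elsewhere in this sync; the helper below is only a hypothetical sketch of applying such replacements to a piece of text, not code from this commit.

```cpp
// Hypothetical sketch: apply main->draft string replacements to a prompt
// before handing it to the draft model. Not part of this commit.
#include <string>
#include <utility>
#include <vector>

static std::string apply_replacements_sketch(
        std::string text,
        const std::vector<std::pair<std::string, std::string>> & replacements) {
    for (const auto & [tgt, dft] : replacements) {
        if (tgt.empty()) {
            continue; // nothing to match
        }
        size_t pos = 0;
        while ((pos = text.find(tgt, pos)) != std::string::npos) {
            text.replace(pos, tgt.size(), dft);
            pos += dft.size(); // skip past the inserted text to avoid re-matching it
        }
    }
    return text;
}
```

An invocation such as `--spec-replace "<|im_start|>" "<start_of_turn>"` (token strings purely illustrative) would register one TARGET -> DRAFT pair.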
@@ -3438,34 +3456,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
         [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+

     return ctx_arg;
 }
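
The `--diffusion-algorithm` help string now lists five schedules (0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE), and common.h below moves the default to 4. The enum and check below are an illustrative sketch of mapping the raw integer to a named constant; the names mirror the help string and are not the enum used by the diffusion example itself.

```cpp
// Illustrative only: map the integer passed via --diffusion-algorithm to a
// named constant and reject out-of-range values. The names mirror the help
// string in the hunk above; the real example code defines its own types.
#include <cstdio>
#include <cstdlib>

enum diffusion_alg_sketch {
    DIFF_ALG_ORIGIN         = 0,
    DIFF_ALG_ENTROPY_BASED  = 1,
    DIFF_ALG_MARGIN_BASED   = 2,
    DIFF_ALG_RANDOM         = 3,
    DIFF_ALG_LOW_CONFIDENCE = 4, // new default in this sync
};

static diffusion_alg_sketch parse_diffusion_alg_sketch(int value) {
    if (value < DIFF_ALG_ORIGIN || value > DIFF_ALG_LOW_CONFIDENCE) {
        std::fprintf(stderr, "invalid --diffusion-algorithm value: %d\n", value);
        std::exit(1);
    }
    return static_cast<diffusion_alg_sketch>(value);
}
```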

common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
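
This single assignment is how the new `-nr`/`--no-repack` flag (or `LLAMA_ARG_NO_REPACK=1`) reaches the model loader: `no_extra_bufts` is inverted into `use_extra_bufts`, which disables the extra buffer types used for weight repacking. A minimal sketch of that flow, assuming the usual common/llama.cpp loading path and a placeholder model path:

```cpp
// Minimal sketch, assuming the usual llama.cpp loading path:
// --no-repack sets no_extra_bufts, which this commit inverts into use_extra_bufts.
#include "common.h"
#include "llama.h"

llama_model * load_without_repack_sketch(common_params & params) {
    params.no_extra_bufts = true; // what -nr / --no-repack does in the arg parser

    llama_model_params mparams = common_model_params_to_llama(params);
    // mparams.use_extra_bufts is now false, so the loader skips repacked buffer types.

    // "model.gguf" is a placeholder path.
    return llama_model_load_from_file("model.gguf", mparams);
}
```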

common/common.h

Lines changed: 13 additions & 5 deletions
@@ -201,6 +201,7 @@ struct common_params_speculative {
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements

     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,11 +221,17 @@ struct common_params_vocoder {
 };

 struct common_params_diffusion {
-    int32_t steps = 64; // number of diffusion steps
-    float eps = 1e-3f; // epsilon for timesteps
-    int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
-    float alg_temp = 0.0f; // algorithm temperature
-    bool visual_mode = false; // show progressive diffusion on screen
+    int32_t steps = 128;
+    bool visual_mode = false;
+
+    float eps = 0; // epsilon for timesteps
+    int32_t block_length = 0; // block length for generation
+
+    int32_t algorithm = 4; // default algorithm: low-confidence
+    float alg_temp = 0.0f; // algorithm temperature
+
+    float cfg_scale = 0; // classifier-free guidance scale
+    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };

 enum common_reasoning_format {
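
`add_gumbel_noise` gates adding Gumbel noise to the logits when the sampling temperature is positive: adding g = -log(-log(u)) to temperature-scaled logits and taking the argmax is the standard Gumbel-max trick for sampling from the softmax distribution. The sketch below illustrates that trick only and is not the code added in this commit.

```cpp
// Sketch of the Gumbel-max trick the add_gumbel_noise flag refers to:
// argmax(logits / temp + Gumbel(0,1)) is equivalent to sampling from
// softmax(logits / temp). Illustrative only.
#include <cmath>
#include <random>
#include <vector>

static int gumbel_argmax_sketch(const std::vector<float> & logits, float temp, std::mt19937 & rng) {
    std::uniform_real_distribution<float> unif(1e-20f, 1.0f); // avoid log(0)
    int   best     = 0;
    float best_val = -INFINITY;
    for (size_t i = 0; i < logits.size(); ++i) {
        const float g = -std::log(-std::log(unif(rng))); // Gumbel(0,1) noise
        const float v = logits[i] / temp + g;
        if (v > best_val) {
            best_val = v;
            best     = (int) i;
        }
    }
    return best;
}
```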
@@ -355,6 +362,7 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

     bool single_turn = false; // single turn chat conversation
