
Commit 6b71242

Merge branch 'master' into wan
2 parents: d9f1d13 + 9961d24


71 files changed: +2436 / -1034 lines

README.md

Lines changed: 1 addition & 0 deletions

@@ -137,6 +137,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
 - [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
+- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 
 #### Multimodal

ci/run.sh

Lines changed: 10 additions & 10 deletions

@@ -386,10 +386,10 @@ function gg_run_open_llama_7b_v2 {
 
     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
 function check_ppl {
     qnt="$1"

@@ -520,8 +520,8 @@ function gg_run_pythia_1_4b {
 
     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
 function check_ppl {
     qnt="$1"

@@ -651,10 +651,10 @@ function gg_run_pythia_2_8b {
 
     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
 function check_ppl {
     qnt="$1"

common/arg.cpp

Lines changed: 19 additions & 25 deletions

@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
-        {"-fa", "--flash-attn"},
-        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.flash_attn = true;
+        {"-fa", "--flash-attn"}, "FA",
+        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+        [](common_params & params, const std::string & value) {
+            if (value == "on" || value == "enabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+            } else if (value == "off" || value == "disabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            } else if (value == "auto") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+            } else {
+                throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+            }
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(

@@ -2954,20 +2962,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(common_arg(
-        {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--props"},
         string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
         [](common_params & params) {
            params.endpoint_props = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--no-slots"},
         "disables slots monitoring endpoint",

@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
            params.port = 8012;
-           params.n_gpu_layers = 99;
-           params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;

@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
            params.port = 8012;
-           params.n_gpu_layers = 99;
-           params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;

@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.port = 8012;
-           params.n_gpu_layers = 99;
-           params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;

@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-           params.speculative.n_gpu_layers = 99;
            params.port = 8012;
-           params.n_gpu_layers = 99;
-           params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;

@@ -3527,10 +3526,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-           params.speculative.n_gpu_layers = 99;
            params.port = 8012;
-           params.n_gpu_layers = 99;
-           params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;

@@ -3545,8 +3541,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
            params.port = 8012;
-           params.n_gpu_layers = 99;
-           params.flash_attn = true;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
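
Note: with this change, -fa takes a value ('on', 'off', or 'auto'; also settable via the LLAMA_ARG_FLASH_ATTN environment variable) instead of acting as a boolean switch. For reference, a minimal sketch of the equivalent programmatic setting through the public context parameters, assuming the usual llama.h entry points (illustrative only, not part of this diff):

    // sketch: select the flash-attention mode when creating a context
    #include "llama.h"

    static llama_context * make_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // or ..._ENABLED / ..._DISABLED
        return llama_init_from_model(model, cparams);
    }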

common/common.cpp

Lines changed: 5 additions & 3 deletions

@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         return iparams;
     }
 
@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -1157,10 +1159,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
+    cparams.flash_attn_type = params.flash_attn_type;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
-    cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;

common/common.h

Lines changed: 2 additions & 2 deletions

@@ -312,6 +312,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
@@ -375,7 +376,6 @@ struct common_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
-    bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@@ -444,7 +444,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots = false;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
common/sampling.cpp

Lines changed: 23 additions & 2 deletions

@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {

common/sampling.h

Lines changed: 3 additions & 1 deletion

@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
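
For reference, a small usage sketch of the updated helper, mirroring the call site in common/speculative.cpp shown below (illustrative only; assumes an already-configured common_sampler and llama_context, and include paths depend on how common/ is consumed):

    #include <algorithm>
    #include "sampling.h"

    static void inspect_top_candidates(common_sampler * smpl, llama_context * ctx) {
        common_sampler_sample(smpl, ctx, 0, true);

        // request a sorted view of the candidates; the .sorted flag is then guaranteed to be set
        const auto * cur_p = common_sampler_get_candidates(smpl, /*do_sort=*/true);

        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
            const llama_token id = cur_p->data[k].id; // candidate token
            const float       p  = cur_p->data[k].p;  // its probability
            (void) id; (void) p;                      // use as needed
        }
    }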

common/speculative.cpp

Lines changed: 1 addition & 1 deletion

@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
     common_sampler_sample(smpl, ctx_dft, 0, true);
 
-    const auto * cur_p = common_sampler_get_candidates(smpl);
+    const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
     for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
         LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",

convert_hf_to_gguf.py

Lines changed: 0 additions & 4 deletions

@@ -302,10 +302,6 @@ def prepare_tensors(self):
             # data = data_torch.squeeze().numpy()
             data = data_torch.numpy()
 
-            # if data ends up empty, it means data_torch was a scalar tensor -> restore
-            if len(data.shape) == 0:
-                data = data_torch.numpy()
-
             n_dims = len(data.shape)
             data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

docs/backend/CANN.md

Lines changed: 4 additions & 0 deletions

@@ -314,3 +314,7 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
 
 Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
 
+### GGML_CANN_DISABLE_ACL_GRAPH
+
+When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
+This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
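
For illustration only, a hypothetical sketch of how such an environment toggle is commonly consulted from C++ (the actual ggml-cann code may read it differently):

    #include <cstdlib>

    // treat any non-empty value of GGML_CANN_DISABLE_ACL_GRAPH as "disable ACL graph execution"
    static bool cann_acl_graph_disabled() {
        const char * v = std::getenv("GGML_CANN_DISABLE_ACL_GRAPH");
        return v != nullptr && v[0] != '\0';
    }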
