
Commit 228b915

Revert "llama: use FA + max. GPU layers by default (ggml-org#15434)"
1 parent f97a844

15 files changed: +53 additions, -222 deletions

common/arg.cpp

Lines changed: 18 additions & 12 deletions
@@ -1547,18 +1547,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
-        {"-fa", "--flash-attn"}, "FA",
-        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
-        [](common_params & params, const std::string & value) {
-            if (value == "on" || value == "enabled") {
-                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
-            } else if (value == "off" || value == "disabled") {
-                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
-            } else if (value == "auto") {
-                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
-            } else {
-                throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
-            }
+        {"-fa", "--flash-attn"},
+        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.flash_attn = true;
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
@@ -3469,6 +3461,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3483,6 +3477,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3497,6 +3493,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3512,7 +3510,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3528,7 +3529,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3543,6 +3547,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
             params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
             params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
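
After this revert, -fa / --flash-attn is again a plain boolean switch that takes no argument, Flash Attention defaults to off, and the FIM server presets above re-apply full GPU offload and FA explicitly instead of relying on library-wide defaults. Below is a minimal sketch, not part of the commit, of how code built on top of common.h would read the restored flag; the helper function and its output are illustrative only.

#include <cstdio>
#include "common.h"

// Illustrative only: after the revert, callers branch on a plain bool instead of
// the removed three-state llama_flash_attn_type enum.
static void print_fa_state(const common_params & params) {
    // params.flash_attn defaults to false (see the common/common.h hunk below)
    printf("flash attention: %s\n", params.flash_attn ? "enabled" : "disabled");
    printf("gpu layers:      %d\n", params.n_gpu_layers);
}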

common/common.cpp

Lines changed: 3 additions & 5 deletions
@@ -909,8 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {

     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return iparams;
     }

@@ -920,8 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) {

     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -1167,10 +1165,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
-    cparams.flash_attn_type   = params.flash_attn_type;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
+    cparams.flash_attn        = params.flash_attn;
     cparams.no_perf           = params.no_perf;
     cparams.op_offload        = !params.no_op_offload;
     cparams.swa_full          = params.swa_full;
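
The hunks above restore both the shorter error messages (without the --n-gpu-layers hint) and the direct flash_attn mapping in common_context_params_to_llama. The sketch below, which is not part of the commit, shows the call flow these functions sit in, assuming the reverted field names; the model path is a placeholder, and backend init/free and error logging are omitted for brevity.

#include "common.h"
#include "llama.h"

// Sketch only: load a model and create a context the same way
// common_init_from_params does after this revert.
int load_example() {
    common_params params;
    params.model.path   = "model.gguf"; // placeholder path
    params.n_gpu_layers = 99;           // full offload, opt-in again after the revert
    params.flash_attn   = true;         // plain bool after the revert

    llama_model_params   mparams = common_model_params_to_llama(params);
    llama_context_params cparams = common_context_params_to_llama(params);
    // cparams.flash_attn is copied from params.flash_attn (see the hunk above)

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
        return 1;
    }
    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        llama_model_free(model);
        return 1;
    }
    llama_free(lctx);
    llama_model_free(model);
    return 0;
}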

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,6 @@ struct common_params {
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
-   enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;
@@ -372,6 +371,7 @@ struct common_params {
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io       = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching   = true;  // insert new sequences for decoding on-the-fly
+   bool flash_attn      = false; // flash attention
    bool no_perf         = false; // disable performance metrics
    bool ctx_shift       = false; // context shift on infinite text generation
    bool swa_full        = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

examples/diffusion/diffusion-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -564,7 +564,7 @@ int main(int argc, char ** argv) {
    ctx_params.n_ctx      = params.n_ctx;
    ctx_params.n_batch    = params.n_batch;
    ctx_params.n_ubatch   = params.n_ubatch;
-   ctx_params.flash_attn_type = params.flash_attn_type;
+   ctx_params.flash_attn = params.flash_attn;
    ctx_params.no_perf    = params.no_perf;
    ctx_params.type_k     = params.cache_type_k;
    ctx_params.type_v     = params.cache_type_v;
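
diffusion-cli bypasses common_context_params_to_llama and fills llama_context_params directly, so it only needed this one-line field rename. The sketch below, not part of the commit, shows that pattern under the reverted API; the numeric values are illustrative.

#include "llama.h"

// Sketch only: populate llama_context_params by hand, as diffusion-cli.cpp does,
// using the bool flash_attn field restored by this revert.
llama_context * make_ctx(llama_model * model) {
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx      = 4096;  // illustrative values
    ctx_params.n_batch    = 2048;
    ctx_params.n_ubatch   = 512;
    ctx_params.flash_attn = true;  // bool again after the revert
    return llama_init_from_model(model, ctx_params);
}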
