Skip to content

Commit 86f0cea

Browse files
llama: use max. GPU layers by default, auto -fa
1 parent da54f9f commit 86f0cea

File tree

14 files changed

+149
-55
lines changed

14 files changed

+149
-55
lines changed

common/arg.cpp

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15451545
}
15461546
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
15471547
add_opt(common_arg(
1548-
{"-fa", "--flash-attn"},
1549-
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
1550-
[](common_params & params) {
1551-
params.flash_attn = true;
1548+
{"-fa", "--flash-attn"}, "FA",
1549+
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
1550+
[](common_params & params, const std::string & value) {
1551+
if (value == "on" || value == "enabled") {
1552+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1553+
} else if (value == "off" || value == "disabled") {
1554+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1555+
} else if (value == "auto") {
1556+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1557+
} else {
1558+
throw std::runtime_error(string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
1559+
}
15521560
}
15531561
).set_env("LLAMA_ARG_FLASH_ATTN"));
15541562
add_opt(common_arg(
@@ -3459,8 +3467,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34593467
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
34603468
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
34613469
params.port = 8012;
3462-
params.n_gpu_layers = 99;
3463-
params.flash_attn = true;
3470+
params.n_gpu_layers = 999;
3471+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
34643472
params.n_ubatch = 1024;
34653473
params.n_batch = 1024;
34663474
params.n_ctx = 0;
@@ -3475,8 +3483,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34753483
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
34763484
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
34773485
params.port = 8012;
3478-
params.n_gpu_layers = 99;
3479-
params.flash_attn = true;
3486+
params.n_gpu_layers = 999;
3487+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
34803488
params.n_ubatch = 1024;
34813489
params.n_batch = 1024;
34823490
params.n_ctx = 0;
@@ -3491,8 +3499,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34913499
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
34923500
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
34933501
params.port = 8012;
3494-
params.n_gpu_layers = 99;
3495-
params.flash_attn = true;
3502+
params.n_gpu_layers = 999;
34963503
params.n_ubatch = 1024;
34973504
params.n_batch = 1024;
34983505
params.n_ctx = 0;
@@ -3508,10 +3515,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35083515
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
35093516
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
35103517
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3511-
params.speculative.n_gpu_layers = 99;
3518+
params.speculative.n_gpu_layers = 999;
35123519
params.port = 8012;
3513-
params.n_gpu_layers = 99;
3514-
params.flash_attn = true;
3520+
params.n_gpu_layers = 999;
3521+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
35153522
params.n_ubatch = 1024;
35163523
params.n_batch = 1024;
35173524
params.n_ctx = 0;
@@ -3527,10 +3534,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35273534
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
35283535
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
35293536
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3530-
params.speculative.n_gpu_layers = 99;
3537+
params.speculative.n_gpu_layers = 999;
35313538
params.port = 8012;
3532-
params.n_gpu_layers = 99;
3533-
params.flash_attn = true;
3539+
params.n_gpu_layers = 999;
3540+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
35343541
params.n_ubatch = 1024;
35353542
params.n_batch = 1024;
35363543
params.n_ctx = 0;
@@ -3546,7 +3553,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35463553
params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
35473554
params.port = 8012;
35483555
params.n_gpu_layers = 99;
3549-
params.flash_attn = true;
3556+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
35503557
params.n_ubatch = 1024;
35513558
params.n_batch = 1024;
35523559
params.n_ctx = 0;

common/common.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {
901901

902902
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
903903
if (model == NULL) {
904-
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
904+
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
905+
__func__, params.model.path.c_str());
905906
return iparams;
906907
}
907908

@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {
911912

912913
llama_context * lctx = llama_init_from_model(model, cparams);
913914
if (lctx == NULL) {
914-
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
915+
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
916+
__func__, params.model.path.c_str());
915917
llama_model_free(model);
916918
return iparams;
917919
}
@@ -1152,10 +1154,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
11521154
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
11531155
cparams.pooling_type = params.pooling_type;
11541156
cparams.attention_type = params.attention_type;
1157+
cparams.flash_attn_type = params.flash_attn_type;
11551158
cparams.cb_eval = params.cb_eval;
11561159
cparams.cb_eval_user_data = params.cb_eval_user_data;
11571160
cparams.offload_kqv = !params.no_kv_offload;
1158-
cparams.flash_attn = params.flash_attn;
11591161
cparams.no_perf = params.no_perf;
11601162
cparams.op_offload = !params.no_op_offload;
11611163
cparams.swa_full = params.swa_full;

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ struct common_params {
309309
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
310310
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
311311
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
312+
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
312313

313314
struct common_params_sampling sampling;
314315
struct common_params_speculative speculative;
@@ -372,7 +373,6 @@ struct common_params {
372373
bool multiline_input = false; // reverse the usage of `\`
373374
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
374375
bool cont_batching = true; // insert new sequences for decoding on-the-fly
375-
bool flash_attn = false; // flash attention
376376
bool no_perf = false; // disable performance metrics
377377
bool ctx_shift = false; // context shift on infinite text generation
378378
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

examples/diffusion/diffusion-cli.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -564,7 +564,7 @@ int main(int argc, char ** argv) {
564564
ctx_params.n_ctx = params.n_ctx;
565565
ctx_params.n_batch = params.n_batch;
566566
ctx_params.n_ubatch = params.n_ubatch;
567-
ctx_params.flash_attn = params.flash_attn;
567+
ctx_params.flash_attn_type = params.flash_attn_type;
568568
ctx_params.no_perf = params.no_perf;
569569
ctx_params.type_k = params.cache_type_k;
570570
ctx_params.type_v = params.cache_type_v;

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor *
346346
}
347347

348348
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
349+
GGML_ASSERT(backend);
349350
return backend->device;
350351
}
351352

include/llama.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,14 @@ extern "C" {
179179
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
180180
};
181181

182+
enum llama_flash_attn_type {
183+
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
184+
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
185+
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
186+
};
187+
188+
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
189+
182190
enum llama_split_mode {
183191
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
184192
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -303,6 +311,7 @@ extern "C" {
303311
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
304312
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
305313
enum llama_attention_type attention_type; // attention type to use for embeddings
314+
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
306315

307316
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
308317
float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -329,7 +338,6 @@ extern "C" {
329338
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
330339
bool embeddings; // if true, extract embeddings (together with logits)
331340
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
332-
bool flash_attn; // use flash attention [EXPERIMENTAL]
333341
bool no_perf; // measure performance timings
334342
bool op_offload; // offload host tensor operations to device
335343
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

scripts/server-bench.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,6 @@ def benchmark(
151151
if os.environ.get("LLAMA_ARG_N_PARALLEL") is None:
152152
logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32")
153153
os.environ["LLAMA_ARG_N_PARALLEL"] = "32"
154-
if not external_server and os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None:
155-
logger.info("LLAMA_ARG_N_GPU_LAYERS not explicitly set, using 999")
156-
os.environ["LLAMA_ARG_N_GPU_LAYERS"] = "999"
157-
if not external_server and os.environ.get("LLAMA_ARG_FLASH_ATTN") is None:
158-
logger.info("LLAMA_ARG_FLASH_ATTN not explicitly set, using 'true'")
159-
os.environ["LLAMA_ARG_FLASH_ATTN"] = "true"
160154

161155
parallel: int = int(os.environ.get("LLAMA_ARG_N_PARALLEL")) # type: ignore
162156
prompts: Union[None, list[str], list[list[int]]] = get_prompts_text(prompt_source, n_prompts)

src/llama-context.cpp

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ llama_context::llama_context(
4141
cparams.yarn_beta_slow = params.yarn_beta_slow;
4242
cparams.embeddings = params.embeddings;
4343
cparams.offload_kqv = params.offload_kqv;
44-
cparams.flash_attn = params.flash_attn;
4544
cparams.no_perf = params.no_perf;
4645
cparams.pooling_type = params.pooling_type;
4746
cparams.warmup = false;
@@ -86,6 +85,8 @@ llama_context::llama_context(
8685
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
8786
}
8887

88+
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
89+
8990
// with causal attention, the batch size is limited by the context size
9091
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
9192

@@ -129,7 +130,7 @@ llama_context::llama_context(
129130
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
130131
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
131132
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
132-
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
133+
LLAMA_LOG_INFO("%s: flash_attn = %s\n", __func__, llama_flash_attn_type_name(params.flash_attn_type));
133134
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
134135
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
135136
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@@ -279,7 +280,7 @@ llama_context::llama_context(
279280
}
280281
}
281282

282-
// reserve worst-case graph
283+
// resolve automatic Flash Attention use and reserve worst-case graph
283284
if (!hparams.vocab_only) {
284285
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
285286
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
@@ -310,6 +311,42 @@ llama_context::llama_context(
310311
throw std::runtime_error("failed to allocate compute pp buffers");
311312
}
312313

314+
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
315+
ggml_backend_sched_alloc_graph(sched.get(), gf);
316+
317+
bool fa_device_mismatch = false;
318+
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
319+
ggml_tensor * n = ggml_graph_node(gf, i);
320+
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
321+
continue;
322+
}
323+
ggml_backend_dev_t device_fa = ggml_backend_get_device(
324+
ggml_backend_sched_get_tensor_backend(sched.get(), n));
325+
326+
GGML_ASSERT(strncmp(n->name, "fattn-", 6) == 0);
327+
const int il = std::stoi(n->name + 6);
328+
ggml_backend_dev_t device_kv = model.dev_layer(il);
329+
if (device_fa != device_kv) {
330+
fa_device_mismatch = true;
331+
break;
332+
}
333+
}
334+
if (fa_device_mismatch) {
335+
cparams.flash_attn = false;
336+
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to disabled\n", __func__);
337+
if (ggml_is_quantized(params.type_v)) {
338+
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
339+
}
340+
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
341+
if (!gf) {
342+
throw std::runtime_error("failed to allocate compute pp buffers");
343+
}
344+
} else {
345+
cparams.flash_attn = true;
346+
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
347+
}
348+
}
349+
313350
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
314351
n_nodes_pp = ggml_graph_n_nodes(gf);
315352
}
@@ -2230,6 +2267,7 @@ llama_context_params llama_context_default_params() {
22302267
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
22312268
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
22322269
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
2270+
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
22332271
/*.rope_freq_base =*/ 0.0f,
22342272
/*.rope_freq_scale =*/ 0.0f,
22352273
/*.yarn_ext_factor =*/ -1.0f,
@@ -2246,7 +2284,6 @@ llama_context_params llama_context_default_params() {
22462284
/*.abort_callback_data =*/ nullptr,
22472285
/*.embeddings =*/ false,
22482286
/*.offload_kqv =*/ true,
2249-
/*.flash_attn =*/ false,
22502287
/*.no_perf =*/ true,
22512288
/*.op_offload =*/ true,
22522289
/*.swa_full =*/ true,
@@ -2274,12 +2311,30 @@ llama_context * llama_init_from_model(
22742311
return nullptr;
22752312
}
22762313

2277-
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
2314+
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
22782315
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
2279-
params.flash_attn = false;
2316+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
2317+
}
2318+
2319+
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
2320+
const uint32_t blck_size = ggml_blck_size(params.type_k);
2321+
if (model->hparams.n_embd_head_k % blck_size != 0) {
2322+
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
2323+
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
2324+
return nullptr;
2325+
}
2326+
}
2327+
2328+
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
2329+
const uint32_t blck_size = ggml_blck_size(params.type_v);
2330+
if (model->hparams.n_embd_head_v % blck_size != 0) {
2331+
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
2332+
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
2333+
return nullptr;
2334+
}
22802335
}
22812336

2282-
if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
2337+
if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
22832338
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
22842339
return nullptr;
22852340
}

0 commit comments

Comments
 (0)