Allow disabling Flash Attention via the USE_FLASH_ATTENTION environment variable

alvarobartt · alvarobartt · commit 7ac2214ec91e · 2025-08-06T09:05:03.000+02:00
diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs
@@ -423,6 +423,10 @@ impl CandleBackend {
                 if dtype != DType::F16
                     || !cfg!(feature = "flash-attn")
                     || get_runtime_compute_cap().unwrap() < 80
+                    || &std::env::var("USE_FLASH_ATTENTION")
+                        .unwrap_or("True".to_string())
+                        .to_lowercase()
+                        != "true"
                 {
                     return Err(BackendError::Start("Mistral is only supported on Cuda devices in fp16 with flash attention v2 enabled".to_string()));
                 }
@@ -435,6 +439,10 @@ impl CandleBackend {
             (Config::Gte(config), Device::Cuda(_)) => {
                 if dtype != DType::F16
                     || !cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
+                    || &std::env::var("USE_FLASH_ATTENTION")
+                        .unwrap_or("True".to_string())
+                        .to_lowercase()
+                        != "true"
                 {
                     tracing::info!("Starting GTE model on {:?}", device);
                     Ok(Box::new(GTEModel::load(vb, &config, model_type).s()?))
@@ -447,6 +455,10 @@ impl CandleBackend {
             (Config::Qwen2(config), Device::Cuda(_)) => {
                 if dtype != DType::F16
                     || !cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
+                    || &std::env::var("USE_FLASH_ATTENTION")
+                        .unwrap_or("True".to_string())
+                        .to_lowercase()
+                        != "true"
                 {
                     return Err(BackendError::Start("Qwen2 is only supported on Cuda devices in fp16 with flash attention v2 enabled".to_string()));
                 }
@@ -459,6 +471,10 @@ impl CandleBackend {
             (Config::Qwen3(config), Device::Cuda(_)) => {
                 if dtype != DType::F16
                     || !cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
+                    || &std::env::var("USE_FLASH_ATTENTION")
+                        .unwrap_or("True".to_string())
+                        .to_lowercase()
+                        != "true"
                 {
                     tracing::info!("Starting Qwen3 model on {:?}", device);
                     Ok(Box::new(Qwen3Model::load(vb, &config, model_type).s()?))