enable flash attention for image generation (LostRuins#1633)

wbruna · web-flow · commit d74c16e6e0b3 · 2025-07-05T11:20:51.000+08:00
diff --git a/expose.h b/expose.h
@@ -161,6 +161,7 @@ struct sd_load_model_inputs
     const char * vulkan_info = nullptr;
     const int threads = 0;
     const int quant = 0;
+    const bool flash_attention = false;
     const bool taesd = false;
     const int tiled_vae_threshold = 0;
     const char * t5xxl_filename = nullptr;
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -273,6 +273,7 @@ class sd_load_model_inputs(ctypes.Structure):
                 ("vulkan_info", ctypes.c_char_p),
                 ("threads", ctypes.c_int),
                 ("quant", ctypes.c_int),
+                ("flash_attention", ctypes.c_bool),
                 ("taesd", ctypes.c_bool),
                 ("tiled_vae_threshold", ctypes.c_int),
                 ("t5xxl_filename", ctypes.c_char_p),
@@ -1624,6 +1625,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
 
     inputs.threads = thds
     inputs.quant = quant
+    inputs.flash_attention = args.flashattention
     inputs.taesd = True if args.sdvaeauto else False
     inputs.tiled_vae_threshold = args.sdtiledvae
     inputs.vae_filename = vae_filename.encode("UTF-8")
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -179,6 +179,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
         printf("With PhotoMaker Model: %s\n",photomaker_filename.c_str());
         photomaker_enabled = true;
     }
+    if(inputs.flash_attention)
+    {
+        printf("Flash Attention is enabled\n");
+    }
     if(inputs.quant)
     {
         printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -213,6 +217,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     sd_params->model_path = inputs.model_filename;
     sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
     sd_params->n_threads = inputs.threads; //if -1 use physical cores
+    sd_params->diffusion_flash_attn = inputs.flash_attention;
     sd_params->input_path = ""; //unused
     sd_params->batch_count = 1;
     sd_params->vae_path = vaefilename;