Commit 5908f2c (parent: 7a7bdea)

Based on advice from Occam and Henky, disabled flash attention entirely on Vulkan.

1 file changed: koboldcpp.py (16 additions, 11 deletions)
```diff
@@ -4134,8 +4134,10 @@ def changerunmode(a,b,c):
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
             quick_use_flashattn.grid_remove()
+            use_flashattn.grid_remove()
         else:
             quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
+            use_flashattn.grid(row=28, column=0, padx=8, pady=1, stick="nw")

         if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
```
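The hunk above hides the FlashAttention checkbox whenever a Vulkan backend is selected. Below is a minimal sketch of that show/hide pattern in plain tkinter (koboldcpp itself uses customtkinter; the names `backend_var` and `flashattn_check` are illustrative, not taken from the codebase):

```python
import tkinter as tk

root = tk.Tk()
backend_var = tk.StringVar(value="Use CuBLAS")  # hypothetical stand-in for runopts_var

flashattn_check = tk.Checkbutton(root, text="Use FlashAttention")
flashattn_check.grid(row=28, column=0, sticky="nw")

def on_backend_change(*_):
    # grid_remove() hides the widget but remembers its grid options,
    # so a later plain grid() call restores it in the same position.
    if backend_var.get().startswith("Use Vulkan"):
        flashattn_check.grid_remove()
    else:
        flashattn_check.grid()

backend_var.trace("w", on_backend_change)  # re-run whenever the selection changes
on_backend_change()                        # apply once for the initial selection

root.mainloop()
```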
```diff
@@ -4256,11 +4258,6 @@ def changerunmode(a,b,c):
     ctk.CTkButton(hardware_tab , text = "Run Benchmark", command = guibench ).grid(row=110,column=0, stick="se", padx= 0, pady=2)


-    runopts_var.trace('w', changerunmode)
-    changerunmode(1,1,1)
-    global runmode_untouched
-    runmode_untouched = True
-
     # Tokens Tab
     tokens_tab = tabcontent["Tokens"]
     # tokens checkboxes
```
```diff
@@ -4283,18 +4280,14 @@ def togglerope(a,b,c):
         else:
             item.grid_remove()
     makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
-    makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
+    use_flashattn = makecheckbox(tokens_tab, "Use FlashAttention", flashattention, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"QuantKV works best with flash attention enabled",33,0,"WARNING: NOT RECOMMENDED.\nOnly K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.")
     noqkvlabel.configure(text_color="#ff5555")
     qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
     quantkv_var.trace("w", toggleflashattn)
     makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
     makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=100, singleline=True, tooltip="Override number of MoE experts.")

-    togglerope(1,1,1)
-    toggleflashattn(1,1,1)
-    togglectxshift(1,1,1)
-
     # Model Tab
     model_tab = tabcontent["Loaded Files"]

```
```diff
@@ -4370,7 +4363,6 @@ def togglehorde(a,b,c):
             horde_name_var.set(sanitize_string(os.path.splitext(basefile)[0]))

     makecheckbox(horde_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
-    togglehorde(1,1,1)

     # Image Gen Tab

```
```diff
@@ -4471,6 +4463,16 @@ def togglezenity(a,b,c):
     makecheckbox(extra_tab, "Use Classic FilePicker", nozenity_var, 20, tooltiptxt="Use the classic TKinter file picker instead.")
     nozenity_var.trace("w", togglezenity)

+    # refresh
+    runopts_var.trace('w', changerunmode)
+    changerunmode(1,1,1)
+    global runmode_untouched
+    runmode_untouched = True
+    togglerope(1,1,1)
+    toggleflashattn(1,1,1)
+    togglectxshift(1,1,1)
+    togglehorde(1,1,1)
+
     # launch
     def guilaunch():
         if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and tts_model_var.get() == "" and embeddings_model_var.get() == "" and nomodel.get()!=1:
```
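The relocated `# refresh` block gathers every trace registration and initial refresh call into one place, after all tabs and widgets have been constructed; before this commit, `changerunmode` could fire before `use_flashattn` existed. A schematic sketch, with hypothetical names rather than koboldcpp's, of why that ordering matters:

```python
# Schematic illustration (hypothetical names, not koboldcpp code) of why the
# refresh block must run only after every widget it touches is constructed.
import tkinter as tk

root = tk.Tk()
mode_var = tk.StringVar(value="Use Vulkan")

def refresh(*_):
    # References fa_check; raises NameError if invoked before it is created.
    if mode_var.get().startswith("Use Vulkan"):
        fa_check.grid_remove()
    else:
        fa_check.grid()

# Wrong order: calling refresh() here would raise NameError.

fa_check = tk.Checkbutton(root, text="Use FlashAttention")
fa_check.grid(row=0, column=0, sticky="nw")

# Right order: wire the trace and force one refresh after construction.
mode_var.trace("w", refresh)
refresh()
```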
```diff
@@ -6008,6 +6010,9 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):

     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads
+    if args.flashattention and (args.usevulkan is not None):
+        print("FlashAttention should not be used with Vulkan as it is not fully implemented. Disabling flash attention.")
+        args.flashattention = False

     modelname = os.path.abspath(args.model_param)
     print(args)
```
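This launch-time guard covers the non-GUI path as well: even if `--flashattention` is passed on the command line together with `--usevulkan`, flash attention is switched off before the model loads. A self-contained sketch of the same check follows; the parser definition is an assumption for illustration (the `nargs="*"` handling mirrors how `--usevulkan` accepts optional device IDs), not koboldcpp's actual argument setup:

```python
# Standalone sketch of the launch-time guard; the parser here is an
# illustrative assumption, not koboldcpp's real argument definitions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--flashattention", action="store_true")
# --usevulkan may be given with or without device IDs; either way the
# parsed value becomes a (possibly empty) list instead of None.
parser.add_argument("--usevulkan", nargs="*", type=int, default=None)

args = parser.parse_args(["--flashattention", "--usevulkan"])

# --usevulkan was passed (even with no device IDs), so the value is non-None
# and flash attention gets forced off before the model loads.
if args.flashattention and (args.usevulkan is not None):
    print("FlashAttention should not be used with Vulkan as it is not fully implemented. Disabling flash attention.")
    args.flashattention = False

assert args.flashattention is False
```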
