Skip to content

Commit 7fad4dc

Browse files
committed
fixed ordering of gpu overhead detection
1 parent 7c82cad commit 7fad4dc

File tree

1 file changed

+23
-23
lines changed

1 file changed

+23
-23
lines changed

koboldcpp.py

Lines changed: 23 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,7 @@
106106
args = None #global args
107107
runmode_untouched = True
108108
modelfile_extracted_meta = None
109+
calulated_gpu_overhead = 0 # may be populated at runtime, can also be missing if undetected
109110
importvars_in_progress = False
110111
has_multiplayer = False
111112
has_audio_support = False
@@ -1159,26 +1160,8 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
11591160
except Exception:
11601161
modelfile_extracted_meta = None
11611162

1162-
def calculate_secondary_model_overheads(sdquant):
1163-
cost = 0
1164-
if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
1165-
cost += 1024*1024*1024*(9 - sdquant * 1.5) # 9, 7.5, 6
1166-
elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
1167-
cost += 1024*1024*1024*(4.25 - sdquant * 0.5) # 4.25, 3.75, 3.25
1168-
if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
1169-
cost += max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
1170-
if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
1171-
cost += max(350*1024*1024,modelfile_extracted_meta[5]*1.5)
1172-
if modelfile_extracted_meta[6] > 1024*1024*10: #draft model tax
1173-
cost += (modelfile_extracted_meta[6] * 1.5)
1174-
if modelfile_extracted_meta[7] > 1024*1024*10: #tts model tax
1175-
cost += max(600*1024*1024, modelfile_extracted_meta[7] * 3)
1176-
if modelfile_extracted_meta[8] > 1024*1024*10: #embeddings model tax
1177-
cost += max(350*1024*1024, modelfile_extracted_meta[8] * 1.5)
1178-
return cost
1179-
11801163
def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to determine how many layers to use
1181-
global showusedmemwarning, showmultigpuwarning, modelfile_extracted_meta # reference cached values instead
1164+
global showusedmemwarning, showmultigpuwarning, modelfile_extracted_meta, calulated_gpu_overhead # reference cached values instead
11821165
gpumem = MaxMemory[0]
11831166
usedmem = 0
11841167
if MaxFreeMemory[0]>0:
@@ -1206,8 +1189,23 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
12061189
print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.")
12071190
fsize *= total_parts
12081191

1209-
extracost = calculate_secondary_model_overheads(sdquanted)
1210-
mem -= extracost
1192+
calulated_gpu_overhead = 0
1193+
if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
1194+
calulated_gpu_overhead += 1024*1024*1024*(9 - sdquanted * 1.5) # 9, 7.5, 6
1195+
elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
1196+
calulated_gpu_overhead += 1024*1024*1024*(4.25 - sdquanted * 0.5) # 4.25, 3.75, 3.25
1197+
if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
1198+
calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
1199+
if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
1200+
calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[5]*1.5)
1201+
if modelfile_extracted_meta[6] > 1024*1024*10: #draft model tax
1202+
calulated_gpu_overhead += (modelfile_extracted_meta[6] * 1.5)
1203+
if modelfile_extracted_meta[7] > 1024*1024*10: #tts model tax
1204+
calulated_gpu_overhead += max(600*1024*1024, modelfile_extracted_meta[7] * 3)
1205+
if modelfile_extracted_meta[8] > 1024*1024*10: #embeddings model tax
1206+
calulated_gpu_overhead += max(350*1024*1024, modelfile_extracted_meta[8] * 1.5)
1207+
1208+
mem -= calulated_gpu_overhead
12111209
mem = 0 if mem < 0 else mem
12121210

12131211
csmul = (cs/4096) if cs >= 8192 else 1.8 if cs > 4096 else 1.2 if cs > 2048 else 1.0
@@ -1454,7 +1452,7 @@ def auto_set_backend_cli():
14541452
print(f"Auto Selected Default Backend (flag={cpusupport})\n")
14551453

14561454
def load_model(model_filename):
1457-
global args
1455+
global args, calulated_gpu_overhead
14581456
inputs = load_model_inputs()
14591457
inputs.model_filename = model_filename.encode("UTF-8")
14601458
inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
@@ -1497,7 +1495,7 @@ def load_model(model_filename):
14971495
inputs.quant_k = inputs.quant_v = 0
14981496
inputs.batchsize = args.batchsize
14991497
inputs.autofit = args.autofit
1500-
inputs.autofit_tax_mb = int(calculate_secondary_model_overheads(args.sdquant)/(1024*1024))
1498+
inputs.autofit_tax_mb = calulated_gpu_overhead
15011499
inputs.gpulayers = args.gpulayers
15021500
if args.overridenativecontext and args.overridenativecontext>0:
15031501
inputs.overridenativecontext = args.overridenativecontext
@@ -7830,6 +7828,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
78307828
ssl._create_default_https_context = ssl._create_unverified_context
78317829

78327830
if args.gpulayers:
7831+
if args.autofit:
7832+
args.gpulayers = -1
78337833
shouldavoidgpu = False
78347834
if args.usecpu and sys.platform!="darwin":
78357835
shouldavoidgpu = True

0 commit comments

Comments
 (0)