|
106 | 106 | args = None #global args |
107 | 107 | runmode_untouched = True |
108 | 108 | modelfile_extracted_meta = None |
| 109 | +calulated_gpu_overhead = 0 # may be populated at runtime, can also be missing if undetected |
109 | 110 | importvars_in_progress = False |
110 | 111 | has_multiplayer = False |
111 | 112 | has_audio_support = False |
@@ -1159,26 +1160,8 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath, |
1159 | 1160 | except Exception: |
1160 | 1161 | modelfile_extracted_meta = None |
1161 | 1162 |
|
1162 | | -def calculate_secondary_model_overheads(sdquant): |
1163 | | - cost = 0 |
1164 | | - if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax |
1165 | | - cost += 1024*1024*1024*(9 - sdquant * 1.5) # 9, 7.5, 6 |
1166 | | - elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax |
1167 | | - cost += 1024*1024*1024*(4.25 - sdquant * 0.5) # 4.25, 3.75, 3.25 |
1168 | | - if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax |
1169 | | - cost += max(350*1024*1024,modelfile_extracted_meta[4]*1.5) |
1170 | | - if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax |
1171 | | - cost += max(350*1024*1024,modelfile_extracted_meta[5]*1.5) |
1172 | | - if modelfile_extracted_meta[6] > 1024*1024*10: #draft model tax |
1173 | | - cost += (modelfile_extracted_meta[6] * 1.5) |
1174 | | - if modelfile_extracted_meta[7] > 1024*1024*10: #tts model tax |
1175 | | - cost += max(600*1024*1024, modelfile_extracted_meta[7] * 3) |
1176 | | - if modelfile_extracted_meta[8] > 1024*1024*10: #embeddings model tax |
1177 | | - cost += max(350*1024*1024, modelfile_extracted_meta[8] * 1.5) |
1178 | | - return cost |
1179 | | - |
1180 | 1163 | def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to determine how many layers to use |
1181 | | - global showusedmemwarning, showmultigpuwarning, modelfile_extracted_meta # reference cached values instead |
| 1164 | + global showusedmemwarning, showmultigpuwarning, modelfile_extracted_meta, calulated_gpu_overhead # reference cached values instead |
1182 | 1165 | gpumem = MaxMemory[0] |
1183 | 1166 | usedmem = 0 |
1184 | 1167 | if MaxFreeMemory[0]>0: |
@@ -1206,8 +1189,23 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete |
1206 | 1189 | print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.") |
1207 | 1190 | fsize *= total_parts |
1208 | 1191 |
|
1209 | | - extracost = calculate_secondary_model_overheads(sdquanted) |
1210 | | - mem -= extracost |
| 1192 | + calulated_gpu_overhead = 0 |
| 1193 | + if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax |
| 1194 | + calulated_gpu_overhead += 1024*1024*1024*(9 - sdquanted * 1.5) # 9, 7.5, 6 |
| 1195 | + elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax |
| 1196 | + calulated_gpu_overhead += 1024*1024*1024*(4.25 - sdquanted * 0.5) # 4.25, 3.75, 3.25 |
| 1197 | + if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax |
| 1198 | + calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[4]*1.5) |
| 1199 | + if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax |
| 1200 | + calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[5]*1.5) |
| 1201 | + if modelfile_extracted_meta[6] > 1024*1024*10: #draft model tax |
| 1202 | + calulated_gpu_overhead += (modelfile_extracted_meta[6] * 1.5) |
| 1203 | + if modelfile_extracted_meta[7] > 1024*1024*10: #tts model tax |
| 1204 | + calulated_gpu_overhead += max(600*1024*1024, modelfile_extracted_meta[7] * 3) |
| 1205 | + if modelfile_extracted_meta[8] > 1024*1024*10: #embeddings model tax |
| 1206 | + calulated_gpu_overhead += max(350*1024*1024, modelfile_extracted_meta[8] * 1.5) |
| 1207 | + |
| 1208 | + mem -= calulated_gpu_overhead |
1211 | 1209 | mem = 0 if mem < 0 else mem |
1212 | 1210 |
|
1213 | 1211 | csmul = (cs/4096) if cs >= 8192 else 1.8 if cs > 4096 else 1.2 if cs > 2048 else 1.0 |
@@ -1454,7 +1452,7 @@ def auto_set_backend_cli(): |
1454 | 1452 | print(f"Auto Selected Default Backend (flag={cpusupport})\n") |
1455 | 1453 |
|
1456 | 1454 | def load_model(model_filename): |
1457 | | - global args |
| 1455 | + global args, calulated_gpu_overhead |
1458 | 1456 | inputs = load_model_inputs() |
1459 | 1457 | inputs.model_filename = model_filename.encode("UTF-8") |
1460 | 1458 | inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten |
@@ -1497,7 +1495,7 @@ def load_model(model_filename): |
1497 | 1495 | inputs.quant_k = inputs.quant_v = 0 |
1498 | 1496 | inputs.batchsize = args.batchsize |
1499 | 1497 | inputs.autofit = args.autofit |
1500 | | - inputs.autofit_tax_mb = int(calculate_secondary_model_overheads(args.sdquant)/(1024*1024)) |
| 1498 | + inputs.autofit_tax_mb = calulated_gpu_overhead |
1501 | 1499 | inputs.gpulayers = args.gpulayers |
1502 | 1500 | if args.overridenativecontext and args.overridenativecontext>0: |
1503 | 1501 | inputs.overridenativecontext = args.overridenativecontext |
@@ -7830,6 +7828,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): |
7830 | 7828 | ssl._create_default_https_context = ssl._create_unverified_context |
7831 | 7829 |
|
7832 | 7830 | if args.gpulayers: |
| 7831 | + if args.autofit: |
| 7832 | + args.gpulayers = -1 |
7833 | 7833 | shouldavoidgpu = False |
7834 | 7834 | if args.usecpu and sys.platform!="darwin": |
7835 | 7835 | shouldavoidgpu = True |
|
0 commit comments