@@ -200,6 +200,7 @@ class load_model_inputs(ctypes.Structure):
200200 ("vulkan_info" , ctypes .c_char_p ),
201201 ("batchsize" , ctypes .c_int ),
202202 ("autofit" , ctypes .c_bool ),
203+ ("autofit_tax_mb" , ctypes .c_int ),
203204 ("gpulayers" , ctypes .c_int ),
204205 ("rope_freq_scale" , ctypes .c_float ),
205206 ("rope_freq_base" , ctypes .c_float ),
@@ -1158,6 +1159,24 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
11581159 except Exception :
11591160 modelfile_extracted_meta = None
11601161
def calculate_secondary_model_overheads(sdquant):
    """Estimate the extra memory, in bytes, taken up by secondary models.

    Reads the module-level ``modelfile_extracted_meta`` cache and adds up a
    heuristic "tax" for each secondary model whose cached size entry exceeds
    a threshold (entries 3..8: image generation, whisper, mmproj, draft,
    tts and embeddings models respectively).
    NOTE(review): the entries are presumably file sizes in bytes - confirm
    against extract_modelfile_params.

    Args:
        sdquant: image-model quantization level; each step lowers the image
            model tax (the inline comments list the values for 0, 1 and 2).

    Returns:
        The summed overhead in bytes (int or float). Returns 0 when no
        metadata has been cached, e.g. because extraction failed and the
        cache was reset to None, so callers do not need their own guard.
    """
    meta = modelfile_extracted_meta
    if not meta:
        # Metadata extraction may have failed; nothing to account for.
        return 0

    mb = 1024 * 1024
    gb = 1024 * mb

    cost = 0
    if meta[3] > 5 * gb:  # sdxl tax
        cost += gb * (9 - sdquant * 1.5)  # 9, 7.5, 6
    elif meta[3] > 512 * mb:  # normal sd tax
        cost += gb * (4.25 - sdquant * 0.5)  # 4.25, 3.75, 3.25
    if meta[4] > 10 * mb:  # whisper tax
        cost += max(350 * mb, meta[4] * 1.5)
    if meta[5] > 10 * mb:  # mmproj tax
        cost += max(350 * mb, meta[5] * 1.5)
    if meta[6] > 10 * mb:  # draft model tax (no minimum floor applied)
        cost += meta[6] * 1.5
    if meta[7] > 10 * mb:  # tts model tax
        cost += max(600 * mb, meta[7] * 3)
    if meta[8] > 10 * mb:  # embeddings model tax
        cost += max(350 * mb, meta[8] * 1.5)
    return cost
1179+
11611180def autoset_gpu_layers (ctxsize , sdquanted , bbs , qkv_level ): #shitty algo to determine how many layers to use
11621181 global showusedmemwarning , showmultigpuwarning , modelfile_extracted_meta # reference cached values instead
11631182 gpumem = MaxMemory [0 ]
@@ -1186,21 +1205,9 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
11861205 showmultigpuwarning = False
11871206 print ("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually." )
11881207 fsize *= total_parts
1189- sdquantsavings = sdquanted
1190- if modelfile_extracted_meta [3 ] > 1024 * 1024 * 1024 * 5 : #sdxl tax
1191- mem -= 1024 * 1024 * 1024 * (9 - sdquantsavings * 1.5 ) # 9, 7.5, 6
1192- elif modelfile_extracted_meta [3 ] > 1024 * 1024 * 512 : #normal sd tax
1193- mem -= 1024 * 1024 * 1024 * (4.25 - sdquantsavings * 0.5 ) # 4.25, 3.75, 3.25
1194- if modelfile_extracted_meta [4 ] > 1024 * 1024 * 10 : #whisper tax
1195- mem -= max (350 * 1024 * 1024 ,modelfile_extracted_meta [4 ]* 1.5 )
1196- if modelfile_extracted_meta [5 ] > 1024 * 1024 * 10 : #mmproj tax
1197- mem -= max (350 * 1024 * 1024 ,modelfile_extracted_meta [5 ]* 1.5 )
1198- if modelfile_extracted_meta [6 ] > 1024 * 1024 * 10 : #draft model tax
1199- mem -= (modelfile_extracted_meta [6 ] * 1.5 )
1200- if modelfile_extracted_meta [7 ] > 1024 * 1024 * 10 : #tts model tax
1201- mem -= max (600 * 1024 * 1024 , modelfile_extracted_meta [7 ] * 3 )
1202- if modelfile_extracted_meta [8 ] > 1024 * 1024 * 10 : #embeddings model tax
1203- mem -= max (350 * 1024 * 1024 , modelfile_extracted_meta [8 ] * 1.5 )
1208+
1209+ extracost = calculate_secondary_model_overheads (sdquanted )
1210+ mem -= extracost
12041211 mem = 0 if mem < 0 else mem
12051212
12061213 csmul = (cs / 4096 ) if cs >= 8192 else 1.8 if cs > 4096 else 1.2 if cs > 2048 else 1.0
@@ -1490,6 +1497,7 @@ def load_model(model_filename):
14901497 inputs .quant_k = inputs .quant_v = 0
14911498 inputs .batchsize = args .batchsize
14921499 inputs .autofit = args .autofit
1500+ inputs .autofit_tax_mb = int (calculate_secondary_model_overheads (args .sdquant )/ (1024 * 1024 ))
14931501 inputs .gpulayers = args .gpulayers
14941502 if args .overridenativecontext and args .overridenativecontext > 0 :
14951503 inputs .overridenativecontext = args .overridenativecontext
0 commit comments