Skip to content

Commit fedd529

Browse files
committed
autofit counts overheads
1 parent edfc961 commit fedd529

File tree

3 files changed

+27
-16
lines changed

3 files changed

+27
-16
lines changed

expose.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ struct load_model_inputs
5858
const char * vulkan_info = nullptr;
5959
const int batchsize = 512;
6060
const bool autofit = false;
61+
const int autofit_tax_mb = 0;
6162
const int gpulayers = 0;
6263
const float rope_freq_scale = 1.0f;
6364
const float rope_freq_base = 10000.0f;

gpttype_adapter.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2492,15 +2492,17 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24922492
if(inputs.autofit)
24932493
{
24942494
common_params temp_params;
2495+
size_t taxmb = 1024 + inputs.autofit_tax_mb;
24952496
printf("\nAttempting to use llama.cpp's automating fitting code. This will override all your layer configs, may or may not work!\n");
24962497
//zero out any customizations made
24972498
tenos.clear();
24982499
tenos.push_back({nullptr, nullptr});
24992500
model_params.tensor_buft_overrides = tenos.data();
25002501
model_params.tensor_split = tensor_split_temp;
25012502
model_params.n_gpu_layers = 999; //must be this value to be considered default
2503+
printf("Autofit Reserve Space: %d MB\n",taxmb);
25022504
llama_params_fit(kcpp_data->model_filename.c_str(), &model_params, &llama_ctx_params,
2503-
tensor_split_temp, tenos.data(), 1024*1024*1024, kcpp_data->n_ctx,
2505+
tensor_split_temp, tenos.data(), taxmb*1024*1024, kcpp_data->n_ctx,
25042506
GGML_LOG_LEVEL_DEBUG);
25052507
printf("Autofit Result: ");
25062508
print_fitted_params(model_params,llama_ctx_params);

koboldcpp.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ class load_model_inputs(ctypes.Structure):
200200
("vulkan_info", ctypes.c_char_p),
201201
("batchsize", ctypes.c_int),
202202
("autofit", ctypes.c_bool),
203+
("autofit_tax_mb", ctypes.c_int),
203204
("gpulayers", ctypes.c_int),
204205
("rope_freq_scale", ctypes.c_float),
205206
("rope_freq_base", ctypes.c_float),
@@ -1158,6 +1159,24 @@ def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath,
11581159
except Exception:
11591160
modelfile_extracted_meta = None
11601161

1162+
def calculate_secondary_model_overheads(sdquant):
    """Estimate VRAM overhead (in bytes) reserved for secondary models.

    Reads the cached ``modelfile_extracted_meta`` global (indices 3..8 are the
    file sizes of the SD, whisper, mmproj, draft, TTS and embeddings models)
    and sums a per-model "tax" for each one that is present.

    sdquant: image-model quantization level (0, 1 or 2); higher quant levels
    shrink the SD/SDXL reservation.
    Returns 0 when metadata was never extracted or extraction failed
    (``modelfile_extracted_meta`` is ``None`` — see extract_modelfile_params,
    which sets it to None on exception).
    """
    # Guard: metadata may be absent (module not yet scanned) or None (scan
    # failed). Without this, the unconditional call at load time crashes with
    # a TypeError on subscripting None.
    meta = globals().get("modelfile_extracted_meta")
    if not meta or len(meta) < 9:
        return 0
    cost = 0
    if meta[3] > 1024*1024*1024*5: #sdxl tax
        cost += 1024*1024*1024*(9 - sdquant * 1.5) # 9, 7.5, 6
    elif meta[3] > 1024*1024*512: #normal sd tax
        cost += 1024*1024*1024*(4.25 - sdquant * 0.5) # 4.25, 3.75, 3.25
    if meta[4] > 1024*1024*10: #whisper tax
        cost += max(350*1024*1024,meta[4]*1.5)
    if meta[5] > 1024*1024*10: #mmproj tax
        cost += max(350*1024*1024,meta[5]*1.5)
    if meta[6] > 1024*1024*10: #draft model tax
        cost += (meta[6] * 1.5)
    if meta[7] > 1024*1024*10: #tts model tax
        cost += max(600*1024*1024, meta[7] * 3)
    if meta[8] > 1024*1024*10: #embeddings model tax
        cost += max(350*1024*1024, meta[8] * 1.5)
    return cost
1179+
11611180
def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to determine how many layers to use
11621181
global showusedmemwarning, showmultigpuwarning, modelfile_extracted_meta # reference cached values instead
11631182
gpumem = MaxMemory[0]
@@ -1186,21 +1205,9 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
11861205
showmultigpuwarning = False
11871206
print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.")
11881207
fsize *= total_parts
1189-
sdquantsavings = sdquanted
1190-
if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
1191-
mem -= 1024*1024*1024*(9 - sdquantsavings * 1.5) # 9, 7.5, 6
1192-
elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
1193-
mem -= 1024*1024*1024*(4.25 - sdquantsavings * 0.5) # 4.25, 3.75, 3.25
1194-
if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
1195-
mem -= max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
1196-
if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
1197-
mem -= max(350*1024*1024,modelfile_extracted_meta[5]*1.5)
1198-
if modelfile_extracted_meta[6] > 1024*1024*10: #draft model tax
1199-
mem -= (modelfile_extracted_meta[6] * 1.5)
1200-
if modelfile_extracted_meta[7] > 1024*1024*10: #tts model tax
1201-
mem -= max(600*1024*1024, modelfile_extracted_meta[7] * 3)
1202-
if modelfile_extracted_meta[8] > 1024*1024*10: #embeddings model tax
1203-
mem -= max(350*1024*1024, modelfile_extracted_meta[8] * 1.5)
1208+
1209+
extracost = calculate_secondary_model_overheads(sdquanted)
1210+
mem -= extracost
12041211
mem = 0 if mem < 0 else mem
12051212

12061213
csmul = (cs/4096) if cs >= 8192 else 1.8 if cs > 4096 else 1.2 if cs > 2048 else 1.0
@@ -1490,6 +1497,7 @@ def load_model(model_filename):
14901497
inputs.quant_k = inputs.quant_v = 0
14911498
inputs.batchsize = args.batchsize
14921499
inputs.autofit = args.autofit
1500+
inputs.autofit_tax_mb = int(calculate_secondary_model_overheads(args.sdquant)/(1024*1024))
14931501
inputs.gpulayers = args.gpulayers
14941502
if args.overridenativecontext and args.overridenativecontext>0:
14951503
inputs.overridenativecontext = args.overridenativecontext

0 commit comments

Comments (0)