Skip to content

Commit 3a0610e

Browse files
committed
Good enough M-GPU AL for now
1 parent b9babcc commit 3a0610e

File tree

1 file changed

+36
-24
lines changed

1 file changed

+36
-24
lines changed

koboldcpp.py

Lines changed: 36 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -693,7 +693,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
693693
print("***")
694694

695695
overhead = 250*1024*1024
696-
reservedmem0 = (overhead + usedmem0) # determine vram overhead
696+
reservedmem0 = (overhead*2 + usedmem0) # determine vram overhead
697+
697698
mem0 = gpumem0 - reservedmem0
698699
if mem0 < 0:
699700
mem0 = 0
@@ -747,9 +748,9 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
747748
try:
748749
if not modelfile_extracted_meta:
749750
return 0
750-
layerlimit = 0
751+
layerlimit_intermed = 0
751752
fsize = modelfile_extracted_meta[1]
752-
print(f"Initial layer limit: {layerlimit} ; Model size: {fsize} MiB ; context size: {ctxsize} tokens")
753+
print(f"Initial layer limit: {layerlimit_intermed} ; Model size: {fsize} MiB ; context size: {ctxsize} tokens")
753754
print(f"GPUs global reserved VRAM: {reservedmem/1024/1024} MiB (Toral occupied VRAM + Total overhead) ; GPUs total usable VRAM: {mem/1024/1024} MiB")
754755

755756
if fsize>10000000: #dont bother with models < 10mb
@@ -778,20 +779,22 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
778779
fa = flashattention
779780
fa_ratio = 1
780781
if fa == 1:
781-
fa_ratio = 0.5
782+
fa_ratio = 0.25
782783

783784
mmq = mmqmode
784785
mmq_ratio = 1
785786
if mmq == 1:
786-
mmq_ratio = 0.5
787+
mmq_ratio = 0.25
787788

788789
lv = lowvram
789790
lvctx_ratio = 1
790791
if lv == 1:
791792
lvctx_ratio = 0
792793
lvcomp_ratio = 1
793794
if lv == 1:
794-
lvcomp_ratio = 0.5
795+
lvcomp_ratio = 4
796+
797+
demult = 1.03
795798

796799
kvq = quantkv
797800
kvbpw = 0
@@ -884,16 +887,16 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
884887
print(f"Failure to read metadata or no layers number declared. Fallback calculations.")
885888
sizeperlayer = int(fsize*csmul*0.025)
886889
layers = (fsize/sizeperlayer)
887-
print(f"Size per layer = Model size {fsize/1024/1024:.3f} MiB x 0.052 x {csmul} (CCBM); Estimated number of layers = {layers}")
888-
layerlimit = int(min(200,mem/sizeperlayer))
889-
print(f"Size per layer: {sizeperlayer/1024/1024} MiB ; layers limit: {layerlimit} + offset of {layer_offset} layers if <200, else 200.")
890+
print(f"Size per layer = Model size {fsize/1024/1024:.3f} MiB x 0.052 x {csmul} (CCBM); Brute estimated number of layers = {fsize/1024/1024:.3f} MiB / {sizeperlayer/1024/1024:.3f} MiB = {layers} llayers")
891+
layerlimit_intermed = int(min(200,mem/sizeperlayer))
892+
print(f"Size per layer: {sizeperlayer/1024/1024} MiB ; layers limit: {layerlimit_intermed} + offset of {layer_offset} layers if <200, else 200.")
890893
print("***")
891894
else:
892895
print(f"Success to read metadata, proceeding with more elaborate calculations...")
893896
layers = ggufmeta[0]
894897
headcount = ggufmeta[1]
895898
if headcount == 0:
896-
headcount = layers
899+
headcount = layers*1.5
897900
print(f"Retrieved number of Model layers: {layers} ; Missing number of attention heads, thus based on the number of layers: {headcount}")
898901
headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
899902
sizeperlayer = int(fsize/(layers+1))
@@ -907,21 +910,27 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
907910
loaded_layers = (layers*ratio_init)
908911
loaded_layers_size = int(loaded_layers * sizeperlayer)
909912
print(f"Initially loaded layers: {loaded_layers:.3f} ; Size per layer: {sizeperlayer/1024/1024:.3f} MiB ; Loaded layer size {loaded_layers_size/1024/1024:.3f} MiB")
910-
print(f"context size: {cs} tokens ; GPU usable VRAM: {mem/1024/1024} MiB ; quant_kv_bpw : {kvbpw} bpw")
911-
context_buffer = int(layers*headcount*headkvlen*cs*lvctx_ratio*kvbpw/8)
912-
compute_buffer = int(layers*bbs_ratio*mmq_ratio*fa_ratio*headkvlen*cs*lvcomp_ratio*4*1.01)
913+
print(f"Context size: {cs} tokens ; GPU usable VRAM: {mem/1024/1024} MiB ; quant_kv_bpw : mode {kvq}, {kvbpw} bpw")
914+
915+
context_buffer = int(layers*headcount*headkvlen*cs*kvbpw/8*lvctx_ratio)
916+
print(f"Context buffer : {layers} layers x {headcount} heads x {headkvlen} of KVH length x {cs} ctx x {kvbpw/8} kvq_bpw x {lvctx_ratio} = {context_buffer/1024/1024} MiB")
917+
918+
compute_buffer = int(layers*bbs_ratio*mmq_ratio*fa_ratio*headkvlen*cs*lvcomp_ratio**4*1.25)
919+
print(f"Compute buffer : {layers} layers x {bbs/128} x {mmq_ratio} MMQ shink x {fa_ratio} FA shrink x {headkvlen} of KVH length x {cs} ctx x {lvcomp_ratio} lowvram bump x 4 x 1.25 = {compute_buffer/1024/1024} MiB")
920+
913921
total_buffer = int(context_buffer + compute_buffer)
914-
loaded_size = int(fsize*1.03 + context_buffer)
922+
print(f"Total_buffer: {total_buffer/1024/1024:.3f} MiB = Context buffer: {context_buffer/1024/1024} MiB + Compute buffer: {compute_buffer/1024/1024:.3f} MiB")
923+
924+
loaded_size = int(fsize + context_buffer)
915925
ratio_formula = (mem - compute_buffer)/loaded_size
916-
print(f"Context buffer: {context_buffer/1024/1024} MiB + Compute buffer: {compute_buffer/1024/1024:.3f} MiB = Total_buffer: {total_buffer/1024/1024:.3f} MiB")
917926
print(f"Loaded size: {loaded_size/1024/1024:.3f} MiB ; Formula ratio: {ratio_formula:.3f}")
918927
ratio = max(ratio_init,ratio_formula)
919928
print("***")
920929
else:
921930
ratio = ratio_init
922-
layerlimit = int(ratio*layers)
923-
print(f"Layers limit: {layerlimit} = final ratio {ratio:.3f} x {layers} layers + eventual manual offset.")
924-
estimated_loaded_size = int(layerlimit*sizeperlayer + total_buffer)
931+
layerlimit_intermed = int(ratio*layers)
932+
print(f"Layers limit: {layerlimit_intermed} = final ratio {ratio:.3f} x {layers} layers + eventual manual offset.")
933+
estimated_loaded_size = int(layerlimit_intermed*sizeperlayer + total_buffer)
925934
print(f"Estimated loaded size in the GPU(s): {estimated_loaded_size/1024/1024:.3f} MiB")
926935
else:
927936
print(f"GPU usable VRAM : {mem/1024/1024} MiB > {size_init/1024/1024:.3f} MiB initially to load.")
@@ -933,18 +942,21 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
933942
sizeperlayer = int(fsize*csmul*0.052)
934943
layers = (fsize/sizeperlayer)
935944
print(f"Size per layer = Model size {fsize/1024/1024:.3f} MiB x 0.052 x {csmul} (CCBM); Estimated number of layers = {layers}")
936-
layerlimit = int(min(200,mem/sizeperlayer))
937-
print(f"Size per layer: {sizeperlayer/1024/1024} MiB ; layers limit: {layerlimit} + eventual offset if <200, else 200.")
945+
layerlimit_intermed = int(min(200,mem/sizeperlayer))
946+
print(f"Size per layer: {sizeperlayer/1024/1024} MiB ; layers limit: {layerlimit_intermed} + eventual offset if <200, else 200.")
938947
print("***")
939948
else:
940-
layerlimit = ggufmeta[0] + 3
949+
layerlimit_intermed = ggufmeta[0] + 3
941950
print(f"Metadata are read.")
942951
layers = ggufmeta[0]
943952
print(f"Layers limit is {ggufmeta[0]} + fixed offset of 3.")
944-
print(f"Layers limit: {layerlimit} = final ratio {ratio_init:.3f} x {layers} layers.")
953+
print(f"Layers limit: {layerlimit_intermed} = final ratio {ratio_init:.3f} x {layers} layers.")
945954
print("***")
946-
layerlimit = (0 if layerlimit<=2 else (layerlimit+layer_offset))
947-
print(f"Layers limit: {layerlimit}, including manual layer offset of {layer_offset}.")
955+
partial_offload_penalty = 0.95 if (layerlimit_intermed+layer_offset) < layers+1 else 1
956+
print(f"Partial offload penalty: {partial_offload_penalty}")
957+
layerlimit = 0 if layerlimit_intermed<=2 else int(layerlimit_intermed*partial_offload_penalty+layer_offset)
958+
# layerlimit = (0 if layerlimit_intermed<=2 else int((layerlimit_intermed+layer_offset)*(0.5 if (layerlimit_intermed+layer_offset) < layers+1 else 1)))
959+
print(f"Layers limit: {layerlimit} = {layerlimit_intermed} x {partial_offload_penalty} + {layer_offset} layers offset.")
948960
print("***")
949961
print("***")
950962
return layerlimit

0 commit comments

Comments
 (0)