@@ -693,7 +693,8 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
693693 print ("***" )
694694
695695 overhead = 250 * 1024 * 1024
696- reservedmem0 = (overhead + usedmem0 ) # determine vram overhead
696+ reservedmem0 = (overhead * 2 + usedmem0 ) # determine vram overhead
697+
697698 mem0 = gpumem0 - reservedmem0
698699 if mem0 < 0 :
699700 mem0 = 0
@@ -747,9 +748,9 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
747748 try :
748749 if not modelfile_extracted_meta :
749750 return 0
750- layerlimit = 0
751+ layerlimit_intermed = 0
751752 fsize = modelfile_extracted_meta [1 ]
752- print (f"Initial layer limit: { layerlimit } ; Model size: { fsize } MiB ; context size: { ctxsize } tokens" )
753+ print (f"Initial layer limit: { layerlimit_intermed } ; Model size: { fsize } MiB ; context size: { ctxsize } tokens" )
753754 print (f"GPUs global reserved VRAM: { reservedmem / 1024 / 1024 } MiB (Toral occupied VRAM + Total overhead) ; GPUs total usable VRAM: { mem / 1024 / 1024 } MiB" )
754755
755756 if fsize > 10000000 : #dont bother with models < 10mb
@@ -778,20 +779,22 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
778779 fa = flashattention
779780 fa_ratio = 1
780781 if fa == 1 :
781- fa_ratio = 0.5
782+ fa_ratio = 0.25
782783
783784 mmq = mmqmode
784785 mmq_ratio = 1
785786 if mmq == 1 :
786- mmq_ratio = 0.5
787+ mmq_ratio = 0.25
787788
788789 lv = lowvram
789790 lvctx_ratio = 1
790791 if lv == 1 :
791792 lvctx_ratio = 0
792793 lvcomp_ratio = 1
793794 if lv == 1 :
794- lvcomp_ratio = 0.5
795+ lvcomp_ratio = 4
796+
797+ demult = 1.03
795798
796799 kvq = quantkv
797800 kvbpw = 0
@@ -884,16 +887,16 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
884887 print (f"Failure to read metadata or no layers number declared. Fallback calculations." )
885888 sizeperlayer = int (fsize * csmul * 0.025 )
886889 layers = (fsize / sizeperlayer )
887- print (f"Size per layer = Model size { fsize / 1024 / 1024 :.3f} MiB x 0.052 x { csmul } (CCBM); Estimated number of layers = { layers } " )
888- layerlimit = int (min (200 ,mem / sizeperlayer ))
889- print (f"Size per layer: { sizeperlayer / 1024 / 1024 } MiB ; layers limit: { layerlimit } + offset of { layer_offset } layers if <200, else 200." )
890+ print (f"Size per layer = Model size { fsize / 1024 / 1024 :.3f} MiB x 0.052 x { csmul } (CCBM); Brute estimated number of layers = { fsize / 1024 / 1024 :.3f } MiB / { sizeperlayer / 1024 / 1024 :.3f } MiB = { layers } llayers " )
891+ layerlimit_intermed = int (min (200 ,mem / sizeperlayer ))
892+ print (f"Size per layer: { sizeperlayer / 1024 / 1024 } MiB ; layers limit: { layerlimit_intermed } + offset of { layer_offset } layers if <200, else 200." )
890893 print ("***" )
891894 else :
892895 print (f"Success to read metadata, proceeding with more elaborate calculations..." )
893896 layers = ggufmeta [0 ]
894897 headcount = ggufmeta [1 ]
895898 if headcount == 0 :
896- headcount = layers
899+ headcount = layers * 1.5
897900 print (f"Retrieved number of Model layers: { layers } ; Missing number of attention heads, thus based on the number of layers: { headcount } " )
898901 headkvlen = (ggufmeta [2 ] if ggufmeta [2 ] > 0 else 128 )
899902 sizeperlayer = int (fsize / (layers + 1 ))
@@ -907,21 +910,27 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
907910 loaded_layers = (layers * ratio_init )
908911 loaded_layers_size = int (loaded_layers * sizeperlayer )
909912 print (f"Initially loaded layers: { loaded_layers :.3f} ; Size per layer: { sizeperlayer / 1024 / 1024 :.3f} MiB ; Loaded layer size { loaded_layers_size / 1024 / 1024 :.3f} MiB" )
910- print (f"context size: { cs } tokens ; GPU usable VRAM: { mem / 1024 / 1024 } MiB ; quant_kv_bpw : { kvbpw } bpw" )
911- context_buffer = int (layers * headcount * headkvlen * cs * lvctx_ratio * kvbpw / 8 )
912- compute_buffer = int (layers * bbs_ratio * mmq_ratio * fa_ratio * headkvlen * cs * lvcomp_ratio * 4 * 1.01 )
913+ print (f"Context size: { cs } tokens ; GPU usable VRAM: { mem / 1024 / 1024 } MiB ; quant_kv_bpw : mode { kvq } , { kvbpw } bpw" )
914+
915+ context_buffer = int (layers * headcount * headkvlen * cs * kvbpw / 8 * lvctx_ratio )
916+ print (f"Context buffer : { layers } layers x { headcount } heads x { headkvlen } of KVH length x { cs } ctx x { kvbpw / 8 } kvq_bpw x { lvctx_ratio } = { context_buffer / 1024 / 1024 } MiB" )
917+
918+ compute_buffer = int (layers * bbs_ratio * mmq_ratio * fa_ratio * headkvlen * cs * lvcomp_ratio ** 4 * 1.25 )
919+ print (f"Compute buffer : { layers } layers x { bbs / 128 } x { mmq_ratio } MMQ shink x { fa_ratio } FA shrink x { headkvlen } of KVH length x { cs } ctx x { lvcomp_ratio } lowvram bump x 4 x 1.25 = { compute_buffer / 1024 / 1024 } MiB" )
920+
913921 total_buffer = int (context_buffer + compute_buffer )
914- loaded_size = int (fsize * 1.03 + context_buffer )
922+ print (f"Total_buffer: { total_buffer / 1024 / 1024 :.3f} MiB = Context buffer: { context_buffer / 1024 / 1024 } MiB + Compute buffer: { compute_buffer / 1024 / 1024 :.3f} MiB" )
923+
924+ loaded_size = int (fsize + context_buffer )
915925 ratio_formula = (mem - compute_buffer )/ loaded_size
916- print (f"Context buffer: { context_buffer / 1024 / 1024 } MiB + Compute buffer: { compute_buffer / 1024 / 1024 :.3f} MiB = Total_buffer: { total_buffer / 1024 / 1024 :.3f} MiB" )
917926 print (f"Loaded size: { loaded_size / 1024 / 1024 :.3f} MiB ; Formula ratio: { ratio_formula :.3f} " )
918927 ratio = max (ratio_init ,ratio_formula )
919928 print ("***" )
920929 else :
921930 ratio = ratio_init
922- layerlimit = int (ratio * layers )
923- print (f"Layers limit: { layerlimit } = final ratio { ratio :.3f} x { layers } layers + eventual manual offset." )
924- estimated_loaded_size = int (layerlimit * sizeperlayer + total_buffer )
931+ layerlimit_intermed = int (ratio * layers )
932+ print (f"Layers limit: { layerlimit_intermed } = final ratio { ratio :.3f} x { layers } layers + eventual manual offset." )
933+ estimated_loaded_size = int (layerlimit_intermed * sizeperlayer + total_buffer )
925934 print (f"Estimated loaded size in the GPU(s): { estimated_loaded_size / 1024 / 1024 :.3f} MiB" )
926935 else :
927936 print (f"GPU usable VRAM : { mem / 1024 / 1024 } MiB > { size_init / 1024 / 1024 :.3f} MiB initially to load." )
@@ -933,18 +942,21 @@ def autoset_gpu_layers(ctxsize,sdquanted,blasbatchsize,flashattention,quantkv,mm
933942 sizeperlayer = int (fsize * csmul * 0.052 )
934943 layers = (fsize / sizeperlayer )
935944 print (f"Size per layer = Model size { fsize / 1024 / 1024 :.3f} MiB x 0.052 x { csmul } (CCBM); Estimated number of layers = { layers } " )
936- layerlimit = int (min (200 ,mem / sizeperlayer ))
937- print (f"Size per layer: { sizeperlayer / 1024 / 1024 } MiB ; layers limit: { layerlimit } + eventual offset if <200, else 200." )
945+ layerlimit_intermed = int (min (200 ,mem / sizeperlayer ))
946+ print (f"Size per layer: { sizeperlayer / 1024 / 1024 } MiB ; layers limit: { layerlimit_intermed } + eventual offset if <200, else 200." )
938947 print ("***" )
939948 else :
940- layerlimit = ggufmeta [0 ] + 3
949+ layerlimit_intermed = ggufmeta [0 ] + 3
941950 print (f"Metadata are read." )
942951 layers = ggufmeta [0 ]
943952 print (f"Layers limit is { ggufmeta [0 ]} + fixed offset of 3." )
944- print (f"Layers limit: { layerlimit } = final ratio { ratio_init :.3f} x { layers } layers." )
953+ print (f"Layers limit: { layerlimit_intermed } = final ratio { ratio_init :.3f} x { layers } layers." )
945954 print ("***" )
946- layerlimit = (0 if layerlimit <= 2 else (layerlimit + layer_offset ))
947- print (f"Layers limit: { layerlimit } , including manual layer offset of { layer_offset } ." )
955+ partial_offload_penalty = 0.95 if (layerlimit_intermed + layer_offset ) < layers + 1 else 1
956+ print (f"Partial offload penalty: { partial_offload_penalty } " )
957+ layerlimit = 0 if layerlimit_intermed <= 2 else int (layerlimit_intermed * partial_offload_penalty + layer_offset )
958+ # layerlimit = (0 if layerlimit_intermed<=2 else int((layerlimit_intermed+layer_offset)*(0.5 if (layerlimit_intermed+layer_offset) < layers+1 else 1)))
959+ print (f"Layers limit: { layerlimit } = { layerlimit_intermed } x { partial_offload_penalty } + { layer_offset } layers offset." )
948960 print ("***" )
949961 print ("***" )
950962 return layerlimit
0 commit comments