KCPP benchmark, Croco revision

Nexesenex · Nexesenex · commit fe59f50fe86c · 2024-10-31T19:04:38.000+01:00
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -4727,8 +4727,19 @@ def onready_subprocess():
     if args.model_param and (args.benchmark or args.prompt):
         start_server = False
         save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
-        benchmaxctx = maxctx
-        benchlen = args.promptlimit
+        gpu0avram = int(MaxMemory[0]/1024/1024)
+        gpu1avram = int(MaxMemory[1]/1024/1024)
+        gpu2avram = int(MaxMemory[2]/1024/1024)
+        gpu3avram = int(MaxMemory[3]/1024/1024)
+        gpu0fvram = int(MaxFreeMemory[0]/1024/1024)
+        gpu1fvram = int(MaxFreeMemory[1]/1024/1024)
+        gpu2fvram = int(MaxFreeMemory[2]/1024/1024)
+        gpu3fvram = int(MaxFreeMemory[3]/1024/1024)
+        gpuavram = gpu0avram + gpu1avram + gpu2avram + gpu3avram
+        gpufvram = gpu0fvram + gpu1fvram + gpu2fvram + gpu3fvram
+        benchmaxctx = maxctx - 128
+        benchtg = args.promptlimit
+        benchpp = (benchmaxctx - benchtg)
         benchtemp = 0.1
         benchtopk = 1
         benchreppen = 1
@@ -4743,8 +4754,8 @@ def onready_subprocess():
             if not args.benchmark:
                 benchbaneos = False
         if args.benchmark:
-            if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
-                print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
+            if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 13000000:
+                print(f"\nWarning: The benchmark CSV output file you selected exceeds 13MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
                 save_to_file = False
             if save_to_file:
                 print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
@@ -4756,7 +4767,7 @@ def onready_subprocess():
                     benchprompt += benchprompt
         genp = {
             "prompt":benchprompt,
-            "max_length":benchlen,
+            "max_length":benchtg,
             "max_context_length":benchmaxctx,
             "temperature":benchtemp,
             "top_k":benchtopk,
@@ -4769,34 +4780,83 @@ def onready_subprocess():
             restore_stdout()
             print(result)
         if args.benchmark:
-            result = (result[:8] if len(result)>8 else "") if not args.prompt else result
-            t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
-            t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
-            s_pp = float(benchmaxctx-benchlen)/t_pp
-            s_gen = float(benchlen)/t_gen
+            result = (result[:4] if len(result)>4 else "") if not args.prompt else result
+            resultok = ((result==" 1 1") or (result=="1 1 "))
+            t_pp = float(handle.get_last_process_time())*float(benchpp)*0.001
+            t_gen = float(handle.get_last_eval_time())*float(benchtg)*0.001
+            s_pp = float(benchpp)/t_pp
+            s_gen = float(benchtg)/t_gen
             datetimestamp = datetime.now(timezone.utc)
-            benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
+
+            print(f"\nBench Completed - v{KcppVersion} ; LlamaCPP {LcppVersion}\nIf Cuda mode: {CudaSpecifics} ; Release date: {ReleaseDate}; Results:")
+
+            benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Offloaded layers={args.gpulayers} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
             print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
             print(f"Flags: {benchflagstr}")
             print(f"Timestamp: {datetimestamp}")
             print(f"Backend: {libname}")
             print(f"Layers: {args.gpulayers}")
             print(f"Model: {benchmodel}")
-            print(f"MaxCtx: {benchmaxctx}")
-            print(f"GenAmount: {benchlen}\n-----")
+            print(f"NoAVX2: {args.noavx2}")
+            print(f"NoBlas: {args.noblas}")
+            print(f"NoMmap: {args.nommap}")
+            print(f"HighPriority: {args.highpriority}")
+            print(f"FlashAttention: {args.flashattention}")
+            print(f"Threads: {args.threads}")
+            CUDevicesNames.sort(reverse=True)
+            if gpu0avram>0:
+                print(f"GPU 0 Name: {CUDevicesNames[0]}")
+            if gpu0avram>0:
+                print(f"GPU 0 VRAM: {gpu0avram} MiB")
+            if gpu0fvram>0:
+                print(f"GPU 0 VRAM: {gpu0fvram} MiB")
+            if gpu1avram>0:
+                print(f"GPU 1 Name: {CUDevicesNames[1]}")
+            if gpu1avram>0:
+                print(f"GPU 1 VRAM: {gpu1avram} MiB")
+            if gpu1fvram>0:
+                print(f"GPU 1 VRAM: {gpu1fvram} MiB")
+            if gpu2avram>0:
+                print(f"GPU 2 Name: {CUDevicesNames[2]}")
+            if gpu2avram>0:
+                print(f"GPU 2 VRAM: {gpu2avram} MiB")
+            if gpu2fvram>0:
+                print(f"GPU 2 VRAM: {gpu2fvram} MiB")
+            if gpu3avram>0:
+                print(f"GPU 3 Name: {CUDevicesNames[3]}")
+            if gpu3avram>0:
+                print(f"GPU 3 VRAM: {gpu3avram} MiB")
+            if gpu3fvram>0:
+                print(f"GPU 3 VRAM: {gpu3fvram} MiB")
+            if gpuavram > gpu0avram:
+                print(f"GPUs Total VRAM: {gpuavram} MiB")
+            if gpufvram > gpu0fvram:
+                print(f"GPUs Total VRAM: {gpufvram} MiB")
+            print(f"Cublas_Args: {args.usecublas}")
+            print(f"Layers: {args.gpulayers}")
+            print(f"Tensor_Split: {args.tensor_split}")
+            print(f"BlasThreads: {args.blasthreads}")
+            print(f"Blas_nBatchSize: {args.blasbatchsize}")
+            print(f"Blas_uBatchSize: {args.blasubatchsize}")
+            print(f"KV_cache: {args.quantkv}")
+            print(f"MaxCtx: {maxctx}\n-----")
+            print(f"PPnum: {benchpp}")
             print(f"ProcessingTime: {t_pp:.3f}s")
             print(f"ProcessingSpeed: {s_pp:.2f}T/s")
+            print(f"TGnum: {benchtg}")
             print(f"GenerationTime: {t_gen:.3f}s")
             print(f"GenerationSpeed: {s_gen:.2f}T/s")
+            print(f"BenchmarkCtx: {benchmaxctx}")
             print(f"TotalTime: {(t_pp+t_gen):.3f}s")
-            print(f"Output: {result}\n-----")
+            print(f"Output: {result}")
+            print(f"Coherent: {resultok}")
             if save_to_file:
                 try:
                     with open(args.benchmark, "a") as file:
                         file.seek(0, 2)
                         if file.tell() == 0: #empty file
-                            file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
-                        file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
+                            file.write(f"Datime,KCPPF,LCPP,Backend,CudaSpecifics,Model,NoAvx2,NoBlas,NoMmap,HighP,FlashA,Thrd,VRAM,FVRAM0,Layers,BlasThrd,BBSizeN,BBSizeU,KVC,PPNum,PPTime,PPSpeed,TGNum,TGTime,TGSpeed,BenchCtx,TotalTime,Coher,Tensor1,Split2,Cublas1,Argument2,Argument3,Argument4")
+                        file.write(f"\n{ReleaseDate},{KcppVersion},{LcppVersion},{libname},{CudaSpecifics},{benchmodel},{args.noavx2},{args.noblas},{args.nommap},{args.highpriority},{args.flashattention},{args.threads},{gpuavram},{gpu0fvram},{args.gpulayers},{args.blasthreads},{args.blasbatchsize},{args.blasubatchsize},{args.quantkv},{benchpp},{t_pp:.3f},{s_pp:.2f},{benchtg},{t_gen:.3f},{s_gen:.2f},{benchmaxctx},{(t_pp+t_gen):.3f},{resultok},{args.tensor_split},,{args.usecublas},,,")
                 except Exception as e:
                     print(f"Error writing benchmark to file: {e}")
             global using_gui_launcher