
Commit 645b09e

Renamed promptlimit to genlimit; it now applies to API requests as well and can be set in the UI. Also hides the API info display when running in CLI mode.

1 parent: 3060dfb

File tree: klite.embd, koboldcpp.py

2 files changed: 27 additions (+), 3 deletions (-)

klite.embd

Lines changed: 14 additions & 0 deletions
@@ -3725,6 +3725,15 @@ Current version indicated by LITEVER below.
     "name":"OpenAI Harmony",
     "user":"<|start|>user<|message|>",
     "user_end":"<|end|>",
+    "assistant":"<|start|>assistant",
+    "assistant_end":"<|end|>",
+    "system":"<|start|>developer<|message|>",
+    "system_end":"<|end|>",
+    },
+    {
+    "name":"OpenAI Harmony Non-Thinking",
+    "user":"<|start|>user<|message|>",
+    "user_end":"<|end|>",
     "assistant":"<|start|>assistant<|channel|>final<|message|>",
     "assistant_end":"<|end|>",
     "system":"<|start|>developer<|message|>",
@@ -17063,6 +17072,8 @@ Current version indicated by LITEVER below.
             }
         }
     }
+    //special case for GPT-OSS, never use it as a stop sequence or stuff gets messed
+    seqs = seqs.filter(itm => !itm.includes("<|start|>assistant"));
     if(!localsettings.includedefaultstops)
     {
         seqs = [];
@@ -19012,6 +19023,9 @@ Current version indicated by LITEVER below.
         }
     }

+    //special case for GPT-OSS, never use it as a stop sequence or stuff gets messed
+    stripping_arr = stripping_arr.filter(itm => !itm.includes("<|start|>assistant"));
+
     //sometimes the OAI type endpoints get confused and repeat the instruct tag, so trim it
     for(let i=0;i<stripping_arr.length;++i)
     {
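
Both additions apply the same guard: any candidate stop sequence containing the Harmony assistant prefix is dropped, because every GPT-OSS reply begins with that prefix, so matching it would cut the response off immediately. A minimal Python rendering of the same filtering idea (the JS above is the actual code):

# Sketch of the guard: drop any stop sequence containing the assistant prefix.
ASSISTANT_PREFIX = "<|start|>assistant"

def filter_stop_sequences(seqs):
    return [s for s in seqs if ASSISTANT_PREFIX not in s]

print(filter_stop_sequences(["<|end|>", "<|start|>assistant<|channel|>final<|message|>"]))
# -> ['<|end|>']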

koboldcpp.py

Lines changed: 13 additions & 3 deletions
@@ -1510,6 +1510,8 @@ def generate(genparams, stream_flag=False):
     max_context_length = maxctx
     min_remain_hardlimit = max(min(max_context_length-4, 16),int(max_context_length*0.2))
     min_remain_softlimit = max(min(max_context_length-4, 16),int(max_context_length*0.4))
+    if args.genlimit > 0 and max_length > args.genlimit:
+        max_length = args.genlimit
     if max_length >= (max_context_length-min_remain_softlimit):
         print(f"\n!!! ====== !!!\nWarning: You are trying to generate text with max_length ({max_length}) near or exceeding max_context_length limit ({max_context_length}).\nMost of the context will be removed, and your outputs will not be very coherent.\nConsider launching with increased --contextsize to avoid issues.\n!!! ====== !!!")
     if max_length >= (max_context_length-min_remain_hardlimit):
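
This hunk is what makes genlimit apply to API requests: the client-supplied max_length is clamped server-side before the context-budget warnings run, and a genlimit of 0 (the new default) leaves requests untouched. A standalone sketch of that clamp, with illustrative names:

def clamp_gen_length(requested: int, genlimit: int) -> int:
    # genlimit <= 0 means "no cap", matching the launcher default of 0.
    if genlimit > 0 and requested > genlimit:
        return genlimit
    return requested

assert clamp_gen_length(4096, 512) == 512  # request above the cap: clamped
assert clamp_gen_length(256, 512) == 256   # request below the cap: untouched
assert clamp_gen_length(4096, 0) == 4096   # cap disabled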
@@ -4555,6 +4557,7 @@ def hide_tooltip(event):
     moeexperts_var = ctk.StringVar(value=str(-1))
     moecpu_var = ctk.StringVar(value=str(0))
     defaultgenamt_var = ctk.StringVar(value=str(768))
+    genlimit_var = ctk.StringVar(value=str(0))
     nobostoken_var = ctk.IntVar(value=0)
     override_kv_var = ctk.StringVar(value="")
     override_tensors_var = ctk.StringVar(value="")
@@ -5223,6 +5226,7 @@ def changerunmode(a,b,c):
     makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
     context_var.trace_add("write", changed_gpulayers_estimate)
     makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")
+    makelabelentry(tokens_tab, "Prompt Limit:", genlimit_var, row=20, padx=300, singleline=True, tooltip="If set, restricts max output tokens to this limit regardless of API request. Set to 0 to disable.",labelpadx=210)

     nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=146, singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.")
     customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=100, singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
@@ -5559,6 +5563,7 @@ def export_vars():
     args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
     args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0
     args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 768
+    args.genlimit = int(genlimit_var.get()) if genlimit_var.get()!="" else 0
     args.nobostoken = (nobostoken_var.get()==1)
     args.enableguidance = (enableguidance_var.get()==1)
     args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get()
@@ -5784,6 +5789,10 @@ def import_vars(dict):
         moecpu_var.set(dict["moecpu"])
     if "defaultgenamt" in dict and dict["defaultgenamt"]:
         defaultgenamt_var.set(dict["defaultgenamt"])
+    if "genlimit" in dict and dict["genlimit"]:
+        genlimit_var.set(dict["genlimit"])
+    else:
+        genlimit_var.set(str(0))
     nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
     enableguidance_var.set(dict["enableguidance"] if ("enableguidance" in dict) else 0)
     if "overridekv" in dict and dict["overridekv"]:
@@ -7362,7 +7371,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):

     print(f"======\nActive Modules: {' '.join(enabledmlist)}")
     print(f"Inactive Modules: {' '.join(disabledmlist)}")
-    print(f"Enabled APIs: {' '.join(apimlist)}")
+    if not args.cli:
+        print(f"Enabled APIs: {' '.join(apimlist)}")

     global sslvalid
     if args.ssl:
@@ -7447,7 +7457,7 @@ def onready_subprocess():
     else:
         save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
         benchmaxctx = maxctx
-        benchlen = args.promptlimit
+        benchlen = args.genlimit if args.genlimit > 0 else 100
         benchtemp = 0.1
         benchtopk = 1
         benchreppen = 1
@@ -7593,7 +7603,7 @@ def range_checker(arg: str):
     advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
     advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
     advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
-    advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
+    advparser.add_argument("--genlimit","--promptlimit", help="Sets the maximum number of generated tokens, it will restrict all generations to this or lower. Also usable with --prompt or --benchmark.",metavar=('[token limit]'), type=int, default=0)
     advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
     advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')
     advparser.add_argument("--websearch", help="Enable the local search engine proxy so Web Searches can be done.", action='store_true')
