Skip to content

Commit f97a844

Browse files
author
prima
committed
Merge remote-tracking branch 'origin/concedo_experimental' into remoteManagement
2 parents 19a6f48 + 5c4ad39 commit f97a844

File tree

3 files changed

+59
-7
lines changed

3 files changed

+59
-7
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,12 @@ KoboldCpp is an easy-to-use AI text-generation software for GGML and GGUF models
139139
- LLM text generation (Supports all GGML and GGUF models, backwards compatibility with ALL past models)
140140
- Image Generation (Stable Diffusion 1.5, SDXL, SD3, Flux)
141141
- Speech-To-Text (Voice Recognition) via Whisper
142-
- Text-To-Speech (Voice Generation) via OuteTTS
142+
- Text-To-Speech (Voice Generation) via OuteTTS, Kokoro, Parler and Dia
143143
- Provides many compatible API endpoints for many popular webservices (KoboldCppApi OpenAiApi OllamaApi A1111ForgeApi ComfyUiApi WhisperTranscribeApi XttsApi OpenAiSpeechApi)
144144
- Bundled KoboldAI Lite UI with editing tools, save formats, memory, world info, author's note, characters, scenarios.
145145
- Includes multiple modes (chat, adventure, instruct, storywriter) and UI Themes (aesthetic roleplay, classic writer, corporate assistant, messenger)
146146
- Supports loading Tavern Character Cards, importing many different data formats from various sites, reading or exporting JSON savefiles and persistent stories.
147-
- Many other features including new samplers, regex support, websearch, RAG via TextDB and more.
147+
- Many other features including new samplers, regex support, websearch, RAG via TextDB, image recognition/vision and more.
148148
- Ready-to-use binaries for Windows, MacOS, Linux. Runs directly with Colab, Docker, also supports other platforms if self-compiled (like Android (via Termux) and Raspberry PI).
149149
- [Need help finding a model? Read this!](https://github.com/LostRuins/koboldcpp/wiki#getting-an-ai-model-file)
150150

klite.embd

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3466,6 +3466,7 @@ Current version indicated by LITEVER below.
34663466
var custom_claude_model = "";
34673467
var uses_cors_proxy = false; //we start off attempting a direct connection. switch to proxy if that fails
34683468
var synchro_polled_response = null;
3469+
var synchro_polled_respimg = null; //sometimes a LLM response can also include an image.
34693470
var last_stop_reason = ""; //update stop reason if known
34703471
var synchro_pending_stream = ""; //used for storing incomplete streaming text
34713472
var streaming_was_thinking = false; //used as a switch to determine when thinking ends, to wrap output in tags
@@ -5912,10 +5913,13 @@ Current version indicated by LITEVER below.
59125913
if(x.ok)
59135914
{
59145915
return x;
5915-
}else{
5916-
throw new Error('Error occurred while SSE streaming: ' + (x.statusText));
5917-
return null;
59185916
}
5917+
return x.text().then(errorBody => {
5918+
throw new Error(`Error occurred while SSE streaming: ${x.statusText} - ${errorBody}`);
5919+
}).catch(bodyReadError => {
5920+
throw new Error(`${bodyReadError}`);
5921+
});
5922+
59195923
})
59205924
.then(resp => {
59215925
resp.body
@@ -6056,8 +6060,13 @@ Current version indicated by LITEVER below.
60566060
synchro_polled_response = data.candidates[0].output;
60576061
}else if (custom_gemini_key != "" && data.candidates != null && data.candidates.length>0 && data.candidates[0].content && data.candidates[0].content.parts != null && data.candidates[0].content.parts.length>0) {
60586062
synchro_polled_response = "";
6063+
synchro_polled_respimg = null;
60596064
for(let x=0;x<data.candidates[0].content.parts.length;++x)
60606065
{
6066+
if(!synchro_polled_respimg && data.candidates[0].content.parts[x].inlineData && data.candidates[0].content.parts[x].inlineData.data)
6067+
{
6068+
synchro_polled_respimg = data.candidates[0].content.parts[x].inlineData.data;
6069+
}
60616070
if(!data.candidates[0].content.parts[x].text)
60626071
{
60636072
continue;
@@ -6221,6 +6230,10 @@ Current version indicated by LITEVER below.
62216230
{
62226231
for(let x=0;x<event.data.candidates[0].content.parts.length;++x)
62236232
{
6233+
if(!synchro_polled_respimg && event.data.candidates[0].content.parts[x].inlineData && event.data.candidates[0].content.parts[x].inlineData.data)
6234+
{
6235+
synchro_polled_respimg = event.data.candidates[0].content.parts[x].inlineData.data;
6236+
}
62246237
if(event.data.candidates[0].content.parts[x].thought)
62256238
{
62266239
streaming_was_thinking = true;
@@ -15027,6 +15040,7 @@ Current version indicated by LITEVER below.
1502715040
pending_response_id = "";
1502815041
poll_in_progress = false;
1502915042
synchro_polled_response = null;
15043+
synchro_polled_respimg = null;
1503015044
last_stop_reason = "";
1503115045
synchro_pending_stream = "";
1503215046
streaming_was_thinking = false;
@@ -15052,6 +15066,7 @@ Current version indicated by LITEVER below.
1505215066
nextgeneratedimagemilestone = generateimagesinterval;
1505315067
pending_response_id = "";
1505415068
synchro_polled_response = null;
15069+
synchro_polled_respimg = null;
1505515070
last_stop_reason = "";
1505615071
synchro_pending_stream = "";
1505715072
streaming_was_thinking = false;
@@ -16778,7 +16793,12 @@ Current version indicated by LITEVER below.
1677816793
retry_in_progress = false;
1677916794

1678016795
//match the request for creating images in instruct modes
16781-
if(newgen!="" && localsettings.img_gen_from_instruct && localsettings.opmode == 4 && localsettings.generate_images_mode!=0 && localsettings.img_autogen_type!=2 && !newgen.includes("\n"))
16796+
let model_allow_imggen = localsettings.img_gen_from_instruct;
16797+
if(custom_gemini_key!="" && document.getElementById("custom_gemini_model").value=="gemini-2.5-flash-image-preview")
16798+
{
16799+
model_allow_imggen = false; //nano banana interferes with imggen instructions
16800+
}
16801+
if(newgen!="" && model_allow_imggen && localsettings.opmode == 4 && localsettings.generate_images_mode!=0 && localsettings.img_autogen_type!=2 && !newgen.includes("\n"))
1678216802
{
1678316803
let newgenlc = newgen.toLowerCase().trim();
1678416804
if (newgenlc.startsWith("draw ") ||
@@ -17746,6 +17766,7 @@ Current version indicated by LITEVER below.
1774617766
poll_ticks_passed = 0;
1774717767
poll_in_progress = false;
1774817768
synchro_polled_response = null;
17769+
synchro_polled_respimg = null;
1774917770
last_stop_reason = "";
1775017771
synchro_pending_stream = "";
1775117772
streaming_was_thinking = false;
@@ -18290,7 +18311,7 @@ Current version indicated by LITEVER below.
1829018311
payload["tools"] = [{"google_search": {}}];
1829118312
}
1829218313

18293-
if(mdlname.includes("gemini-2.5"))
18314+
if(mdlname.includes("gemini-2.5") && !mdlname.includes("image"))
1829418315
{
1829518316
if(!document.getElementById("usegeminithink").checked)
1829618317
{
@@ -20708,6 +20729,15 @@ Current version indicated by LITEVER below.
2070820729
render_gametext();
2070920730
sync_multiplayer(false);
2071020731
}
20732+
if(synchro_polled_respimg)
20733+
{
20734+
if(!synchro_polled_respimg.startsWith("data:image"))
20735+
{
20736+
synchro_polled_respimg = "data:image/png;base64," + synchro_polled_respimg;
20737+
}
20738+
self_upload_img(synchro_polled_respimg,"output_image");
20739+
synchro_polled_respimg = null;
20740+
}
2071120741
}
2071220742
else {
2071320743
//horde api needs to constantly poll to see if response is done

koboldcpp.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
maxhordelen = 1024
9292
modelbusy = threading.Lock()
9393
requestsinqueue = 0
94+
ratelimitlookup = {}
9495
defaultport = 5001
9596
showsamplerwarning = True
9697
showmaxctxwarning = True
@@ -4904,6 +4905,21 @@ def do_POST(self):
49044905
return
49054906

49064907
reqblocking = False
4908+
#handle rate limiting
4909+
ratelimiter = int(args.ratelimit)
4910+
if ratelimiter > 0:
4911+
client_ip = self.client_address[0]
4912+
lastdone = ratelimitlookup.get(client_ip, datetime.min)
4913+
diff = (datetime.now() - lastdone).total_seconds()
4914+
if diff < ratelimiter:
4915+
self.send_response(503)
4916+
self.end_headers(content_type='application/json')
4917+
self.wfile.write(json.dumps({"detail": {
4918+
"msg": f"You are sending requests too quickly. Please try again in {int(ratelimiter-diff)} seconds.",
4919+
"type": "service_unavailable",
4920+
}}).encode())
4921+
return
4922+
ratelimitlookup[client_ip] = datetime.now()
49074923
muint = int(args.multiuser)
49084924
if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="") or (args.embeddingsmodel and args.embeddingsmodel!="")):
49094925
muint = 2 # this prevents errors when using voice/img together with text
@@ -5667,6 +5683,7 @@ def hide_tooltip(event):
56675683
ssl_key_var = ctk.StringVar()
56685684
password_var = ctk.StringVar()
56695685
maxrequestsize_var = ctk.StringVar(value=str(32))
5686+
ratelimit_var = ctk.StringVar(value=str(0))
56705687

56715688
sd_model_var = ctk.StringVar()
56725689
sd_lora_var = ctk.StringVar()
@@ -6395,6 +6412,7 @@ def pickpremadetemplate():
63956412
makelabelentry(network_tab, "Password: ", password_var, 10, 200,tooltip="Enter a password required to use this instance.\nThis key will be required for all text endpoints.\nImage endpoints are not secured.")
63966413

63976414
makelabelentry(network_tab, "Max Req. Size (MB):", maxrequestsize_var, row=20, width=50, tooltip="Specify a max request payload size. Any requests to the server larger than this size will be dropped. Do not change if unsure.")
6415+
makelabelentry(network_tab, "IP Rate Limiter (s):", ratelimit_var, row=22, width=50, tooltip="Rate limits each IP to allow a new request once per X seconds. Do not change if unsure.")
63986416

63996417

64006418
# Horde Tab
@@ -6686,6 +6704,7 @@ def export_vars():
66866704
args.multiplayer = (multiplayer_var.get()==1)
66876705
args.websearch = (websearch_var.get()==1)
66886706
args.maxrequestsize = int(maxrequestsize_var.get()) if maxrequestsize_var.get()!="" else 32
6707+
args.ratelimit = int(ratelimit_var.get()) if ratelimit_var.get()!="" else 0
66896708

66906709
if usehorde_var.get() != 0:
66916710
args.hordemodelname = horde_name_var.get()
@@ -6933,6 +6952,8 @@ def import_vars(dict):
69336952
usehorde_var.set(1 if ("hordekey" in dict and dict["hordekey"]) else 0)
69346953
if "maxrequestsize" in dict and dict["maxrequestsize"]:
69356954
maxrequestsize_var.set(dict["maxrequestsize"])
6955+
if "ratelimit" in dict and dict["ratelimit"]:
6956+
ratelimit_var.set(dict["ratelimit"])
69366957

69376958
sd_model_var.set(dict["sdmodel"] if ("sdmodel" in dict and dict["sdmodel"]) else "")
69386959
sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0)
@@ -8744,6 +8765,7 @@ def range_checker(arg: str):
87448765
advparser.add_argument("--draftgpulayers","--gpu-layers-draft","--n-gpu-layers-draft","-ngld", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
87458766
advparser.add_argument("--draftgpusplit", help="GPU layer distribution ratio for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model and tensor_split is set!", metavar=('[Ratios]'), type=float, nargs='+')
87468767
advparser.add_argument("--password", metavar=('[API key]'), help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
8768+
advparser.add_argument("--ratelimit", metavar=('[seconds]'), help="If enabled, rate limit generative request by IP address. Each IP can only send a new request once per X seconds.", type=int, default=0)
87478769
advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
87488770
advparser.add_argument("--chatcompletionsadapter", metavar=('[filename]'), help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="AutoGuess")
87498771
advparser.add_argument("--flashattention","--flash-attn","-fa", help="Enables flash attention.", action='store_true')

0 commit comments

Comments
 (0)