Skip to content

Commit abf527a

Browse files
committed
clearer multimodal capability display
1 parent 12a6088 commit abf527a

File tree

4 files changed

+28
-13
lines changed

4 files changed

+28
-13
lines changed

expose.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,10 @@ extern "C"
272272
{
273273
return audio_multimodal_supported;
274274
}
275+
bool has_vision_support()
276+
{
277+
return vision_multimodal_supported;
278+
}
275279
float get_last_eval_time() {
276280
return last_eval_time;
277281
}

expose.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,7 @@ extern std::string draftmodel_filename;
288288
extern std::vector<std::string> generated_tokens;
289289
extern bool generation_finished;
290290
extern bool audio_multimodal_supported;
291+
extern bool vision_multimodal_supported;
291292
extern float last_eval_time;
292293
extern float last_process_time;
293294
extern int last_token_count;

gpttype_adapter.cpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ std::string draftmodel_filename = "";
5858
int speculative_chunk_amt = 8; //do it in chunks of this many tokens
5959
bool generation_finished;
6060
bool audio_multimodal_supported = false;
61+
bool vision_multimodal_supported = false;
6162
float last_process_time = 0;
6263
float last_eval_time = 0;
6364
int last_token_count = 0;
@@ -1985,6 +1986,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
19851986
draft_ctx = nullptr;
19861987
guidance_ctx = nullptr;
19871988
audio_multimodal_supported = false;
1989+
vision_multimodal_supported = false;
19881990

19891991
auto clamped_max_context_length = inputs.max_context_length;
19901992

@@ -2464,6 +2466,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24642466
return ModelLoadResult::FAIL;
24652467
}
24662468

2469+
if(clp_ctx_v)
2470+
{
2471+
vision_multimodal_supported = true;
2472+
}
2473+
clp_img_data = clip_image_u8_init();
24672474
if(clp_ctx_a) //init audio
24682475
{
24692476
if (clip_has_whisper_encoder(clp_ctx_a)) {
@@ -2472,7 +2479,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24722479
}
24732480
audio_multimodal_supported = true;
24742481
}
2475-
clp_img_data = clip_image_u8_init();
24762482
}
24772483

24782484
const llama_vocab * tmpvocab = llama_model_get_vocab(llamamodel);

koboldcpp.py

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,6 @@
7676
lastgeneratedcomfyimg = b''
7777
lastuploadedcomfyimg = b''
7878
fullsdmodelpath = "" #if empty, it's not initialized
79-
mmprojpath = "" #if empty, it's not initialized
8079
password = "" #if empty, no auth key required
8180
fullwhispermodelpath = "" #if empty, it's not initialized
8281
ttsmodelpath = "" #if empty, not initialized
@@ -106,6 +105,7 @@
106105
importvars_in_progress = False
107106
has_multiplayer = False
108107
has_audio_support = False
108+
has_vision_support = False
109109
savedata_obj = None
110110
multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
111111
multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@@ -539,6 +539,7 @@ def init_library():
539539
handle.get_stream_count.restype = ctypes.c_int
540540
handle.has_finished.restype = ctypes.c_bool
541541
handle.has_audio_support.restype = ctypes.c_bool
542+
handle.has_vision_support.restype = ctypes.c_bool
542543
handle.get_last_eval_time.restype = ctypes.c_float
543544
handle.get_last_process_time.restype = ctypes.c_float
544545
handle.get_last_token_count.restype = ctypes.c_int
@@ -891,18 +892,17 @@ def convert_json_to_gbnf(json_obj):
891892
return ""
892893

893894
def get_capabilities():
894-
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
895+
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support, has_vision_support
895896
has_llm = not (friendlymodelname=="inactive")
896897
has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
897-
has_vision = (mmprojpath!="")
898898
has_password = (password!="")
899899
has_whisper = (fullwhispermodelpath!="")
900900
has_search = True if args.websearch else False
901901
has_tts = (ttsmodelpath!="")
902902
has_embeddings = (embeddingsmodelpath!="")
903903
has_guidance = True if args.enableguidance else False
904904
admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
905-
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
905+
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision_support,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
906906

907907
def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
908908
chunk_size = 1024*1024*12 # read first 12mb of file
@@ -3120,7 +3120,7 @@ def noscript_webui(self):
31203120
def do_GET(self):
31213121
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
31223122
global last_req_time, start_time
3123-
global savedata_obj, has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastuploadedcomfyimg, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, friendlyembeddingsmodelname
3123+
global savedata_obj, has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastuploadedcomfyimg, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, password, friendlyembeddingsmodelname
31243124
self.path = self.path.rstrip('/')
31253125
response_body = None
31263126
content_type = 'application/json'
@@ -3370,7 +3370,7 @@ def do_GET(self):
33703370
return
33713371

33723372
def do_POST(self):
3373-
global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey, lastuploadedcomfyimg, lastgeneratedcomfyimg, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, net_save_slots
3373+
global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey, lastuploadedcomfyimg, lastgeneratedcomfyimg, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, net_save_slots, has_vision_support
33743374
contlenstr = self.headers['content-length']
33753375
content_length = 0
33763376
body = None
@@ -3846,8 +3846,7 @@ def do_POST(self):
38463846
elif self.path.endswith('/v1/chat/completions'):
38473847
api_format = 4
38483848
elif self.path.endswith('/sdapi/v1/interrogate'):
3849-
has_vision = (mmprojpath!="")
3850-
if not has_vision:
3849+
if not has_vision_support:
38513850
self.send_response(503)
38523851
self.end_headers(content_type='application/json')
38533852
self.wfile.write(json.dumps({"detail": {
@@ -6641,7 +6640,7 @@ def main(launch_args, default_args):
66416640

66426641
def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
66436642
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
6644-
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support
6643+
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support
66456644

66466645
start_server = True
66476646

@@ -6982,9 +6981,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
69826981
exitcounter = 999
69836982
exit_with_error(2,f"Cannot find mmproj file: {args.mmproj}")
69846983
else:
6985-
global mmprojpath
69866984
args.mmproj = os.path.abspath(args.mmproj)
6987-
mmprojpath = args.mmproj
69886985

69896986
if not args.blasthreads or args.blasthreads <= 0:
69906987
args.blasthreads = args.threads
@@ -6998,7 +6995,13 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
69986995
print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
69996996
loadok = load_model(modelname)
70006997
print("Load Text Model OK: " + str(loadok))
7001-
has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
6998+
if args.mmproj and args.mmproj!="": # multimodal vision and audio support is only known at runtime
6999+
has_audio_support = handle.has_audio_support()
7000+
has_vision_support = handle.has_vision_support()
7001+
else:
7002+
has_audio_support = False
7003+
has_vision_support = False
7004+
70027005
if not loadok:
70037006
exitcounter = 999
70047007
exit_with_error(3,"Could not load text model: " + modelname)
@@ -7193,6 +7196,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
71937196
enabledmlist.append("ImageGeneration") if "txt2img" in caps and caps["txt2img"] else disabledmlist.append("ImageGeneration")
71947197
enabledmlist.append("VoiceRecognition") if "transcribe" in caps and caps["transcribe"] else disabledmlist.append("VoiceRecognition")
71957198
enabledmlist.append("MultimodalVision") if "vision" in caps and caps["vision"] else disabledmlist.append("MultimodalVision")
7199+
enabledmlist.append("MultimodalAudio") if "audio" in caps and caps["audio"] else disabledmlist.append("MultimodalAudio")
71967200
enabledmlist.append("NetworkMultiplayer") if "multiplayer" in caps and caps["multiplayer"] else disabledmlist.append("NetworkMultiplayer")
71977201
enabledmlist.append("ApiKeyPassword") if "protected" in caps and caps["protected"] else disabledmlist.append("ApiKeyPassword")
71987202
enabledmlist.append("WebSearchProxy") if "websearch" in caps and caps["websearch"] else disabledmlist.append("WebSearchProxy")

0 commit comments

Comments (0)