Skip to content

Commit abf527a

Browse files
committed
clearer multimodal capability display
1 parent 12a6088 commit abf527a

File tree

4 files changed

+28
-13
lines changed

4 files changed

+28
-13
lines changed

expose.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,10 @@ extern "C"
272272
{
273273
return audio_multimodal_supported;
274274
}
275+
bool has_vision_support()
276+
{
277+
return vision_multimodal_supported;
278+
}
275279
float get_last_eval_time() {
276280
return last_eval_time;
277281
}

expose.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,7 @@ extern std::string draftmodel_filename;
288288
extern std::vector<std::string> generated_tokens;
289289
extern bool generation_finished;
290290
extern bool audio_multimodal_supported;
291+
extern bool vision_multimodal_supported;
291292
extern float last_eval_time;
292293
extern float last_process_time;
293294
extern int last_token_count;

gpttype_adapter.cpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ std::string draftmodel_filename = "";
5858
int speculative_chunk_amt = 8; //do it in chunks of this many tokens
5959
bool generation_finished;
6060
bool audio_multimodal_supported = false;
61+
bool vision_multimodal_supported = false;
6162
float last_process_time = 0;
6263
float last_eval_time = 0;
6364
int last_token_count = 0;
@@ -1985,6 +1986,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
19851986
draft_ctx = nullptr;
19861987
guidance_ctx = nullptr;
19871988
audio_multimodal_supported = false;
1989+
vision_multimodal_supported = false;
19881990

19891991
auto clamped_max_context_length = inputs.max_context_length;
19901992

@@ -2464,6 +2466,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24642466
return ModelLoadResult::FAIL;
24652467
}
24662468

2469+
if(clp_ctx_v)
2470+
{
2471+
vision_multimodal_supported = true;
2472+
}
2473+
clp_img_data = clip_image_u8_init();
24672474
if(clp_ctx_a) //init audio
24682475
{
24692476
if (clip_has_whisper_encoder(clp_ctx_a)) {
@@ -2472,7 +2479,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24722479
}
24732480
audio_multimodal_supported = true;
24742481
}
2475-
clp_img_data = clip_image_u8_init();
24762482
}
24772483

24782484
const llama_vocab * tmpvocab = llama_model_get_vocab(llamamodel);

koboldcpp.py

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,6 @@
7676
lastgeneratedcomfyimg = b''
7777
lastuploadedcomfyimg = b''
7878
fullsdmodelpath = "" #if empty, it's not initialized
79-
mmprojpath = "" #if empty, it's not initialized
8079
password = "" #if empty, no auth key required
8180
fullwhispermodelpath = "" #if empty, it's not initialized
8281
ttsmodelpath = "" #if empty, not initialized
@@ -106,6 +105,7 @@
106105
importvars_in_progress = False
107106
has_multiplayer = False
108107
has_audio_support = False
108+
has_vision_support = False
109109
savedata_obj = None
110110
multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
111111
multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
@@ -539,6 +539,7 @@ def init_library():
539539
handle.get_stream_count.restype = ctypes.c_int
540540
handle.has_finished.restype = ctypes.c_bool
541541
handle.has_audio_support.restype = ctypes.c_bool
542+
handle.has_vision_support.restype = ctypes.c_bool
542543
handle.get_last_eval_time.restype = ctypes.c_float
543544
handle.get_last_process_time.restype = ctypes.c_float
544545
handle.get_last_token_count.restype = ctypes.c_int
@@ -891,18 +892,17 @@ def convert_json_to_gbnf(json_obj):
891892
return ""
892893

893894
def get_capabilities():
894-
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support
895+
global savedata_obj, has_multiplayer, KcppVersion, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, has_audio_support, has_vision_support
895896
has_llm = not (friendlymodelname=="inactive")
896897
has_txt2img = not (friendlysdmodelname=="inactive" or fullsdmodelpath=="")
897-
has_vision = (mmprojpath!="")
898898
has_password = (password!="")
899899
has_whisper = (fullwhispermodelpath!="")
900900
has_search = True if args.websearch else False
901901
has_tts = (ttsmodelpath!="")
902902
has_embeddings = (embeddingsmodelpath!="")
903903
has_guidance = True if args.enableguidance else False
904904
admin_type = (2 if args.admin and args.admindir and args.adminpassword else (1 if args.admin and args.admindir else 0))
905-
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
905+
return {"result":"KoboldCpp", "version":KcppVersion, "protected":has_password, "llm":has_llm, "txt2img":has_txt2img,"vision":has_vision_support,"audio":has_audio_support,"transcribe":has_whisper,"multiplayer":has_multiplayer,"websearch":has_search,"tts":has_tts, "embeddings":has_embeddings, "savedata":(savedata_obj is not None), "admin": admin_type, "guidance": has_guidance}
906906

907907
def dump_gguf_metadata(file_path): #if you're gonna copy this into your own project at least credit concedo
908908
chunk_size = 1024*1024*12 # read first 12mb of file
@@ -3120,7 +3120,7 @@ def noscript_webui(self):
31203120
def do_GET(self):
31213121
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
31223122
global last_req_time, start_time
3123-
global savedata_obj, has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastuploadedcomfyimg, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, friendlyembeddingsmodelname
3123+
global savedata_obj, has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastuploadedcomfyimg, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, password, friendlyembeddingsmodelname
31243124
self.path = self.path.rstrip('/')
31253125
response_body = None
31263126
content_type = 'application/json'
@@ -3370,7 +3370,7 @@ def do_GET(self):
33703370
return
33713371

33723372
def do_POST(self):
3373-
global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey, lastuploadedcomfyimg, lastgeneratedcomfyimg, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, net_save_slots
3373+
global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey, lastuploadedcomfyimg, lastgeneratedcomfyimg, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, net_save_slots, has_vision_support
33743374
contlenstr = self.headers['content-length']
33753375
content_length = 0
33763376
body = None
@@ -3846,8 +3846,7 @@ def do_POST(self):
38463846
elif self.path.endswith('/v1/chat/completions'):
38473847
api_format = 4
38483848
elif self.path.endswith('/sdapi/v1/interrogate'):
3849-
has_vision = (mmprojpath!="")
3850-
if not has_vision:
3849+
if not has_vision_support:
38513850
self.send_response(503)
38523851
self.end_headers(content_type='application/json')
38533852
self.wfile.write(json.dumps({"detail": {
@@ -6641,7 +6640,7 @@ def main(launch_args, default_args):
66416640

66426641
def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
66436642
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time, exitcounter, global_memory, using_gui_launcher
6644-
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support
6643+
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support
66456644

66466645
start_server = True
66476646

@@ -6982,9 +6981,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
69826981
exitcounter = 999
69836982
exit_with_error(2,f"Cannot find mmproj file: {args.mmproj}")
69846983
else:
6985-
global mmprojpath
69866984
args.mmproj = os.path.abspath(args.mmproj)
6987-
mmprojpath = args.mmproj
69886985

69896986
if not args.blasthreads or args.blasthreads <= 0:
69906987
args.blasthreads = args.threads
@@ -6998,7 +6995,13 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
69986995
print("WARNING: Selected Text Model does not seem to be a GGUF file! Are you sure you picked the right file?")
69996996
loadok = load_model(modelname)
70006997
print("Load Text Model OK: " + str(loadok))
7001-
has_audio_support = handle.has_audio_support() # multimodal audio support is only known at runtime
6998+
if args.mmproj and args.mmproj!="": # multimodal vision and audio support is only known at runtime
6999+
has_audio_support = handle.has_audio_support()
7000+
has_vision_support = handle.has_vision_support()
7001+
else:
7002+
has_audio_support = False
7003+
has_vision_support = False
7004+
70027005
if not loadok:
70037006
exitcounter = 999
70047007
exit_with_error(3,"Could not load text model: " + modelname)
@@ -7193,6 +7196,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
71937196
enabledmlist.append("ImageGeneration") if "txt2img" in caps and caps["txt2img"] else disabledmlist.append("ImageGeneration")
71947197
enabledmlist.append("VoiceRecognition") if "transcribe" in caps and caps["transcribe"] else disabledmlist.append("VoiceRecognition")
71957198
enabledmlist.append("MultimodalVision") if "vision" in caps and caps["vision"] else disabledmlist.append("MultimodalVision")
7199+
enabledmlist.append("MultimodalAudio") if "audio" in caps and caps["audio"] else disabledmlist.append("MultimodalAudio")
71967200
enabledmlist.append("NetworkMultiplayer") if "multiplayer" in caps and caps["multiplayer"] else disabledmlist.append("NetworkMultiplayer")
71977201
enabledmlist.append("ApiKeyPassword") if "protected" in caps and caps["protected"] else disabledmlist.append("ApiKeyPassword")
71987202
enabledmlist.append("WebSearchProxy") if "websearch" in caps and caps["websearch"] else disabledmlist.append("WebSearchProxy")

0 commit comments

Comments (0)