
Commit 6a03fe5

Merge pull request #21 from esolithe/concedo_experimental
Concedo experimental
2 parents da1b983 + ce7aa0d commit 6a03fe5

File tree

14 files changed: +715 additions, -1115 deletions

Makefile

Lines changed: 3 additions & 0 deletions
@@ -79,6 +79,9 @@ SIMPLERCFLAGS =
 FULLCFLAGS =
 NONECFLAGS =
 
+# prefer bundled glslc
+LLAMA_USE_BUNDLED_GLSLC := 1
+
 CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
 VULKAN_FLAGS = -DGGML_USE_VULKAN -DSD_USE_VULKAN

convert_hf_to_gguf.py

Lines changed: 8 additions & 1 deletion
@@ -1082,7 +1082,14 @@ def _set_vocab_rwkv_world(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.chat_template = "rwkv-world"
+        if special_vocab.chat_template is None:
+            template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
+            if template_path.is_file():
+                with open(template_path, "r", encoding="utf-8") as f:
+                    template = f.read()
+            else:
+                template = "rwkv-world"
+            special_vocab.chat_template = template
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
         # hack: Override these as they have already been set (incorrectly)
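The new logic keeps any chat template the model already ships, prefers the bundled llama-cpp-rwkv-world.jinja file when none is set, and only falls back to the bare "rwkv-world" identifier if that file is missing. A standalone sketch of the same resolution order (pick_rwkv_chat_template is an illustrative helper, not part of the converter):

from pathlib import Path

def pick_rwkv_chat_template(existing: str | None, repo_root: Path) -> str:
    # A template already present in the model metadata wins.
    if existing is not None:
        return existing
    # Otherwise embed the full bundled Jinja template.
    template_path = repo_root / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
    if template_path.is_file():
        return template_path.read_text(encoding="utf-8")
    # Last resort: the short identifier previously hardcoded here.
    return "rwkv-world"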

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 343 additions & 1094 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cuda/set-rows.cu

Lines changed: 7 additions & 1 deletion
@@ -3,7 +3,10 @@
 typedef void (*set_rows_kernel_t)(const char * src, char * dst);
 
 template<typename src_t, typename dst_t>
-__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
+__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
+    GGML_UNUSED(src_f);
+    GGML_UNUSED(dst_f);
+}
 
 template<>
 __device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
@@ -53,6 +56,9 @@ static __global__ void k_set_rows(
         const src_t* src_elem = src0_row + i00;
         dst_t* dst_elem = dst_row_ptr + i00;
         set_rows_1(src_elem, dst_elem);
+
+        GGML_UNUSED(ne10);
+        GGML_UNUSED(ne13);
 }
 
 template<typename src_t, typename dst_t>

kcpp_adapters/AutoGuess.json

Lines changed: 33 additions & 0 deletions
@@ -177,6 +177,39 @@
         "assistant_start": "<|Assistant|>",
         "assistant_end": "<|end▁of▁sentence|>"
     }
+}, {
+    "search": ["<|bom|>","is_last_checked_defined"],
+    "name": "Jamba",
+    "adapter": {
+        "system_start": "<|bom|><|system|>",
+        "system_end": "<|eom|>",
+        "user_start": "<|bom|><|user|>",
+        "user_end": "<|eom|>",
+        "assistant_start": "<|bom|><|assistant|>",
+        "assistant_end": "<|eom|>"
+    }
+}, {
+    "search": ["<|im_start|>assistant<|im_middle|>", "<|im_assistant|>assistant<|im_middle|>", "<|im_end|>"],
+    "name": "ChatML (Kimi).",
+    "adapter": {
+        "system_start": "<|im_start|>system<|im_middle|>",
+        "system_end": "<|im_end|>",
+        "user_start": "<|im_start|>user<|im_middle|>",
+        "user_end": "<|im_end|>",
+        "assistant_start": "<|im_start|>assistant<|im_middle|>",
+        "assistant_end": "<|im_end|>"
+    }
+}, {
+    "search": ["<|userprompt|>", "<|endofuserprompt|>", "<|response|>", "<|endofresponse|>"],
+    "name": "Dots",
+    "adapter": {
+        "system_start": "<|system|>\n",
+        "system_end": "<|endofsystem|>\n",
+        "user_start": "<|userprompt|>\n",
+        "user_end": "<|endofuserprompt|>\n",
+        "assistant_start": "<|response|>\n",
+        "assistant_end": "<|endofresponse|>\n"
+    }
 }, {
     "search": ["rwkv-world"],
     "name": "RWKV World",

klite.embd

Lines changed: 29 additions & 11 deletions
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->
 
 <script id="init-config">
-const LITEVER = 261;
+const LITEVER = 262;
 const urlParams = new URLSearchParams(window.location.search);
 var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -3605,10 +3605,12 @@ Current version indicated by LITEVER below.
 entersubmit: true, //enter sends the prompt
 darkmode: true,
 render_streaming_markdown: true,
-raw_instruct_tags: false, //experimental flag
+
+raw_instruct_tags: false, //experimental flags
 show_endpoint_selector: false,
 no_warn_unsaved: false,
 no_compress_audio: false,
+autoguess_third_party:false,
 
 //section migrated from story itself
 extrastopseq: "",
@@ -4622,7 +4624,10 @@ Current version indicated by LITEVER below.
 let instag = localsettings.instruct_starttag;
 if(instag=="{{[INPUT]}}" && !(custom_kobold_endpoint != "" && is_using_kcpp_with_autotags()))
 {
-    instag = "\n### Instruction:\n"; //backend not compatible with auto
+    if(!localsettings.autoguess_third_party)
+    {
+        instag = "\n### Instruction:\n"; //backend not compatible with auto
+    }
 }
 if(doTrim){
     return replaceAll(instag, "\\n", "\n").trim();
@@ -4635,7 +4640,10 @@ Current version indicated by LITEVER below.
 let instag = localsettings.instruct_endtag;
 if(instag=="{{[OUTPUT]}}" && !(custom_kobold_endpoint != "" && is_using_kcpp_with_autotags()))
 {
-    instag = "\n### Response:\n"; //backend not compatible with auto
+    if(!localsettings.autoguess_third_party)
+    {
+        instag = "\n### Response:\n"; //backend not compatible with auto
+    }
 }
 if(doTrim){
     return replaceAll(instag, "\\n", "\n").trim();
@@ -13009,6 +13017,7 @@ Current version indicated by LITEVER below.
 document.getElementById("show_endpoint_selector").checked = localsettings.show_endpoint_selector;
 document.getElementById("no_warn_unsaved").checked = localsettings.no_warn_unsaved;
 document.getElementById("no_compress_audio").checked = localsettings.no_compress_audio;
+document.getElementById("autoguess_third_party").checked = localsettings.autoguess_third_party;
 document.getElementById("render_streaming_markdown").checked = localsettings.render_streaming_markdown;
 document.getElementById("min_p").value = localsettings.min_p;
 document.getElementById("dynatemp_range").value = localsettings.dynatemp_range;
@@ -13559,6 +13568,7 @@ Current version indicated by LITEVER below.
 localsettings.show_endpoint_selector = (document.getElementById("show_endpoint_selector").checked ? true : false);
 localsettings.no_warn_unsaved = (document.getElementById("no_warn_unsaved").checked ? true : false);
 localsettings.no_compress_audio = (document.getElementById("no_compress_audio").checked ? true : false);
+localsettings.autoguess_third_party = (document.getElementById("autoguess_third_party").checked ? true : false);
 localsettings.render_streaming_markdown = (document.getElementById("render_streaming_markdown").checked ? true : false);
 if(document.getElementById("opmode").value==1)
 {
@@ -14290,9 +14300,9 @@ Current version indicated by LITEVER below.
 let userinput = getInputBoxValue();
 if(userinput.trim()!="")
 {
-    let str = get_instructendplaceholder() + userinput.trim();
+    let str = get_instructstartplaceholder() + userinput.trim();
     if (localsettings.separate_end_tags) {
-        str += get_instructendplaceholder_end();
+        str += get_instructstartplaceholder_end();
     }
     document.getElementById("memorytext").value += str;
 }
@@ -18938,12 +18948,15 @@ Current version indicated by LITEVER below.
     gentxt = gentxt.substring(curtag.length);
 }
 
-let found = gentxt.indexOf(curtag);
-let splitresponse = [];
-if (found != -1) //if found, truncate to it
+if(localsettings.includedefaultstops)
 {
-    splitresponse = gentxt.split(curtag);
-    gentxt = splitresponse[0];
+    let found = gentxt.indexOf(curtag);
+    let splitresponse = [];
+    if (found != -1) //if found, truncate to it
+    {
+        splitresponse = gentxt.split(curtag);
+        gentxt = splitresponse[0];
+    }
 }
 }
 
@@ -26043,6 +26056,11 @@ Current version indicated by LITEVER below.
     class="helptext">Do not compress embedded audio files. Might crash on big files! (caution!)</span></span></div>
 <input title="Do Not Recompress Audio" type="checkbox" id="no_compress_audio" style="margin:0px 0px 0px 0px;">
 </div>
+<div class="settinglabel">
+    <div class="justifyleft settingsmall">AutoguessTagsForThirdParty <span class="helpicon">?<span
+    class="helptext">Sends raw KoboldCppAutomatic AutoGuess tags to third party APIs. (e.g. Horde) Be warned, you better hope they handle them well internally...</span></span></div>
+    <input title="Send Autoguess Tags For Third Party APIs" type="checkbox" id="autoguess_third_party" style="margin:0px 0px 0px 0px;">
+</div>
 </div>
 
 <div class="settingitem wide">

koboldcpp.py

Lines changed: 26 additions & 3 deletions
@@ -1127,7 +1127,7 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
     if fsize > (10*1024*1024): #dont bother with models < 10mb
         cs = ctxsize
         mem = gpumem
-        if "-00001-of-0000" in fname:
+        if "-00001-of-00" in fname:
             match = re.search(r'-(\d{5})-of-(\d{5})\.', fname)
             if match:
                 total_parts = int(match.group(2))
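
The shortened substring matters: the old check "-00001-of-0000" only matched files split into at most nine parts (totals 00001 through 00009), while "-00001-of-00" accepts any five-digit total, which the follow-up regex then parses. A quick standalone demonstration:

import re

for fname in ("model-00001-of-00009.gguf", "model-00001-of-00024.gguf"):
    print("-00001-of-0000" in fname)  # old check: True, then False
    print("-00001-of-00" in fname)    # new check: True for both
    match = re.search(r'-(\d{5})-of-(\d{5})\.', fname)
    if match:
        print(int(match.group(2)))    # total parts: 9, then 24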
@@ -2916,6 +2916,15 @@ def transform_genparams(genparams, api_format):
             # In case of any issues, just do normal gen
             print("Structured Output not valid - discarded")
             pass
+    elif 'json_schema' in genparams:
+        try:
+            schema = genparams.get('json_schema')
+            decoded = convert_json_to_gbnf(schema)
+            if decoded:
+                genparams["grammar"] = decoded
+        except Exception:
+            print("Structured Output (old format) not valid - discarded")
+            pass
 
     message_index = 0
     for message in messages_array:
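
This branch restores the older request shape where the schema sits directly under a top-level json_schema key, converting it to a GBNF grammar the same way the structured-output path above does. A hedged example of such a request payload (field values illustrative):

genparams = {
    "prompt": "Name one fruit as JSON.",
    "max_length": 64,
    # old-style structured output: schema at the top level
    "json_schema": {
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    },
}
# After transform_genparams runs, genparams["grammar"] holds the
# GBNF produced by convert_json_to_gbnf and constrains sampling.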
@@ -3639,6 +3648,18 @@ async def handle_sse_stream(self, genparams, api_format):
                     tokenStr = tokenStr[:sindex]
 
                 if tokenStr!="" or streamDone:
+                    need_split_final_msg = True if (currfinishreason is not None and streamDone and tokenStr!="") else False
+                    if need_split_final_msg: #we need to send one message without the finish reason, then send a finish reason with no msg to follow standards
+                        if api_format == 4: # if oai chat, set format to expected openai streaming response
+                            event_str = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':tokenStr}}]})
+                            await self.send_oai_sse_event(event_str)
+                        elif api_format == 3: # non chat completions
+                            event_str = json.dumps({"id":"koboldcpp","object":"text_completion","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"text":tokenStr}]})
+                            await self.send_oai_sse_event(event_str)
+                        else:
+                            event_str = json.dumps({"token": tokenStr, "finish_reason":None})
+                            await self.send_kai_sse_event(event_str)
+                        tokenStr = "" # now the final finish reason can be sent alone
                     if api_format == 4: # if oai chat, set format to expected openai streaming response
                         event_str = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":currfinishreason,"delta":{'role':'assistant','content':tokenStr}}]})
                         await self.send_oai_sse_event(event_str)
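
The effect: a stream whose last chunk carries both text and a finish reason is now split into two events, first the remaining text with finish_reason null, then an empty delta carrying only the finish reason, matching how OpenAI-compatible clients expect streams to end. For api_format == 4 the tail of the stream looks roughly like this (abridged wire output; "stop" is one possible finish reason):

data: {"object":"chat.completion.chunk","choices":[{"index":0,"finish_reason":null,"delta":{"role":"assistant","content":"world!"}}]}
data: {"object":"chat.completion.chunk","choices":[{"index":0,"finish_reason":"stop","delta":{"role":"assistant","content":""}}]}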
@@ -6766,7 +6787,7 @@ def save_config_gui():
 def load_config_gui(): #this is used to populate the GUI with a config file, whereas load_config_cli simply overwrites cli args
     file_type = [("KoboldCpp Settings", "*.kcpps *.kcppt")]
     global runmode_untouched, zenity_permitted
-    filename = zentk_askopenfilename(filetypes=file_type, defaultextension=".kcppt", initialdir=None)
+    filename = zentk_askopenfilename(filetypes=file_type, defaultextension=".kcppt", initialdir=None, title="Select kcpps or kcppt settings config file")
     if not filename or filename=="":
         return
     if not os.path.exists(filename) or os.path.getsize(filename)<4 or os.path.getsize(filename)>50000000: #for sanity, check invaid kcpps
@@ -7177,6 +7198,7 @@ def tunnel_reader():
 def reload_from_new_args(newargs):
     try:
         args.istemplate = False
+        newargs = convert_invalid_args(newargs)
         for key, value in newargs.items(): #do not overwrite certain values
             if key not in ["remotetunnel","showgui","port","host","port_param","admin","adminpassword","admindir","admintextmodelsdir","admindatadir","adminallowhf","ssl","nocertify","benchmark","prompt","config"]:
                 setattr(args, key, value)
@@ -7202,6 +7224,7 @@ def load_config_cli(filename):
     print("Loading .kcpps configuration file...")
     with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
         config = json.load(f)
+    config = convert_invalid_args(config)
     if "onready" in config:
         config["onready"] = "" #do not allow onready commands from config
     args.istemplate = False
@@ -7358,7 +7381,7 @@ def download_model_from_url(url, permitted_types=[".gguf",".safetensors", ".ggml
             break
     if ((url.startswith("http://") or url.startswith("https://")) and end_ext_ok):
         dlfile = downloader_internal(url, "auto", False, min_file_size)
-        if handle_multipart and "-00001-of-0000" in url: #handle multipart files up to 9 parts
+        if handle_multipart and "-00001-of-00" in url: #handle multipart files up to 9 parts
            match = re.search(r'-(\d{5})-of-(\d{5})\.', url)
            if match:
                total_parts = int(match.group(2))

models/templates/llama-cpp-rwkv-world.jinja

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+{%- if not add_generation_prompt is defined -%}
+{%- set add_generation_prompt = true -%}
+{%- endif -%}
+{%- set ns = namespace(system_prompt='') -%}
+{%- for message in messages -%}
+{%- if message['role'] == 'system' -%}
+{%- set ns.system_prompt = message['content'] -%}
+{%- endif -%}
+{%- endfor -%}
+{{bos_token}}
+{%- if ns.system_prompt != '' -%}
+{{- 'System: ' + ns.system_prompt + '\n\n' -}}
+{%- endif -%}
+{%- for message in messages -%}
+{%- if message['role'] == 'user' -%}
+{{- 'User: ' + message['content']|trim + '\n\n' -}}
+{%- endif -%}
+{%- if message['role'] == 'assistant' and message['content'] is not none -%}
+{%- set content = message['content'] -%}
+{%- if '</think>' in content -%}
+{%- set content = content.split('</think>')[-1] -%}
+{%- endif -%}
+{{- 'Assistant: ' + content|trim + '\n\n' -}}
+{%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+{{- 'Assistant:' -}}
+{%- if enable_thinking is defined and enable_thinking is false %}
+{{- ' <think>\n</think>' }}
+{%- endif %}
+{%- if enable_thinking is defined and enable_thinking is true %}
+{{- ' <think>' }}
+{%- endif %}
+{%- endif -%}
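
This template flattens a conversation into plain System:/User:/Assistant: turns, drops any </think> reasoning from earlier assistant replies, and appends "Assistant:" when a generation prompt is requested. A quick render sketch using the jinja2 package (empty bos_token chosen for illustration):

from jinja2 import Template

with open("llama-cpp-rwkv-world.jinja", encoding="utf-8") as f:
    src = f.read()
print(Template(src).render(
    bos_token="",
    messages=[
        {"role": "system", "content": "Be concise."},
        {"role": "user", "content": "Hello!"},
    ],
))
# System: Be concise.
#
# User: Hello!
#
# Assistant: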

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+datasets~=3.2.0
+matplotlib~=3.10.0
+numpy~=1.26.4
+requests~=2.32.3
+tqdm~=4.67.1
