Merge branch 'concedo_experimental' into crokeso

Nexesenex · Nexesenex · commit 6d6ec079ba29 · 2025-07-05T14:46:52.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -117,6 +117,10 @@ add_compile_definitions(GGML_USE_CPU)
 add_compile_definitions(GGML_USE_CPU_AARCH64)
 add_compile_definitions(NOMINMAX)
 
+if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12)
+    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
+endif()
+
 if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
diff --git a/Makefile b/Makefile
@@ -332,6 +332,11 @@ endif
 	HCC         := $(ROCM_PATH)/llvm/bin/clang
 	HCXX        := $(ROCM_PATH)/llvm/bin/clang++
 endif
+ifdef GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 # when set, define GGML_HIP_ROCWMMA_FATTN_GFX12 for HIP, C and C++ compiles (-D prefix is required)
+	HIPFLAGS   += -DGGML_HIP_ROCWMMA_FATTN_GFX12
+	CFLAGS     += -DGGML_HIP_ROCWMMA_FATTN_GFX12
+	CXXFLAGS   += -DGGML_HIP_ROCWMMA_FATTN_GFX12
+endif
 ifdef LLAMA_NO_WMMA
 	HIPFLAGS   += -DGGML_HIP_NO_ROCWMMA_FATTN
 else
diff --git a/expose.h b/expose.h
@@ -168,6 +168,7 @@ struct sd_load_model_inputs
     const char * vulkan_info = nullptr;
     const int threads = 0;
     const int quant = 0;
+    const bool flash_attention = false;
     const bool taesd = false;
     const int tiled_vae_threshold = 0;
     const char * t5xxl_filename = nullptr;
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
 }
 
 void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
+#pragma METAL fp math_mode(safe)
     float amax = 0.0f; // absolute max
     float max  = 0.0f;
 
@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
 }
 
 void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
+#pragma METAL fp math_mode(safe)
     float amax = 0.0f; // absolute max
     float max  = 0.0f;
 
@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re
 }
 
 void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
+#pragma METAL fp math_mode(safe)
     float amax = 0.0f; // absolute max
 
     for (int j = 0; j < QK8_0; j++) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -46,6 +46,9 @@
 #include <alloca.h>
 #endif
 
+#define GGML_VERSION "0.0.1"
+#define GGML_COMMIT "KCPP"
+
 #include <assert.h>
 #include <errno.h>
 #include <time.h>
diff --git a/klite.embd b/klite.embd
@@ -3401,6 +3401,7 @@ Current version indicated by LITEVER below.
 	var schedule_multiplayer_major_change = false;
 	var last_request_str = "No Requests Available"; //full context of last submitted request
 	var last_response_obj = null;
+	var last_response_streamlog = "";
 	var lastcheckgenkey = ""; //for checking polled-streaming unique id when generating in kcpp
 	var kai_poll_recoverykey = ""; //for recovering a lost polled streaming in case of disconnect.
 	var globalabortcontroller = null;
@@ -6376,6 +6377,10 @@ Current version indicated by LITEVER below.
 				},
 				transform(chunk, ctrl) {
 					ctrl.buf += chunk;
+					if(chunk)
+					{
+						last_response_streamlog += chunk;
+					}
 					let evs = [];
 					let m;
 					while ((m = /^data: ?(.*)(\r?\n){2}/m.exec(ctrl.buf)) !== null) {
@@ -10133,6 +10138,10 @@ Current version indicated by LITEVER below.
 		{
 			lr += "\n\nResponse:\n" + JSON.stringify(last_response_obj);
 		}
+		if(last_response_streamlog)
+		{
+			lr += "\n\nResponse:\n" + last_response_streamlog;
+		}
 		msgbox(lr,"Last Request Info",false);
 	}
 	function show_last_logprobs()
@@ -11191,7 +11200,7 @@ Current version indicated by LITEVER below.
 		desired_oai_ep = transform_oai_ep(desired_oai_ep);
 
 		let oaiheaders = {};
-		if(desired_oai_key!=""){
+		if(desired_oai_key!="" && !desired_oai_ep.toLowerCase().includes("pollinations.ai")){
 			oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
 		};
 		if (desired_oai_ep.toLowerCase().includes("api.mistral.ai")) {
@@ -14890,6 +14899,7 @@ Current version indicated by LITEVER below.
 		redo_arr = [];
 		last_request_str = "No Requests Available";
 		last_response_obj = null;
+		last_response_streamlog = "";
 		retry_prev_text = [];
 		retry_preserve_last = false;
 		retry_in_progress = false;
@@ -17355,6 +17365,7 @@ Current version indicated by LITEVER below.
 
 				last_request_str = JSON.stringify(submit_payload);
 				last_response_obj = null;
+				last_response_streamlog = "";
 				if (localsettings.tokenstreammode==2 && is_using_kcpp_with_sse()) {
 					let sub_endpt = apply_proxy_url(custom_kobold_endpoint + kobold_custom_gen_stream_endpoint);
 					kobold_api_stream_sse(sub_endpt, submit_payload);
@@ -17527,11 +17538,15 @@ Current version indicated by LITEVER below.
 
 				last_request_str = JSON.stringify(oai_payload);
 				last_response_obj = null;
+				last_response_streamlog = "";
 				let oaiheaders = {
-					'Content-Type': 'application/json',
-					'Authorization': 'Bearer ' + custom_oai_key
+					'Content-Type': 'application/json'
 				};
 
+				if (!targetep.toLowerCase().includes("pollinations.ai")) {
+                    oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
+                }
+
 				if(targetep.toLowerCase().includes("openrouter.ai"))
 				{
 					oaiheaders["HTTP-Referer"] = "https://lite.koboldai.net";
@@ -17664,6 +17679,7 @@ Current version indicated by LITEVER below.
 
 				last_request_str = JSON.stringify(claude_payload);
 				last_response_obj = null;
+				last_response_streamlog = "";
 
 				let claudeheaders = {
 					'Content-Type': 'application/json',
@@ -17854,6 +17870,7 @@ Current version indicated by LITEVER below.
 
 				last_request_str = JSON.stringify(payload);
 				last_response_obj = null;
+				last_response_streamlog = "";
 
 				let geminiheaders = { 'Content-Type': 'application/json' };
 				if(is_browser_supports_sse() && localsettings.tokenstreammode!=0)
@@ -17902,6 +17919,7 @@ Current version indicated by LITEVER below.
 
 				last_request_str = JSON.stringify(cohere_payload);
 				last_response_obj = null;
+				last_response_streamlog = "";
 				let cohere_headers = {
 					'Content-Type': 'application/json',
 					'Authorization': 'Bearer ' + custom_cohere_key
@@ -17977,6 +17995,7 @@ Current version indicated by LITEVER below.
 
 			last_request_str = JSON.stringify(submit_payload);
 			last_response_obj = null;
+			last_response_streamlog = "";
 
 			fetch(horde_submit_endpoint, {
 				method: 'POST', // or 'PUT'
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -75,13 +75,12 @@
 # extra_images_max = 4
 
 # global vars
-KcppVersion = "1.95100"
-LcppVersion = "b5771"
-IKLcppVersion = "IKLpr550"
-EsoboldVersion = "RMv1.13.2m"
+KcppVersion = "1.96000"
+LcppVersion = "b5828"
+IKLcppVersion = "IKLpr567"
+EsoboldVersion = "RMv1.14.0m"
 CudaSpecifics = "Cu128_Ar86_SMC2_DmmvX32Y1"
-ReleaseDate = "2025/06/28"
-
+ReleaseDate = "2025/07/05"
 showdebug = True
 # guimode = False
 
@@ -301,6 +300,7 @@ class sd_load_model_inputs(ctypes.Structure):
                 ("vulkan_info", ctypes.c_char_p),
                 ("threads", ctypes.c_int),
                 ("quant", ctypes.c_int),
+                ("flash_attention", ctypes.c_bool),
                 ("taesd", ctypes.c_bool),
                 ("tiled_vae_threshold", ctypes.c_int),
                 ("t5xxl_filename", ctypes.c_char_p),
@@ -2443,6 +2443,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
 
     inputs.threads = thds
     inputs.quant = quant
+    inputs.flash_attention = args.flashattention
     inputs.taesd = True if args.sdvaeauto else False
     inputs.tiled_vae_threshold = args.sdtiledvae
     inputs.vae_filename = vae_filename.encode("UTF-8")
@@ -9003,6 +9004,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
         args.threads = get_default_threads()
         print(f"Auto Set Threads: {args.threads}")
 
+    print(f"System: {platform.system()} {platform.version()} {platform.machine()} {platform.processor()}")
     if MaxMemory[0]>0:
         print(f"Detected Available GPU Memory: {int(MaxMemory[0]/1024/1024)} MB")
     else:
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -179,6 +179,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
         printf("With PhotoMaker Model: %s\n",photomaker_filename.c_str());
         photomaker_enabled = true;
     }
+    if(inputs.flash_attention)
+    {
+        printf("Flash Attention is enabled\n");
+    }
     if(inputs.quant)
     {
         printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -213,6 +217,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     sd_params->model_path = inputs.model_filename;
     sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
     sd_params->n_threads = inputs.threads; //if -1 use physical cores
+    sd_params->diffusion_flash_attn = inputs.flash_attention;
     sd_params->input_path = ""; //unused
     sd_params->batch_count = 1;
     sd_params->vae_path = vaefilename;

Original file line number	Diff line number	Diff line change
`@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r`
`109`	`109`	`}`
`110`	`110`
`111`	`111`	`void quantize_q4_0(device const float * src, device block_q4_0 & dst) {`
	`112`	`+#pragma METAL fp math_mode(safe)`
`112`	`113`	`float amax = 0.0f; // absolute max`
`113`	`114`	`float max = 0.0f;`
`114`	`115`
`@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {`
`167`	`168`	`}`
`168`	`169`
`169`	`170`	`void quantize_q5_0(device const float * src, device block_q5_0 & dst) {`
	`171`	`+#pragma METAL fp math_mode(safe)`
`170`	`172`	`float amax = 0.0f; // absolute max`
`171`	`173`	`float max = 0.0f;`
`172`	`174`
`@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re`
`461`	`463`	`}`
`462`	`464`
`463`	`465`	`void quantize_q8_0(device const float * src, device block_q8_0 & dst) {`
	`466`	`+#pragma METAL fp math_mode(safe)`
`464`	`467`	`float amax = 0.0f; // absolute max`
`465`	`468`
`466`	`469`	`for (int j = 0; j < QK8_0; j++) {`