Skip to content

Commit 6d6ec07

Browse files
committed
Merge branch 'concedo_experimental' into crokeso
2 parents 2a93c7a + 57ce374 commit 6d6ec07

File tree

8 files changed

+51
-9
lines changed

8 files changed

+51
-9
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,10 @@ add_compile_definitions(GGML_USE_CPU)
117117
add_compile_definitions(GGML_USE_CPU_AARCH64)
118118
add_compile_definitions(NOMINMAX)
119119

120+
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12)
121+
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
122+
endif()
123+
120124
if (MSVC)
121125
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
122126
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,11 @@ endif
332332
HCC := $(ROCM_PATH)/llvm/bin/clang
333333
HCXX := $(ROCM_PATH)/llvm/bin/clang++
334334
endif
335+
ifdef GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 # opt-in: define GGML_HIP_ROCWMMA_FATTN_GFX12 for HIP, C and C++ builds (needs the -D prefix to be a macro definition)
336+
HIPFLAGS += -DGGML_HIP_ROCWMMA_FATTN_GFX12
337+
CFLAGS += -DGGML_HIP_ROCWMMA_FATTN_GFX12
338+
CXXFLAGS += -DGGML_HIP_ROCWMMA_FATTN_GFX12
339+
endif
335340
ifdef LLAMA_NO_WMMA
336341
HIPFLAGS += -DGGML_HIP_NO_ROCWMMA_FATTN
337342
else

expose.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ struct sd_load_model_inputs
168168
const char * vulkan_info = nullptr;
169169
const int threads = 0;
170170
const int quant = 0;
171+
const bool flash_attention = false;
171172
const bool taesd = false;
172173
const int tiled_vae_threshold = 0;
173174
const char * t5xxl_filename = nullptr;

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
109109
}
110110

111111
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
112+
#pragma METAL fp math_mode(safe)
112113
float amax = 0.0f; // absolute max
113114
float max = 0.0f;
114115

@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
167168
}
168169

169170
void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
171+
#pragma METAL fp math_mode(safe)
170172
float amax = 0.0f; // absolute max
171173
float max = 0.0f;
172174

@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re
461463
}
462464

463465
void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
466+
#pragma METAL fp math_mode(safe)
464467
float amax = 0.0f; // absolute max
465468

466469
for (int j = 0; j < QK8_0; j++) {

ggml/src/ggml.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@
4646
#include <alloca.h>
4747
#endif
4848

49+
#define GGML_VERSION "0.0.1"
50+
#define GGML_COMMIT "KCPP"
51+
4952
#include <assert.h>
5053
#include <errno.h>
5154
#include <time.h>

klite.embd

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3401,6 +3401,7 @@ Current version indicated by LITEVER below.
34013401
var schedule_multiplayer_major_change = false;
34023402
var last_request_str = "No Requests Available"; //full context of last submitted request
34033403
var last_response_obj = null;
3404+
var last_response_streamlog = "";
34043405
var lastcheckgenkey = ""; //for checking polled-streaming unique id when generating in kcpp
34053406
var kai_poll_recoverykey = ""; //for recovering a lost polled streaming in case of disconnect.
34063407
var globalabortcontroller = null;
@@ -6376,6 +6377,10 @@ Current version indicated by LITEVER below.
63766377
},
63776378
transform(chunk, ctrl) {
63786379
ctrl.buf += chunk;
6380+
if(chunk)
6381+
{
6382+
last_response_streamlog += chunk;
6383+
}
63796384
let evs = [];
63806385
let m;
63816386
while ((m = /^data: ?(.*)(\r?\n){2}/m.exec(ctrl.buf)) !== null) {
@@ -10133,6 +10138,10 @@ Current version indicated by LITEVER below.
1013310138
{
1013410139
lr += "\n\nResponse:\n" + JSON.stringify(last_response_obj);
1013510140
}
10141+
if(last_response_streamlog)
10142+
{
10143+
lr += "\n\nResponse:\n" + last_response_streamlog;
10144+
}
1013610145
msgbox(lr,"Last Request Info",false);
1013710146
}
1013810147
function show_last_logprobs()
@@ -11191,7 +11200,7 @@ Current version indicated by LITEVER below.
1119111200
desired_oai_ep = transform_oai_ep(desired_oai_ep);
1119211201

1119311202
let oaiheaders = {};
11194-
if(desired_oai_key!=""){
11203+
if(desired_oai_key!="" && !desired_oai_ep.toLowerCase().includes("pollinations.ai")){
1119511204
oaiheaders["Authorization"] = "Bearer " + desired_oai_key;
1119611205
};
1119711206
if (desired_oai_ep.toLowerCase().includes("api.mistral.ai")) {
@@ -14890,6 +14899,7 @@ Current version indicated by LITEVER below.
1489014899
redo_arr = [];
1489114900
last_request_str = "No Requests Available";
1489214901
last_response_obj = null;
14902+
last_response_streamlog = "";
1489314903
retry_prev_text = [];
1489414904
retry_preserve_last = false;
1489514905
retry_in_progress = false;
@@ -17355,6 +17365,7 @@ Current version indicated by LITEVER below.
1735517365

1735617366
last_request_str = JSON.stringify(submit_payload);
1735717367
last_response_obj = null;
17368+
last_response_streamlog = "";
1735817369
if (localsettings.tokenstreammode==2 && is_using_kcpp_with_sse()) {
1735917370
let sub_endpt = apply_proxy_url(custom_kobold_endpoint + kobold_custom_gen_stream_endpoint);
1736017371
kobold_api_stream_sse(sub_endpt, submit_payload);
@@ -17527,11 +17538,15 @@ Current version indicated by LITEVER below.
1752717538

1752817539
last_request_str = JSON.stringify(oai_payload);
1752917540
last_response_obj = null;
17541+
last_response_streamlog = "";
1753017542
let oaiheaders = {
17531-
'Content-Type': 'application/json',
17532-
'Authorization': 'Bearer ' + custom_oai_key
17543+
'Content-Type': 'application/json'
1753317544
};
1753417545

17546+
if (!targetep.toLowerCase().includes("pollinations.ai")) {
17547+
oaiheaders['Authorization'] = 'Bearer ' + custom_oai_key;
17548+
}
17549+
1753517550
if(targetep.toLowerCase().includes("openrouter.ai"))
1753617551
{
1753717552
oaiheaders["HTTP-Referer"] = "https://lite.koboldai.net";
@@ -17664,6 +17679,7 @@ Current version indicated by LITEVER below.
1766417679

1766517680
last_request_str = JSON.stringify(claude_payload);
1766617681
last_response_obj = null;
17682+
last_response_streamlog = "";
1766717683

1766817684
let claudeheaders = {
1766917685
'Content-Type': 'application/json',
@@ -17854,6 +17870,7 @@ Current version indicated by LITEVER below.
1785417870

1785517871
last_request_str = JSON.stringify(payload);
1785617872
last_response_obj = null;
17873+
last_response_streamlog = "";
1785717874

1785817875
let geminiheaders = { 'Content-Type': 'application/json' };
1785917876
if(is_browser_supports_sse() && localsettings.tokenstreammode!=0)
@@ -17902,6 +17919,7 @@ Current version indicated by LITEVER below.
1790217919

1790317920
last_request_str = JSON.stringify(cohere_payload);
1790417921
last_response_obj = null;
17922+
last_response_streamlog = "";
1790517923
let cohere_headers = {
1790617924
'Content-Type': 'application/json',
1790717925
'Authorization': 'Bearer ' + custom_cohere_key
@@ -17977,6 +17995,7 @@ Current version indicated by LITEVER below.
1797717995

1797817996
last_request_str = JSON.stringify(submit_payload);
1797917997
last_response_obj = null;
17998+
last_response_streamlog = "";
1798017999

1798118000
fetch(horde_submit_endpoint, {
1798218001
method: 'POST', // or 'PUT'

koboldcpp.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,12 @@
7575
# extra_images_max = 4
7676

7777
# global vars
78-
KcppVersion = "1.95100"
79-
LcppVersion = "b5771"
80-
IKLcppVersion = "IKLpr550"
81-
EsoboldVersion = "RMv1.13.2m"
78+
KcppVersion = "1.96000"
79+
LcppVersion = "b5828"
80+
IKLcppVersion = "IKLpr567"
81+
EsoboldVersion = "RMv1.14.0m"
8282
CudaSpecifics = "Cu128_Ar86_SMC2_DmmvX32Y1"
83-
ReleaseDate = "2025/06/28"
84-
83+
ReleaseDate = "2025/07/05"
8584
showdebug = True
8685
# guimode = False
8786

@@ -301,6 +300,7 @@ class sd_load_model_inputs(ctypes.Structure):
301300
("vulkan_info", ctypes.c_char_p),
302301
("threads", ctypes.c_int),
303302
("quant", ctypes.c_int),
303+
("flash_attention", ctypes.c_bool),
304304
("taesd", ctypes.c_bool),
305305
("tiled_vae_threshold", ctypes.c_int),
306306
("t5xxl_filename", ctypes.c_char_p),
@@ -2443,6 +2443,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
24432443

24442444
inputs.threads = thds
24452445
inputs.quant = quant
2446+
inputs.flash_attention = args.flashattention
24462447
inputs.taesd = True if args.sdvaeauto else False
24472448
inputs.tiled_vae_threshold = args.sdtiledvae
24482449
inputs.vae_filename = vae_filename.encode("UTF-8")
@@ -9003,6 +9004,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
90039004
args.threads = get_default_threads()
90049005
print(f"Auto Set Threads: {args.threads}")
90059006

9007+
print(f"System: {platform.system()} {platform.version()} {platform.machine()} {platform.processor()}")
90069008
if MaxMemory[0]>0:
90079009
print(f"Detected Available GPU Memory: {int(MaxMemory[0]/1024/1024)} MB")
90089010
else:

otherarch/sdcpp/sdtype_adapter.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
179179
printf("With PhotoMaker Model: %s\n",photomaker_filename.c_str());
180180
photomaker_enabled = true;
181181
}
182+
if(inputs.flash_attention)
183+
{
184+
printf("Flash Attention is enabled\n");
185+
}
182186
if(inputs.quant)
183187
{
184188
printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -213,6 +217,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
213217
sd_params->model_path = inputs.model_filename;
214218
sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
215219
sd_params->n_threads = inputs.threads; //if -1 use physical cores
220+
sd_params->diffusion_flash_attn = inputs.flash_attention;
216221
sd_params->input_path = ""; //unused
217222
sd_params->batch_count = 1;
218223
sd_params->vae_path = vaefilename;

0 commit comments

Comments
 (0)