Commit e88f28b

Fix KCPP side and cleanup

Also removes some safeguards from model loading, and fixes BUBS and the autoloader.
1 parent 61c1ce9 commit e88f28b

5 files changed: +238 -22 lines changed

class.py

Lines changed: 1 addition & 0 deletions
@@ -267,6 +267,7 @@ def _load(self, save_model: bool, initial_load: bool) -> None:
         kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
             port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
             psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize, blasbatchsize=self.kcpp_blasbatchsize,
+            blasubatchsize=self.kcpp_blasubatchsize,
             ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext, forceversion=0,
             nommap=self.kcpp_nommap, usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, usecpu=self.kcpp_usecpu,
             useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, usevulkan=self.kcpp_usevulkan, gpulayers=self.kcpp_gpulayers,

expose.h

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ struct load_model_inputs
     const int blasbatchsize = 128;
     const int blasubatchsize = 128;
     const int debugmode = 0;
-    const int forceversion = 0;
+    const int forceversion = 6;
     const int gpulayers = 0;
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;

gpttype_adapter.cpp

Lines changed: 231 additions & 18 deletions
@@ -40,7 +40,7 @@
 #include "mpt_v3.cpp"
 #include "examples/llava/clip.h"
 #include "examples/llava/llava.h"
-#include "experimental/emphasis.h"
+// #include "experimental/emphasis.h"

 //const
 const int extra_context_handle_fragmentation = 120;
@@ -1684,7 +1684,7 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
                             file_format==FileFormat::RWKV_2);
     if(!approved_format || desiredBlasBatchSize<=0)
     {
-        desiredBlasBatchSize = 16;
+        desiredBlasBatchSize = 32;
     }
     if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
     {
@@ -1708,7 +1708,7 @@ static int GetUBatchSize(int desiredBlasUBatchSize,FileFormat in_file_format)
                             file_format==FileFormat::RWKV_2);
     if(!approved_format || desiredBlasUBatchSize<=0)
     {
-        desiredBlasUBatchSize = 16;
+        desiredBlasUBatchSize = 32;
     }
     if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
     {
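Both helpers now fall back to a batch size of 32 (up from 16) whenever the requested value is non-positive or the file format is not approved for large-batch processing. A minimal sketch of just that fallback, with a hypothetical helper name (the real GetBatchSize/GetUBatchSize continue with further format-specific adjustments, as the context lines above show):

// Sketch only: distills the changed fallback (16 -> 32); not the project's full logic.
static int fallback_blas_size(int desired, bool approved_format)
{
    if (!approved_format || desired <= 0)
    {
        desired = 32; // was 16 before this commit
    }
    return desired;
}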
@@ -1759,14 +1759,14 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     // }
         return rope_freq_base_with_positive_offset;
     }
-    else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
+    /* else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
     {
         float extended_rope_negative_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / (3.14159265358979323846 * 3.14159265358979323846));
         float rope_freq_base_with_negative_offset = gradient_ai_rope_freq_base_value / extended_rope_negative_offset_value;
         printf("Extended RoPE Negative Offset (divisor) for Llama 1 and 2 based models. (value:%.3f).\n", extended_rope_negative_offset_value);
         printf("RoPE base calculated via Gradient AI formula for Llama 1 and 2 based models. (value:%.1f).\n", rope_freq_base_with_negative_offset);
         return rope_freq_base_with_negative_offset;
-    }
+    } */
     else
     {
         return gradient_ai_rope_freq_base_value;
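For reference, the branch being disabled computed the Llama 1/2 adjustment shown in the commented-out lines above; written out as math (with chi_ctx_value and chi_ctx_train_value understood as the effective target and training context values used by this function):

\text{offset} = 1 + \frac{\log_{10}(\text{chi\_ctx\_value}) - \log_{10}(\text{chi\_ctx\_train\_value})}{\pi^{2}}, \qquad
\text{rope\_freq\_base\_with\_negative\_offset} = \frac{\text{gradient\_ai\_rope\_freq\_base\_value}}{\text{offset}}

With this commit (and ARCH_MISTRAL_LLAMA_1_AND_2 commented out in model_adapter.h below), those models appear to fall through to the plain Gradient AI value returned by the final else.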
@@ -2566,8 +2566,220 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        printf("\nUnknown Model, cannot load.\n");
-        return ModelLoadResult::FAIL;
+        // printf("\nUnknown Model, cannot load.\n");
+        // return ModelLoadResult::FAIL;
+        llama_backend_init();
+
+        llama_model_params model_params = llama_model_default_params();
+        llama_context_params llama_ctx_params = llama_context_default_params();
+        llama_ctx_params.n_ctx = clamped_max_context_length;
+        if(kcpp_data->use_contextshift)
+        {
+            llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
+        }
+
+        llama_ctx_params.offload_kqv = !inputs.low_vram;
+        llama_ctx_params.logits_all = false;
+        model_params.use_mmap = inputs.use_mmap;
+        model_params.use_mlock = inputs.use_mlock;
+        model_params.n_gpu_layers = inputs.gpulayers;
+
+        #if defined(GGML_USE_CLBLAST)
+        if(file_format==FileFormat::GGUF_GENERIC && model_params.n_gpu_layers>0)
+        {
+            if(file_format_meta.model_architecture == GGUFArch::ARCH_FALCON)
+            {
+                printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
+                model_params.n_gpu_layers = 0;
+            }
+            else if(file_format_meta.n_expert_count>1)
+            {
+                printf("\nOpenCL cannot use regular GPU offloading for this model architecture. A fallback GPU offloader will be used with degraded performance.\n");
+
+            }
+        }
+        #endif
+        #if defined(GGML_USE_CUDA)
+        if(cu_parseinfo_maindevice>0)
+        {
+            printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
+        }
+        ggml_cuda_set_mul_mat_q(inputs.use_mmq);
+        if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && !kcpp_data->flash_attn)
+        {
+            printf("CUBLAS: Warning, you are running Qwen2 without Flash Attention and may observe incoherent output.\n");
+        }
+        #endif
+        model_params.main_gpu = cu_parseinfo_maindevice;
+
+        #if defined(GGML_USE_CUDA)
+        model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
+        #else
+        model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+        #endif
+
+        llama_ctx_params.n_batch = kcpp_data->n_batch;
+        llama_ctx_params.n_ubatch = kcpp_data->n_ubatch;
+        llama_ctx_params.n_threads = kcpp_data->n_threads;
+        llama_ctx_params.n_threads_batch = kcpp_data->n_blasthreads;
+
+        #if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
+        bool ts_all_zero = true;
+        for (int i = 0; i < tensor_split_max; ++i) {
+            if (inputs.tensor_split[i] != 0.0f) {
+                ts_all_zero = false;
+                break;
+            }
+        }
+        if(!ts_all_zero)
+        {
+            printf("\nApplying Tensor Split...");
+            model_params.tensor_split = inputs.tensor_split;
+        }
+        #endif
+
+        //compat for old falcon
+        if(file_format_meta.fileversion==1)
+        {
+            //apply compat fix
+            printf("\nUsing older tokenizer for GGUFv1...");
+            OldBPETokenizerMode = true;
+        }
+
+        llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);
+        if(overwriteRope)
+        {
+            llama_ctx_params.rope_freq_base = rope_freq_base;
+            llama_ctx_params.rope_freq_scale = rope_freq_scale;
+        }
+        else
+        {
+            //if the model modifes rope in any way, or uses yarn, use the model values. Otherwise, use our automatic ones
+            //special exception for llama, which uses auto scale
+            if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
+                llamamodel->hparams.rope_freq_scale_train!=1.0f ||
+                llamamodel->hparams.rope_scaling_type_train==2)
+            {
+                printf("Automatic RoPE Scaling: Using model internal value.\n");
+            }
+            else
+            {
+                //Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
+                rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_data->n_ctx, file_format_meta.model_architecture);
+                llama_ctx_params.rope_freq_base = rope_freq_base;
+                llama_ctx_params.rope_freq_scale = rope_freq_scale;
+                printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+            }
+        }
+
+        if(file_format_meta.model_architecture==GGUFArch::ARCH_RWKV)
+        {
+            printf("\nRWKV6 Overriding EOS and BOS IDs to 0\n");
+            llamamodel->vocab.special_bos_id = llamamodel->vocab.special_eos_id = 0;
+        }
+
+        llama_ctx_params.flash_attn = kcpp_data->flash_attn;
+        llama_ctx_params.type_k =
+            (inputs.quant_k==22?GGML_TYPE_IQ4_NL:
+            (inputs.quant_k==21?GGML_TYPE_Q4_0:
+            (inputs.quant_k==20?GGML_TYPE_Q4_1:
+            (inputs.quant_k==19?GGML_TYPE_Q5_0:
+            (inputs.quant_k==18?GGML_TYPE_Q5_1:
+            (inputs.quant_k==17?GGML_TYPE_Q6_0:
+            (inputs.quant_k==16?GGML_TYPE_Q8_0:
+            (inputs.quant_k==15?GGML_TYPE_F16:
+            (inputs.quant_k==14?GGML_TYPE_IQ4_NL:
+            (inputs.quant_k==13?GGML_TYPE_Q5_0:
+            (inputs.quant_k==12?GGML_TYPE_Q5_1:
+            (inputs.quant_k==11?GGML_TYPE_Q5_1:
+            (inputs.quant_k==10?GGML_TYPE_Q6_0:
+            (inputs.quant_k==9?GGML_TYPE_Q6_0:
+            (inputs.quant_k==8?GGML_TYPE_Q6_0:
+            (inputs.quant_k==7?GGML_TYPE_Q8_0:
+            (inputs.quant_k==6?GGML_TYPE_Q8_0:
+            (inputs.quant_k==5?GGML_TYPE_Q8_0:
+            (inputs.quant_k==4?GGML_TYPE_F16:
+            (inputs.quant_k==3?GGML_TYPE_F16:
+            (inputs.quant_k==2?GGML_TYPE_Q4_0:
+            (inputs.quant_k==1?GGML_TYPE_Q8_0:
+            GGML_TYPE_F16))))))))))))))))))))));
+        llama_ctx_params.type_v =
+            (inputs.quant_v==22?GGML_TYPE_F16:
+            (inputs.quant_v==21?GGML_TYPE_F16:
+            (inputs.quant_v==20?GGML_TYPE_F16:
+            (inputs.quant_v==19?GGML_TYPE_F16:
+            (inputs.quant_v==18?GGML_TYPE_F16:
+            (inputs.quant_v==17?GGML_TYPE_F16:
+            (inputs.quant_v==16?GGML_TYPE_F16:
+            (inputs.quant_v==15?GGML_TYPE_F16:
+            (inputs.quant_v==14?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==13?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==12?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==11?GGML_TYPE_Q5_0:
+            (inputs.quant_v==10?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==9?GGML_TYPE_Q5_0:
+            (inputs.quant_v==8?GGML_TYPE_Q6_0:
+            (inputs.quant_v==7?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==6?GGML_TYPE_Q5_0:
+            (inputs.quant_v==5?GGML_TYPE_Q6_0:
+            (inputs.quant_v==4?GGML_TYPE_Q6_0:
+            (inputs.quant_v==3?GGML_TYPE_Q8_0:
+            (inputs.quant_v==2?GGML_TYPE_Q4_0:
+            (inputs.quant_v==1?GGML_TYPE_Q8_0:
+            GGML_TYPE_F16))))))))))))))))))))));
+        llama_ctx_v4 = llama_new_context_with_model(llamamodel, llama_ctx_params);
+
+        if (llama_ctx_v4 == NULL)
+        {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, kcpp_data->model_filename.c_str());
+            return ModelLoadResult::FAIL;
+        }
+        if (lora_filename != "")
+        {
+            printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+
+            const char * lora_base_arg = NULL;
+            if (lora_base != "") {
+                printf("Using LORA base model: %s\n", lora_base.c_str());
+                lora_base_arg = lora_base.c_str();
+            }
+
+            auto adapter = llama_lora_adapter_init(llamamodel, lora_filename.c_str());
+            if (adapter == nullptr) {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                return ModelLoadResult::FAIL;
+            }
+            llama_lora_adapter_set(llama_ctx_v4, adapter, 1.0f);
+        }
+
+        if(mmproj_filename != "" && file_format==FileFormat::GGUF_GENERIC)
+        {
+            printf("\nAttempting to apply Multimodal Projector: %s\n", mmproj_filename.c_str());
+            clp_ctx = clip_model_load(mmproj_filename.c_str(), /*verbosity=*/ 1);
+            if(clp_ctx == nullptr) {
+                fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
+                return ModelLoadResult::FAIL;
+            }
+            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
+            const int n_embd_llm = llama_n_embd(llamamodel);
+            if (n_embd_clip != n_embd_llm) {
+                fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm);
+                return ModelLoadResult::FAIL;
+            }
+            clp_img_data = clip_image_u8_init();
+        }
+
+        n_vocab = llama_n_vocab(llamamodel);
+
+        //determine mem per token
+        std::vector<int> tmp = {1, 2, 3, 4};
+        llama_kv_cache_clear(llama_ctx_v4);
+        auto er = llama_decode(llama_ctx_v4, llama_batch_get_one(tmp.data(), tmp.size()));
+        if(er!=0)
+        {
+            printf("\nLLAMA EVAL returned nonzero: %d\n",er);
+        }
+        return ModelLoadResult::SUCCESS;
     }

 }
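The two ternary chains above map the integer quant_k / quant_v settings onto ggml KV-cache tensor types. As a reading aid only, the same mapping can be written as lookup tables; this is a sketch, not the commit's code, and it assumes ggml.h is in scope for ggml_type (GGML_TYPE_Q6_0 and GGML_TYPE_IQ4_NL are available in the ggml bundled with this fork, not necessarily in every upstream build):

// Sketch: table-driven restatement of the quant_k / quant_v -> cache type mapping above.
#include "ggml.h"

static const ggml_type kcache_types[23] = {
    /* 0*/ GGML_TYPE_F16,    /* 1*/ GGML_TYPE_Q8_0,   /* 2*/ GGML_TYPE_Q4_0,
    /* 3*/ GGML_TYPE_F16,    /* 4*/ GGML_TYPE_F16,    /* 5*/ GGML_TYPE_Q8_0,
    /* 6*/ GGML_TYPE_Q8_0,   /* 7*/ GGML_TYPE_Q8_0,   /* 8*/ GGML_TYPE_Q6_0,
    /* 9*/ GGML_TYPE_Q6_0,   /*10*/ GGML_TYPE_Q6_0,   /*11*/ GGML_TYPE_Q5_1,
    /*12*/ GGML_TYPE_Q5_1,   /*13*/ GGML_TYPE_Q5_0,   /*14*/ GGML_TYPE_IQ4_NL,
    /*15*/ GGML_TYPE_F16,    /*16*/ GGML_TYPE_Q8_0,   /*17*/ GGML_TYPE_Q6_0,
    /*18*/ GGML_TYPE_Q5_1,   /*19*/ GGML_TYPE_Q5_0,   /*20*/ GGML_TYPE_Q4_1,
    /*21*/ GGML_TYPE_Q4_0,   /*22*/ GGML_TYPE_IQ4_NL,
};

static const ggml_type vcache_types[23] = {
    /* 0*/ GGML_TYPE_F16,    /* 1*/ GGML_TYPE_Q8_0,   /* 2*/ GGML_TYPE_Q4_0,
    /* 3*/ GGML_TYPE_Q8_0,   /* 4*/ GGML_TYPE_Q6_0,   /* 5*/ GGML_TYPE_Q6_0,
    /* 6*/ GGML_TYPE_Q5_0,   /* 7*/ GGML_TYPE_IQ4_NL, /* 8*/ GGML_TYPE_Q6_0,
    /* 9*/ GGML_TYPE_Q5_0,   /*10*/ GGML_TYPE_IQ4_NL, /*11*/ GGML_TYPE_Q5_0,
    /*12*/ GGML_TYPE_IQ4_NL, /*13*/ GGML_TYPE_IQ4_NL, /*14*/ GGML_TYPE_IQ4_NL,
    /*15*/ GGML_TYPE_F16,    /*16*/ GGML_TYPE_F16,    /*17*/ GGML_TYPE_F16,
    /*18*/ GGML_TYPE_F16,    /*19*/ GGML_TYPE_F16,    /*20*/ GGML_TYPE_F16,
    /*21*/ GGML_TYPE_F16,    /*22*/ GGML_TYPE_F16,
};

// Any setting outside 0..22 falls back to F16, matching the final else of both chains.
static ggml_type cache_type_from_setting(const ggml_type (&table)[23], int setting)
{
    return (setting >= 0 && setting < 23) ? table[setting] : GGML_TYPE_F16;
}

// Hypothetical usage, equivalent to the chains for all integer inputs:
//   llama_ctx_params.type_k = cache_type_from_setting(kcache_types, inputs.quant_k);
//   llama_ctx_params.type_v = cache_type_from_setting(vcache_types, inputs.quant_v);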
@@ -3150,7 +3362,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        }
    }

-    bool blasmode = (embd_inp.size() >= 1 && kcpp_cpu_has_blas() && kcpp_data->n_batch>=1);
+    bool blasmode = (embd_inp.size() >= 32 && kcpp_cpu_has_blas() && kcpp_data->n_batch>=32);

    current_context_tokens.resize(n_past);

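The BLAS gate is now stricter: prompts shorter than 32 tokens, or an n_batch below 32, no longer enter BLAS batch processing, whereas previously any non-empty prompt qualified. A standalone restatement of the new predicate (sketch only, not the project's code):

// Sketch: the reworked blasmode condition in isolation.
#include <cstddef>

static bool blas_mode_enabled(std::size_t prompt_tokens, bool has_blas, int n_batch)
{
    // was: prompt_tokens >= 1 && has_blas && n_batch >= 1
    return prompt_tokens >= 32 && has_blas && n_batch >= 32;
}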
@@ -3240,9 +3452,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        printf("%s\n\n", RemoveBell(outstr).c_str());
    }

-    if (llama_ctx_v4) {
-        empcats_init(llama_ctx_v4, embd_inp, grammarstr);
-    }
+    // if (llama_ctx_v4) {
+    //     empcats_init(llama_ctx_v4, embd_inp, grammarstr);
+    // }

    while (remaining_tokens > 0)
    {
@@ -3432,16 +3644,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            std::vector<int>& bans = antislop_banned_token_ids[n_past];

            //unwanted print, but.. ^^
-            print_tok_vec_str(bans);
+            // print_tok_vec_str(bans);
+
            for(int t=0;t<bans.size();++t)
            {
                logitsPtr[bans[t]]=lowestLogit;
            }
        }

-        if (llama_ctx_v4) {
-            empcats_step_pre(llama_ctx_v4, logitsPtr);
-        }
+        // if (llama_ctx_v4) {
+        //     empcats_step_pre(llama_ctx_v4, logitsPtr);
+        // }

        id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, kcpp_data->rep_pen_slope, presence_penalty,
            top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
@@ -3450,9 +3663,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            kcpp_data->dry_allowed_length, kcpp_data->dry_penalty_last_n, kcpp_data->xtc_threshold, kcpp_data->xtc_probability,
            sampler_order, grammar, dynatemp_range, dynatemp_exponent, smoothing_factor);

-        if (llama_ctx_v4) {
-            empcats_step_post(llama_ctx_v4, id );
-        }
+        // if (llama_ctx_v4) {
+        //     empcats_step_post(llama_ctx_v4, id );
+        // }

        if (grammar != nullptr) {
            grammar_accept_token(file_format, n_vocab, grammar, id);

koboldcpp.py

Lines changed: 3 additions & 1 deletion
@@ -3309,6 +3309,7 @@ def changerunmode(a,b,c):

    # load model
    makefileentry(gpu_al_tab, "Model:", "Select GGML Model File", model_var, 40, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
+    model_var.trace("w", gui_changed_modelfile)

    ctk.CTkButton(gpu_al_tab, text = "Run Benchmark", command = guibench ).grid(row=45,column=0, stick="se", padx= 0, pady=2)

@@ -3340,6 +3341,7 @@ def togglerope(a,b,c):

    # load model
    makefileentry(tokens_tab, "Model:", "Select GGML or GGML Model File", model_var, 50, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
+    model_var.trace("w", gui_changed_modelfile)

    togglerope(1,1,1)
    toggleflashattn(1,1,1)
@@ -3540,7 +3542,7 @@ def export_vars():
    args.quiet = quietmode.get()==1
    args.nocertify = nocertifymode.get()==1
    args.nomodel = nomodel.get()==1
-    args.quantkv = int(quantkv_values[int(quantkv_var.get())])
+    # args.quantkv = int(quantkv_values[int(quantkv_var.get())])

    args.poslayeroffset = int(poslayeroffset_values[int(poslayeroffset_var.get())])
    args.neglayeroffset = int(neglayeroffset_values[int(neglayeroffset_var.get())])

model_adapter.h

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ enum GGUFArch
    ARCH_SOLAR = 4,
    ARCH_QWEN2 = 5,
    ARCH_RWKV = 6,
-    ARCH_MISTRAL_LLAMA_1_AND_2 = 50,
+    // ARCH_MISTRAL_LLAMA_1_AND_2 = 50,
};

struct FileFormatExtraMeta
@@ -68,7 +68,7 @@ struct FileFormatExtraMeta
    int fileversion = 0;
    GGUFArch model_architecture = GGUFArch::ARCH_DEFAULT;
    int n_expert_count = 0;
-    int32_t n_tensors;
+    // int32_t n_tensors;
};

struct TopPicksData
