Commit 41e6742

Prep for kv-override
Another attempt..

1 parent 6e3af2e

3 files changed: +58, -1 lines changed

expose.h

Lines changed: 3 additions & 0 deletions

@@ -41,6 +41,8 @@ struct load_model_inputs
    const char * lora_filename = nullptr;
    const char * lora_base = nullptr;
    const char * mmproj_filename = nullptr;
+   const char * kv_override = nullptr;
+   const char * data, std::vector<llama_model_kv_override> & overrides
    const bool use_mmap = false;
    const bool use_mlock = false;
    const bool use_smartcontext = false;
@@ -194,6 +196,7 @@ extern std::string executable_path;
extern std::string lora_filename;
extern std::string lora_base;
extern std::string mmproj_filename;
+extern std::vector<llama_model_kv_override> kv_overrides;
extern std::vector<std::string> generated_tokens;
extern bool generation_finished;
extern float last_eval_time;
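
The new kv_override field carries the raw override string from the launcher into the loader, and the std::vector<llama_model_kv_override> is where parsed entries would eventually live. As a rough illustration of the "key=type:value" format this commit prepares for (e.g. llama.attention.layer_norm_rms_epsilon=float:1e-5), here is a minimal, self-contained parsing sketch; the ParsedOverride struct and parse_kv_override_token helper are hypothetical stand-ins invented for this example, not part of this commit or of llama.cpp, whose llama_model_kv_override field names vary between versions.

#include <string>

// Hypothetical stand-in for llama_model_kv_override, used only to
// illustrate the "key=type:value" format this commit prepares for.
struct ParsedOverride {
    std::string key;    // e.g. "llama.attention.layer_norm_rms_epsilon"
    std::string type;   // "int", "float", "bool" or "str"
    std::string value;  // raw value text, e.g. "1e-5"
};

// Split a single override token such as
// "llama.attention.layer_norm_rms_epsilon=float:1e-5".
// Returns false if the token is malformed.
static bool parse_kv_override_token(const std::string & token, ParsedOverride & out)
{
    const auto eq = token.find('=');
    if (eq == std::string::npos) {
        return false;
    }
    const auto colon = token.find(':', eq + 1);
    if (colon == std::string::npos) {
        return false;
    }
    out.key   = token.substr(0, eq);
    out.type  = token.substr(eq + 1, colon - eq - 1);
    out.value = token.substr(colon + 1);
    return !out.key.empty() && !out.type.empty() && !out.value.empty();
}

In the actual loader, each successfully parsed token would then be converted to a llama_model_kv_override and appended to the kv_overrides vector declared above.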

gpttype_adapter.cpp

Lines changed: 5 additions & 0 deletions

@@ -52,6 +52,8 @@ std::string executable_path = "";
std::string lora_filename = "";
std::string lora_base = "";
std::string mmproj_filename = "";
+
+std::vector<llama_model_kv_override> kv_overrides;
bool generation_finished;
float last_process_time = 0;
float last_eval_time = 0;
@@ -1689,6 +1691,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    kcpp_data->n_ubatch = GetUBatchSize(inputs.blasubatchsize, in_file_format);
    kcpp_data->flash_attn = inputs.flash_attention;
    kcpp_data->model_filename = inputs.model_filename;
+   kcpp_data->kv_override = inputs.kv_override;
    kcpp_data->use_smartcontext = inputs.use_smartcontext;
    kcpp_data->use_contextshift = inputs.use_contextshift;
    debugmode = inputs.debugmode;
@@ -1898,6 +1901,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    llama_model_params model_params = llama_model_default_params();
    llama_context_params llama_ctx_params = llama_context_default_params();
    llama_ctx_params.n_ctx = clamped_max_context_length;
+   llama_model_kv_override kv_override = inputs.kv_override;
+
    if(kcpp_data->use_contextshift)
    {
        llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
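
The new assignment in the third hunk places the override data right where llama_model_params is built, which suggests the eventual goal: populating model_params.kv_overrides before the model is loaded. A minimal sketch of that wiring is below, assuming the override string has already been parsed into the kv_overrides vector declared in expose.h and defined above; it mirrors the convention used by llama.cpp's own --override-kv handling, where the override array handed to the loader is terminated by an entry with an empty key. The exact field layout of llama_model_kv_override differs between llama.cpp versions, so treat this as an outline rather than drop-in code.

// Sketch only: hand parsed overrides to the model loader.
// Assumes llama.h is included (it provides llama_model_params and
// llama_model_kv_override) and that kv_overrides was already filled
// from inputs.kv_override earlier in gpttype_load_model.
llama_model_params model_params = llama_model_default_params();

if (!kv_overrides.empty())
{
    // llama.cpp expects the kv_overrides array to end with an entry
    // whose key is an empty string, so append a terminator first.
    kv_overrides.emplace_back();
    kv_overrides.back().key[0] = 0;
    model_params.kv_overrides = kv_overrides.data();
}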

koboldcpp.py

Lines changed: 50 additions & 1 deletion

@@ -130,6 +130,7 @@ class load_model_inputs(ctypes.Structure):
                ("mmproj_filename", ctypes.c_char_p),
                ("use_mmap", ctypes.c_bool),
                ("use_mlock", ctypes.c_bool),
+               ("kv_override", ctypes.c_char_p),
                ("use_smartcontext", ctypes.c_bool),
                ("use_contextshift", ctypes.c_bool),
                ("clblast_info", ctypes.c_int),
@@ -1119,6 +1120,7 @@ def load_model(model_filename):
    inputs.blasthreads = args.blasthreads
    inputs.use_mmap = (not args.nommap)
    inputs.use_mlock = args.usemlock
+   inputs.kv_override = "".encode("UTF-8")
    inputs.lora_filename = "".encode("UTF-8")
    inputs.lora_base = "".encode("UTF-8")
    if args.lora:
@@ -2689,7 +2691,7 @@ def hide_tooltip(event):

    tabs = ctk.CTkFrame(root, corner_radius = 0, width=windowwidth, height=windowheight-50)
    tabs.grid(row=0, stick="nsew")
-   tabnames= ["Quick Launch", "Hardware", "Tokens", "GPU AutoLayers", "Model Files", "Network", "Horde Worker","Image Gen","Audio","Extra"]
+   tabnames= ["Quick Launch", "Hardware", "Tokens", "GPU AutoLayers", "Model Files", "Network", "Horde Worker","Image Gen","Audio","Extra", "Croco"]
    navbuttons = {}
    navbuttonframe = ctk.CTkFrame(tabs, width=100, height=int(tabs.cget("height")))
    navbuttonframe.grid(row=0, column=0, padx=2,pady=2)
@@ -2789,6 +2791,8 @@ def hide_tooltip(event):
    mmproj_var = ctk.StringVar()
    nomodel = ctk.IntVar(value=0)

+   kv_override_var = ctk.StringVar(value="")
+
    port_var = ctk.StringVar(value=defaultport)
    host_var = ctk.StringVar(value="")
    multiuser_var = ctk.IntVar(value=1)
@@ -3238,6 +3242,8 @@ def changerunmode(a,b,c):

    tensor_split_entry,tensor_split_label = makelabelentry(gpu_al_tab, "Tensor Split:", tensor_split_str_vars, 8, 160, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')

+
+
    # load model
    makefileentry(gpu_al_tab, "Model:", "Select GGML Model File", model_var, 40, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")

@@ -3269,6 +3275,8 @@ def togglerope(a,b,c):
    noqkvlabel.configure(text_color="#ff5555")
    qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 22, 30, set=0,tooltip="Enable quantization of KV cache (KVQ). Mode 0 (F16) is default. Modes 1-12 require FlashAttention and disable ContextShift.\nModes 15-20 work without FA, for incompatible models. 0,13,14 can work with or without.")

+
+
    # load model
    makefileentry(tokens_tab, "Model:", "Select GGML or GGUF Model File", model_var, 50, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")

@@ -3424,6 +3432,40 @@ def kcpp_export_template():
    makelabel(extra_tab, "Export as launcher .kcppt template (Expert Only)", 4, 0,tooltiptxt="Creates a KoboldCpp launch template for others to use.\nEmbeds JSON files directly into exported file when saving.\nWhen loaded, forces the backend to be automatically determined.\nWarning! Not recommended for beginners!")
    ctk.CTkButton(extra_tab , text = "Generate LaunchTemplate", command = kcpp_export_template ).grid(row=5,column=0, stick="w", padx= 8, pady=2)

+   # croco tab
+   croco_tab = tabcontent["Croco"]
+
+   # makelabelentry(croco_tab, "Context Size:" , context_var, 2, 160,tooltip="How many threads to use during BLAS processing.\nIf left blank, uses same value as regular thread count.")
+
+   makelabelentry(croco_tab, "Threads:" , threads_var, 4, 80,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
+
+   makelabelentry(croco_tab, "BLAS threads:" , blas_threads_var, 6, 80,tooltip="How many threads to use during BLAS processing.\nIf left blank, uses same value as regular thread count.")
+
+   # makelabelentry(croco_tab, "Logical Blas Batch Size:" , blas_size_var, 8, 160,tooltip="How many tokens to process at once per batch.\nLarger values use more memory unless Physical Batch supersedes it.")
+
+   # makelabelentry(croco_tab, "Physical Blas Batch Size:" , blasubatchsize_var, 10, 160,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
+
+   makelabelentry(croco_tab, "GPU Layers:", gpulayers_var, 12, 80,tooltip="How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.\n\nCommon values for total layers, accuracy not guaranteed.\n\nLlama/Mistral 7b/8b: 33\nSolar 10.7b/11b: 49\nLlama 13b: 41\nLlama 20b(stack): 63\nLlama/Yi 34b: 61\nMixtral 8x7b: 33\nLlama 70b: 81")
+
+   makelabelentry(croco_tab, "Positive Layer Offset:", poslayeroffset_var, 14, 80, tooltip="Adds layers to the GPU layers autoloader calculation in case of under-utilization of your GPU(s).")
+
+   makelabelentry(croco_tab, "Negative Layer Offset:", neglayeroffset_var, 16, 80, tooltip="Removes layers from the GPU layers autoloader calculation in case of an Out of Memory (OOM) error.")
+
+   makelabelentry(croco_tab, "Tensor Split:", tensor_split_str_vars, 18, 280, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')
+
+   makelabelentry(croco_tab, "RoPE Scale:", customrope_scale, 20, 80, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
+
+   makelabelentry(croco_tab, "RoPE Base:", customrope_base, 22, 160, tooltip="For NTK Aware Scaling. RoPE frequency base.")
+
+   makelabelentry(croco_tab, "Quantize KV Cache:", quantkv_var, 24, 80, tooltip="Enable quantization of KV cache (KVQ). Mode 0 (F16) is default. Modes 1-12 require FlashAttention and disable ContextShift.\nModes 15-20 work without FA, for incompatible models. 0,13,14 can work with or without.")
+
+   makelabelentry(croco_tab, "Opt. model metadata KV override:", kv_override_var, 26, 420, tooltip="Supersede metadata of a model, like Epsilon, e.g.: llama.attention.layer_norm_rms_epsilon=float:1e5, 1.25e5, 3e6, etc.")
+
+   makefileentry(croco_tab, "Model:", "Select GGML or GGUF Model File", model_var, 28, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
+   model_var.trace("w", gui_changed_modelfile)
+
+   ctk.CTkButton(croco_tab, text = "Run Benchmark", command = guibench ).grid(row=32, stick="se", padx= 0, pady=0)
+
    # launch
    def guilaunch():
        if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1:
@@ -3550,6 +3592,8 @@ def export_vars():
        pass
    args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()

+   args.kv_override = None if kv_override_var.get() == "" else kv_override_var.get()
+
    args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
    args.password = None if (password_var.get() == "") else (password_var.get())

@@ -3710,6 +3754,8 @@ def import_vars(dict):

    mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")

+   kv_override_var.set(dict["kv_override"] if ("kv_override" in dict and dict["kv_override"]) else "")
+
    ssl_cert_var.set("")
    ssl_key_var.set("")
    if "ssl" in dict and dict["ssl"]:
@@ -5014,6 +5060,9 @@ def range_checker(arg: str):
    advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
    advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
    advparser.add_argument("--mmproj", help="Select a multimodal projector file for LLaVA.", default="")
+
+   advparser.add_argument("--kv_override", help="Supersede metadata of a model, like Epsilon (e.g.: llama.attention.layer_norm_rms_epsilon=float:1e5, 1.25e5, 3e6, etc.)", metavar=('[kv_override]'), nargs='+')
+
    advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
    advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
    advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
