
Commit 5a921a4
add overridenativecontext flag, stop nagging me
1 parent 7ac0102 commit 5a921a4

File tree: 3 files changed, +69 −17 lines

expose.h
Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ struct load_model_inputs
     const int gpulayers = 0;
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
+    const int overridenativecontext = 0;
     const int moe_experts = -1;
     const int moecpu = 0;
     const bool no_bos_token = false;
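Note that load_model_inputs is shared across the C++/Python boundary, so the new field must appear at the same position in the Python ctypes mirror (the koboldcpp.py hunk below does exactly that). A minimal sketch of the pairing, trimmed to the affected fields, to show why the order matters:

    import ctypes

    # Sketch only: the real load_model_inputs has many more members.
    # Field order and types must match expose.h exactly, otherwise every
    # member after the insertion point is read from the wrong offset.
    class load_model_inputs(ctypes.Structure):
        _fields_ = [("rope_freq_scale", ctypes.c_float),
                    ("rope_freq_base", ctypes.c_float),
                    ("overridenativecontext", ctypes.c_int),  # new in this commit
                    ("moe_experts", ctypes.c_int)]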

gpttype_adapter.cpp
Lines changed: 12 additions & 3 deletions

@@ -2021,7 +2021,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     float rope_freq_scale = 1.0f;
     float rope_freq_base = 10000.0f;
     bool overwriteRope = false;
-    if(inputs.rope_freq_scale>0.0f)
+    if(inputs.rope_freq_scale>0.0f && inputs.overridenativecontext==0)
     {
         rope_freq_scale = inputs.rope_freq_scale;
         rope_freq_base = inputs.rope_freq_base;
@@ -2030,8 +2030,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
+        const int maxctxtrain = (inputs.overridenativecontext>0?inputs.overridenativecontext:2048);
         //Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
-        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_data->n_ctx, GGUFArch::ARCH_DEFAULT);
+        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,maxctxtrain,kcpp_data->n_ctx, GGUFArch::ARCH_DEFAULT);
         if(file_format==FileFormat::GGUF_GENERIC)
         {
             printf("Using automatic RoPE scaling for GGUF. If the model has custom RoPE settings, they'll be used directly instead!\n");
@@ -2369,7 +2370,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         //if the model modifes rope in any way, or uses yarn, use the model values. Otherwise, use our automatic ones
         //special exception for llama, which uses auto scale
-        if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
+        if(inputs.overridenativecontext > 0)
+        {
+            printf("Automatic RoPE Scaling: Adjust based on override train context of %d.\n",inputs.overridenativecontext);
+            rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, inputs.overridenativecontext, kcpp_data->n_ctx, file_format_meta.model_architecture);
+            llama_ctx_params.rope_freq_base = rope_freq_base;
+            llama_ctx_params.rope_freq_scale = rope_freq_scale;
+            printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+        }
+        else if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
             llamamodel->hparams.rope_freq_scale_train!=1.0f ||
             llamamodel->hparams.rope_scaling_type_train==2)
         {
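Taken together, the three hunks give RoPE configuration a clear precedence. A condensed sketch of the resulting decision flow (calc_base is a hypothetical stand-in for CalcGradientAIRopeFreqBase, and the handling of model metadata is simplified):

    def resolve_rope(inputs, hparams, n_ctx, calc_base):
        # Condensed precedence sketch, not code from the commit.
        if inputs.rope_freq_scale > 0.0 and inputs.overridenativecontext == 0:
            # 1. An explicit --ropeconfig wins, unless a native-context
            #    override was given (the new guard in the first hunk).
            return inputs.rope_freq_scale, inputs.rope_freq_base
        if inputs.overridenativecontext > 0:
            # 2. Override: rescale the model's own trained base against
            #    the user-declared trained context (third hunk).
            return 1.0, calc_base(hparams.rope_freq_base_train,
                                  inputs.overridenativecontext, n_ctx)
        # 3. Otherwise, automatic scaling from the 2048-token default,
        #    deferring to the model's own RoPE metadata when it has any.
        return 1.0, calc_base(10000.0, 2048, n_ctx)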

koboldcpp.py
Lines changed: 56 additions & 14 deletions

@@ -54,6 +54,7 @@
 net_save_slots = 12
 savestate_limit = 3 #3 savestate slots
 default_vae_tile_threshold = 768
+default_native_ctx = 16384

 # abuse prevention
 stop_token_max = 256
@@ -194,6 +195,7 @@ class load_model_inputs(ctypes.Structure):
                 ("gpulayers", ctypes.c_int),
                 ("rope_freq_scale", ctypes.c_float),
                 ("rope_freq_base", ctypes.c_float),
+                ("overridenativecontext", ctypes.c_int),
                 ("moe_experts", ctypes.c_int),
                 ("moecpu", ctypes.c_int),
                 ("no_bos_token", ctypes.c_bool),
@@ -1381,11 +1383,17 @@ def load_model(model_filename):
     inputs.blasbatchsize = args.blasbatchsize
     inputs.forceversion = args.forceversion
     inputs.gpulayers = args.gpulayers
-    inputs.rope_freq_scale = args.ropeconfig[0]
-    if len(args.ropeconfig)>1:
-        inputs.rope_freq_base = args.ropeconfig[1]
-    else:
+    if args.overridenativecontext and args.overridenativecontext>0:
+        inputs.overridenativecontext = args.overridenativecontext
+        inputs.rope_freq_scale = 0
         inputs.rope_freq_base = 10000
+    else:
+        inputs.overridenativecontext = 0
+        inputs.rope_freq_scale = args.ropeconfig[0]
+        if len(args.ropeconfig)>1:
+            inputs.rope_freq_base = args.ropeconfig[1]
+        else:
+            inputs.rope_freq_base = 10000

     for n in range(tensor_split_max):
         if args.tensor_split and n < len(args.tensor_split):
@@ -4507,8 +4515,10 @@ def hide_tooltip(event):
     flashattention_var = ctk.IntVar(value=0)
     context_var = ctk.IntVar()
     customrope_var = ctk.IntVar()
+    manualrope_var = ctk.IntVar()
     customrope_scale = ctk.StringVar(value="1.0")
     customrope_base = ctk.StringVar(value="10000")
+    customrope_nativectx = ctk.StringVar(value=str(default_native_ctx))
     chatcompletionsadapter_var = ctk.StringVar(value="AutoGuess")
     moeexperts_var = ctk.StringVar(value=str(-1))
     moecpu_var = ctk.StringVar(value=str(0))
@@ -5168,16 +5178,31 @@ def changerunmode(a,b,c):
     context_var.trace_add("write", changed_gpulayers_estimate)
     makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")

+    nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=146, singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.")
     customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=100, singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
     customrope_base_entry, customrope_base_label = makelabelentry(tokens_tab, "RoPE Base:", customrope_base, row=24, padx=100, singleline=True, tooltip="For NTK Aware Scaling. RoPE frequency base.")
     def togglerope(a,b,c):
-        items = [customrope_scale_label, customrope_scale_entry,customrope_base_label, customrope_base_entry]
-        for idx, item in enumerate(items):
-            if customrope_var.get() == 1:
-                item.grid()
-            else:
+        if customrope_var.get() == 1:
+            manualropebox.grid()
+            enabled_items = [customrope_scale_label, customrope_scale_entry,customrope_base_label, customrope_base_entry]
+            disabled_items = [nativectx_entry,nativectx_label]
+            for idx, item in enumerate(enabled_items):
+                if manualrope_var.get() == 1:
+                    item.grid()
+                else:
+                    item.grid_remove()
+            for idx, item in enumerate(disabled_items):
+                if manualrope_var.get() == 0:
+                    item.grid()
+                else:
+                    item.grid_remove()
+        else:
+            disabled_items = [manualropebox, nativectx_entry,nativectx_label, customrope_scale_label, customrope_scale_entry, customrope_base_label, customrope_base_entry]
+            for idx, item in enumerate(disabled_items):
                 item.grid_remove()
-    makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
+    manualropebox = makecheckbox(tokens_tab, "Manual Rope Scale", variable=manualrope_var, row=22, command=togglerope, padx=166, tooltiptxt="Set RoPE base and scale manually.")
+
+    makecheckbox(tokens_tab, "Custom RoPE Config", variable=customrope_var, row=22, command=togglerope,tooltiptxt="Override the default RoPE configuration with custom RoPE scaling.")
     makecheckbox(tokens_tab, "Use FlashAttention", flashattention_var, 28, command=toggleflashattn, tooltiptxt="Enable flash attention for GGUF models.")
     noqkvlabel = makelabel(tokens_tab,"(Note: QuantKV works best with flash attention)",28,0,"Only K cache can be quantized, and performance can suffer.\nIn some cases, it might even use more VRAM when doing a full offload.",padx=160)
     noqkvlabel.configure(text_color="#ff5555")
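The reworked togglerope leans on a Tkinter detail worth knowing: grid_remove() hides a widget but remembers its grid options, so a later bare grid() call restores it to the same cell. That is what lets the two checkbox states swap the "Override Native Context" entry and the manual scale/base entries in and out of the same rows. A standalone illustration (not code from the commit):

    import tkinter as tk

    root = tk.Tk()
    label = tk.Label(root, text="RoPE Base:")
    label.grid(row=0, column=0, padx=100)

    label.grid_remove()  # hidden, but row/column/padx stay remembered
    label.grid()         # restored to the same cell, no arguments needed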
@@ -5474,9 +5499,15 @@ def export_vars():
     args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
     args.contextsize = int(contextsize_text[context_var.get()])
     if customrope_var.get()==1:
-        args.ropeconfig = [float(customrope_scale.get()),float(customrope_base.get())]
+        if manualrope_var.get()==1:
+            args.ropeconfig = [float(customrope_scale.get()),float(customrope_base.get())]
+            args.overridenativecontext = 0
+        else:
+            args.ropeconfig = [0.0, 10000.0]
+            args.overridenativecontext = int(customrope_nativectx.get())
     else:
         args.ropeconfig = [0.0, 10000.0]
+        args.overridenativecontext = 0
     args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
     args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0
     args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
@@ -5679,13 +5710,24 @@ def import_vars(dict):
         blas_threads_var.set("")
     if "contextsize" in dict and dict["contextsize"]:
         context_var.set(contextsize_text.index(str(dict["contextsize"])))
-    if "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
+    if "overridenativecontext" in dict and dict["overridenativecontext"]>0:
+        customrope_var.set(1)
+        manualrope_var.set(0)
+        customrope_nativectx.set(str(dict["overridenativecontext"]))
+    elif "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
+        customrope_nativectx.set(default_native_ctx)
         if dict["ropeconfig"][0]>0:
             customrope_var.set(1)
+            manualrope_var.set(1)
             customrope_scale.set(str(dict["ropeconfig"][0]))
             customrope_base.set(str(dict["ropeconfig"][1]))
         else:
             customrope_var.set(0)
+            manualrope_var.set(0)
+    else:
+        customrope_nativectx.set(default_native_ctx)
+        customrope_var.set(0)
+        manualrope_var.set(0)
     if "moeexperts" in dict and dict["moeexperts"]:
         moeexperts_var.set(dict["moeexperts"])
     if "moecpu" in dict and dict["moecpu"]:
@@ -7462,7 +7504,6 @@ def range_checker(arg: str):
     parser.add_argument("--host", metavar=('[ipaddr]'), help="Host IP to listen on. If this flag is not set, all routable interfaces are accepted.", default="")
     parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
     parser.add_argument("--config", metavar=('[filename]'), help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1)
-
     parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--usecuda", "--usecublas", "--usehipblas", help="Use CUDA for GPU Acceleration. Requires CUDA. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
@@ -7478,14 +7519,15 @@ def range_checker(arg: str):
     advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
     advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
     advparser.add_argument("--maingpu", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1)
-    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
     advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
     advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
     advparser.add_argument("--lora", help="GGUF models only, applies a lora file on top of model.", metavar=('[lora_filename]'), nargs='+')
     advparser.add_argument("--loramult", metavar=('[amount]'), help="Multiplier for the Text LORA model to be applied.", type=float, default=1.0)
     advparser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')
     advparser.add_argument("--nofastforward", help="If set, do not attempt to fast forward GGUF context (always reprocess). Will also enable noshift", action='store_true')
     advparser.add_argument("--useswa", help="If set, allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", action='store_true')
+    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
+    advparser.add_argument("--overridenativecontext", help="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.",metavar=('[trained context]'), type=int, default=0)
     compatgroup3 = advparser.add_mutually_exclusive_group()
     compatgroup3.add_argument("--usemmap", help="If set, uses mmap to load model.", action='store_true')
     advparser.add_argument("--usemlock", help="Enables mlock, preventing the RAM used to load the model from being paged out. Not usually recommended.", action='store_true')
