Commit f4dc3a3

Customizable Norm RMS Epsilon

1 parent b3340df commit f4dc3a3

3 files changed: +21 −0 lines changed

expose.h

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ struct load_model_inputs
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
     const int moe_experts = -1;
+    const float norm_rms_eps = -1.0f;
     const bool flash_attention = false;
     const float tensor_split[tensor_split_max] = {};
     const int quant_k = 0;
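Because this struct is marshalled across the C/Python boundary with ctypes, the new field only takes effect if the Python side declares it at the same position with a matching type; the corresponding koboldcpp.py change appears further below. A minimal sketch of the pairing, with the surrounding fields elided for brevity:

    import ctypes

    # Sketch: Python mirror of the expose.h fragment above. Field order and
    # types must match the C struct exactly, or every later field is misread.
    class load_model_inputs(ctypes.Structure):
        _fields_ = [
            ("moe_experts", ctypes.c_int),       # existing field
            ("norm_rms_eps", ctypes.c_float),    # new field in this commit
            ("flash_attention", ctypes.c_bool),  # existing field
        ]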

gpttype_adapter.cpp

Lines changed: 12 additions & 0 deletions

@@ -2324,6 +2324,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         kvos.push_back(kvo);
         model_params.kv_overrides = kvos.data();
     }
+    if(inputs.norm_rms_eps>0)
+    {
+        printf("\nOverriding norm rms epsilon to %f\n",inputs.norm_rms_eps);
+        llama_model_kv_override kvo;
+        const char * rmskey = "llama.attention.layer_norm_rms_epsilon";
+        std::strncpy(kvo.key, rmskey, sizeof(kvo.key) - 1);
+        kvo.key[sizeof(kvo.key) - 1] = '\0'; // Ensure null termination
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = inputs.norm_rms_eps;
+        kvos.push_back(kvo);
+        model_params.kv_overrides = kvos.data();
+    }
     llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);

     if(overwriteRope)
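For context, the overridden key llama.attention.layer_norm_rms_epsilon is the epsilon inside RMSNorm, which rescales each activation vector by its root mean square before the learned weights are applied. A rough plain-Python illustration of where the value enters (the real computation lives in llama.cpp's GGML kernels, not here):

    # Rough sketch of RMSNorm: eps keeps the divisor away from zero, so a
    # larger value damps the rescaling of near-zero activations.
    def rms_norm(x, weight, eps=1e-5):
        rms = (sum(v * v for v in x) / len(x) + eps) ** 0.5
        return [w * v / rms for w, v in zip(weight, x)]

Because llama.cpp's kv_overrides replace metadata values at load time, this lets a heavily quantized model run with a different epsilon without editing the GGUF file itself.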

koboldcpp.py

Lines changed: 8 additions & 0 deletions

@@ -172,6 +172,7 @@ class load_model_inputs(ctypes.Structure):
                 ("rope_freq_scale", ctypes.c_float),
                 ("rope_freq_base", ctypes.c_float),
                 ("moe_experts", ctypes.c_int),
+                ("norm_rms_eps", ctypes.c_float),
                 ("flash_attention", ctypes.c_bool),
                 ("tensor_split", ctypes.c_float * tensor_split_max),
                 ("quant_k", ctypes.c_int),

@@ -1419,6 +1420,7 @@ def load_model(model_filename):
         inputs.tensor_split[n] = 0

     inputs.moe_experts = args.moeexperts
+    inputs.norm_rms_eps = args.normrmseps
     inputs = set_backend_props(inputs)

     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")

@@ -3322,6 +3324,7 @@ def hide_tooltip(event):
     customrope_base = ctk.StringVar(value="10000")
     chatcompletionsadapter_var = ctk.StringVar()
     moeexperts_var = ctk.StringVar(value=str(-1))
+    normrmseps_var = ctk.StringVar(value=str(-1.0))

     model_var = ctk.StringVar()
     lora_var = ctk.StringVar()

@@ -3836,6 +3839,7 @@ def togglerope(a,b,c):
     makefileentry(tokens_tab, "Model:", "Select GGML or GGML Model File", model_var, 50, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
     model_var.trace("w", gui_changed_modelfile)
     makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=35, padx=100, singleline=True, tooltip="Override number of MoE experts.")
+    makelabelentry(tokens_tab, "Norm RMS Epsilon:", normrmseps_var, row=38, padx=150, singleline=True, tooltip="Override Norm RMS Epsilon value to use for the model.\nUseful for <2bpw quants mainly.\nExample of format: 1.95e-05")

     togglerope(1,1,1)
     # toggleflashattn(1,1,1)

@@ -4119,6 +4123,7 @@ def export_vars():
     if customrope_var.get()==1:
         args.ropeconfig = [float(customrope_scale.get()),float(customrope_base.get())]
     args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
+    args.normrmseps = float(normrmseps_var.get()) if normrmseps_var.get()!="" else -1.0
     args.chatcompletionsadapter = None if chatcompletionsadapter_var.get() == "" else chatcompletionsadapter_var.get()
     try:
         if kcpp_exporting_template and isinstance(args.chatcompletionsadapter, str) and args.chatcompletionsadapter!="" and os.path.exists(args.chatcompletionsadapter):

@@ -4292,6 +4297,8 @@ def import_vars(dict):
         customrope_var.set(0)
     if "moeexperts" in dict and dict["moeexperts"]:
         moeexperts_var.set(dict["moeexperts"])
+    if "normrmseps" in dict and dict["normrmseps"]:
+        normrmseps_var.set(dict["normrmseps"])

     if "blasbatchsize" in dict and dict["blasbatchsize"]:
         blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"])))

@@ -5739,6 +5746,7 @@ def range_checker(arg: str):
     advparser.add_argument("--unpack", help="Extracts the file contents of the KoboldCpp/Croco.Cpp binary into a target directory.", metavar=('destination'), type=str, default="")
     advparser.add_argument("--nomodel", help="Allows you to launch the GUI alone, without selecting any model.", action='store_true')
     advparser.add_argument("--moeexperts", metavar=('[num of experts]'), help="How many experts to use for MoE models (default=follow gguf)", type=int, default=-1)
+    advparser.add_argument("--normrmseps", metavar=('[norm rms eps]'), help="Override Norm RMS Epsilon value to use for the model. Useful for <2bpw quants mainly. Example of format: 1.95e-05 (default=follow gguf)", type=float, default=-1.0)
     advparser.add_argument("--poslayeroffset", help="Removes or adds a layer to the GPU layers autoloader calculation in case of OOM or under-exploitation.", type=check_range(int,0,10), default=0)
     advparser.add_argument("--neglayeroffset", help="Removes or adds a layer to the GPU layers autoloader calculation in case of OOM or under-exploitation.", type=check_range(int,0,10), default=0)
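With the setting plumbed through end to end, it can be supplied either from the new GUI field or on the command line. A hypothetical invocation (the model filename is only an example):

    python koboldcpp.py --model mymodel.gguf --normrmseps 1.95e-05

Leaving the flag at its default of -1.0 keeps the epsilon stored in the GGUF metadata, since gpttype_adapter.cpp only installs the override when the value is positive.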
