Commit 13c6690

Revert "set number of experts only"
This reverts commit 6bcd3b9.
1 parent 6bcd3b9 commit 13c6690

File tree: expose.h, gpttype_adapter.cpp, koboldcpp.py, otherarch/otherarch.h

4 files changed: +1 -64 lines changed


expose.h

Lines changed: 0 additions & 1 deletion
@@ -50,7 +50,6 @@ struct load_model_inputs
 const char * vulkan_info = nullptr;
 const int blasbatchsize = 128;
 const int blasubatchsize = 128;
-const int experts_used = 0;
 const int debugmode = 0;
 const int forceversion = 0;
 const int gpulayers = 0;

gpttype_adapter.cpp

Lines changed: 0 additions & 6 deletions
@@ -52,7 +52,6 @@ std::string executable_path = "";
 std::string lora_filename = "";
 std::string lora_base = "";
 std::string mmproj_filename = "";
-
 bool generation_finished;
 float last_process_time = 0;
 float last_eval_time = 0;
@@ -1690,7 +1689,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 kcpp_data->n_ubatch = GetUBatchSize(inputs.blasubatchsize, in_file_format);
 kcpp_data->flash_attn = inputs.flash_attention;
 kcpp_data->model_filename = inputs.model_filename;
-kcpp_data->n_experts_used = inputs.experts_used;
 kcpp_data->use_smartcontext = inputs.use_smartcontext;
 kcpp_data->use_contextshift = inputs.use_contextshift;
 debugmode = inputs.debugmode;
@@ -1900,7 +1898,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_model_params model_params = llama_model_default_params();
 llama_context_params llama_ctx_params = llama_context_default_params();
 llama_ctx_params.n_ctx = clamped_max_context_length;
-
 if(kcpp_data->use_contextshift)
 {
 llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
@@ -1975,9 +1972,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 
 llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);
-
-llamamodel->hparams.n_expert_used = kcpp_data->n_experts_used;
-
 if(overwriteRope)
 {
 llama_ctx_params.rope_freq_base = rope_freq_base;

koboldcpp.py

Lines changed: 1 addition & 56 deletions
@@ -130,7 +130,6 @@ class load_model_inputs(ctypes.Structure):
 ("mmproj_filename", ctypes.c_char_p),
 ("use_mmap", ctypes.c_bool),
 ("use_mlock", ctypes.c_bool),
-("experts_used", ctypes.c_int),
 ("use_smartcontext", ctypes.c_bool),
 ("use_contextshift", ctypes.c_bool),
 ("clblast_info", ctypes.c_int),
@@ -1120,7 +1119,6 @@ def load_model(model_filename):
 inputs.blasthreads = args.blasthreads
 inputs.use_mmap = (not args.nommap)
 inputs.use_mlock = args.usemlock
-inputs.experts_used = args.experts_used
 inputs.lora_filename = "".encode("UTF-8")
 inputs.lora_base = "".encode("UTF-8")
 if args.lora:
@@ -2691,7 +2689,7 @@ def hide_tooltip(event):
 
 tabs = ctk.CTkFrame(root, corner_radius = 0, width=windowwidth, height=windowheight-50)
 tabs.grid(row=0, stick="nsew")
-tabnames= ["Quick Launch", "Hardware", "Tokens", "GPU AutoLayers", "Model Files", "Network", "Horde Worker","Image Gen","Audio","Extra", "Croco"]
+tabnames= ["Quick Launch", "Hardware", "Tokens", "GPU AutoLayers", "Model Files", "Network", "Horde Worker","Image Gen","Audio","Extra"]
 navbuttons = {}
 navbuttonframe = ctk.CTkFrame(tabs, width=100, height=int(tabs.cget("height")))
 navbuttonframe.grid(row=0, column=0, padx=2,pady=2)
@@ -2767,8 +2765,6 @@ def hide_tooltip(event):
 
 blasubatchsize_var = ctk.IntVar()
 
-experts_used_var = ctk.StringVar(value="0")
-
 version_var = ctk.StringVar(value="0")
 tensor_split_str_vars = ctk.StringVar(value="")
 rowsplit_var = ctk.IntVar()
@@ -3242,8 +3238,6 @@ def changerunmode(a,b,c):
 
 tensor_split_entry,tensor_split_label = makelabelentry(gpu_al_tab, "Tensor Split:", tensor_split_str_vars, 8, 160, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')
 
-makelabelentry(gpu_al_tab, "Opt. model metadata KV override:", experts_used_var, 26, 420, tooltip="Supersede metadata of a model, like Epislon _ e.g : llama.attention.layer_norm_rms_epsilon=float:1e5, 1.25e5, 3e6, etc.")
-
 # load model
 makefileentry(gpu_al_tab, "Model:", "Select GGML Model File", model_var, 40, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
 
@@ -3275,8 +3269,6 @@ def togglerope(a,b,c):
 noqkvlabel.configure(text_color="#ff5555")
 qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 22, 30, set=0,tooltip="Enable quantization of KV cache (KVQ). Mode 0 (F16) is default. Modes 1-12 requires FlashAttention and disables ContextShift.\nModes 15-20 work without FA, for incompatible models. 0,13,14 can work with or without.")
 
-
-
 # load model
 makefileentry(tokens_tab, "Model:", "Select GGML or GGML Model File", model_var, 50, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
 
@@ -3432,40 +3424,6 @@ def kcpp_export_template():
 makelabel(extra_tab, "Export as launcher .kcppt template (Expert Only)", 4, 0,tooltiptxt="Creates a KoboldCpp launch template for others to use.\nEmbeds JSON files directly into exported file when saving.\nWhen loaded, forces the backend to be automatically determined.\nWarning! Not recommended for beginners!")
 ctk.CTkButton(extra_tab , text = "Generate LaunchTemplate", command = kcpp_export_template ).grid(row=5,column=0, stick="w", padx= 8, pady=2)
 
-# croco tab
-croco_tab = tabcontent["Croco"]
-
-# makelabelentry(croco_tab, "Context Size:" , context_var, 2, 160,tooltip="How many threads to use during BLAS processing.\nIf left blank, uses same value as regular thread count.")
-
-makelabelentry(croco_tab, "Threads:" , threads_var, 4, 80,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
-
-makelabelentry(croco_tab, "BLAS threads:" , blas_threads_var, 6, 80,tooltip="How many threads to use during BLAS processing.\nIf left blank, uses same value as regular thread count.")
-
-# makelabelentry(croco_tab, "Logical Blas Batch Size:" , blas_size_var, 8, 160,tooltip="How many tokens to process at once per batch.\nLarger values use more memory unless Physical Batch supersedes it.")
-
-# makelabelentry(croco_tab, "Physical Blas Batch Size:" , blasubatchsize_var, 10, 160,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
-
-makelabelentry(croco_tab, "GPU Layers:", gpulayers_var, 12, 80,tooltip="How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.\n\nCommon values for total layers, accuracy not guaranteed.\n\nLlama/Mistral 7b/8b: 33\nSolar 10.7b/11b: 49\nLlama 13b: 41\nLlama 20b(stack): 63\nLlama/Yi 34b: 61\nMixtral 8x7b: 33\nLlama 70b: 81")
-
-makelabelentry(croco_tab, "Positive Layer offset:", poslayeroffset_var, 14, 80, tooltip="Adds layers to the GPU layers autoloader calculation in case of under-exploitation of your GPU(s)..")
-
-makelabelentry(croco_tab, "Negative Layer Offset:", neglayeroffset_var, 16, 80, tooltip="Removes layers to the GPU layers autoloader calculation in case of Out of Memory (OOM) error..")
-
-makelabelentry(croco_tab, "Tensor Split:", tensor_split_str_vars, 18, 280, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')
-
-makelabelentry(croco_tab, "RoPE Scale:", customrope_scale, 20, 80, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
-
-makelabelentry(croco_tab, "RoPE Base:", customrope_base, 22, 160, tooltip="For NTK Aware Scaling. RoPE frequency base.")
-
-makelabelentry(croco_tab, "Quantize KV Cache:", quantkv_var, 24, 80, tooltip="Enable quantization of KV cache (KVQ). Mode 0 (F16) is default. Modes 1-12 requires FlashAttention and disables ContextShift.\nModes 15-20 work without FA, for incompatible models. 0,13,14 can work with or without.")
-
-# makelabelentry(croco_tab, "Opt. model metadata KV override:", kv_override_var, 26, 420, tooltip="Supersede metadata of a model, like Epislon _ e.g : llama.attention.layer_norm_rms_epsilon=float:1e5, 1.25e5, 3e6, etc.")
-
-makefileentry(croco_tab, "Model:", "Select GGML or GGML Model File", model_var, 28, 576, onchoosefile=on_picked_model_file, filetypes=[("GGML bin or GGUF", ("*.bin","*.gguf"))] ,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
-model_var.trace("w", gui_changed_modelfile)
-
-ctk.CTkButton(croco_tab, text = "Run Benchmark", command = guibench ).grid(row=32, stick="se", padx= 0, pady=0)
-
 # launch
 def guilaunch():
 if model_var.get() == "" and sd_model_var.get() == "" and whisper_model_var.get() == "" and nomodel.get()!=1:
@@ -3563,8 +3521,6 @@ def export_vars():
 args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
 
 args.blasubatchsize = int(blasubatchsize_values[int(blasubatchsize_var.get())])
-
-args.experts_used = None if experts_used_var.get() == "" else int(experts_used_var.get())
 
 args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
 
@@ -3719,10 +3675,6 @@ def import_vars(dict):
 blas_threads_var.set(str(dict["blasthreads"]))
 else:
 blas_threads_var.set("")
-if "experts_used" in dict and dict["experts_used"]:
-experts_used_var.set(str(dict["experts_used"]))
-else:
-experts_used_var.set("0")
 if "contextsize" in dict and dict["contextsize"]:
 context_var.set(contextsize_text.index(str(dict["contextsize"])))
 if "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
@@ -4680,9 +4632,6 @@ def main(launch_args,start_server=True):
 if not args.blasthreads or args.blasthreads <= 0:
 args.blasthreads = args.threads
 
-if not args.experts_used or args.experts_used <= 0:
-args.experts_used = 0
-
 modelname = os.path.abspath(args.model_param)
 print(args)
 # Flush stdout for win32 issue with regards to piping in terminals,
@@ -5046,9 +4995,6 @@ def range_checker(arg: str):
 advparser.add_argument("--blasubatchsize", help="Sets the Physical batch size used in BLAS processing (default 128 for VRAM savings, optimal speed is 512, 256 is a great compromise). Setting it to 0 alignes Physical BLAS batch on logical BLAS. Same steps as for logical BBS.", type=check_range(int,0,4096), default=0)
 
 advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
-
-advparser.add_argument("--experts_used", help="Supersede metadata of a model, like Epislon (e.g : llama.attention.layer_norm_rms_epsilon=float:1e5, 1.25e5, 3e6, etc)", metavar=('[experts_used]'), type=int, default=0)
-
 advparser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+')
 advparser.add_argument("--contextshift", help="If set, do attempt to Trim and Shift the GGUF context without reprocessing everything once the max context is reached. If you disable it (or need to use Quantized KV cache (KVQ) with FlashAttention, aka. modes 1 to 14, which are incompatible with Context Shift), you can eventually use --smartcontext instead.", action='store_true')
 advparser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
@@ -5068,7 +5014,6 @@ def range_checker(arg: str):
 advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
 advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
 advparser.add_argument("--mmproj", help="Select a multimodal projector file for LLaVA.", default="")
-
 advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
 advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
 advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")

otherarch/otherarch.h

Lines changed: 0 additions & 1 deletion
@@ -23,7 +23,6 @@ struct kcpp_params {
 int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
 int n_threads = -1;
 int n_blasthreads = -1;
-int n_experts_used = 0;
 
 // sampling parameters
 int32_t top_k = 40; // <= 0 to use vocab size
