Commit 1cbe716: allow setting maingpu

1 parent: 7a688e0

File tree: 3 files changed (+49, -28 lines)


expose.h (5 additions & 5 deletions)

```diff
@@ -52,7 +52,7 @@ struct load_model_inputs
     const bool use_contextshift = false;
     const bool use_fastforward = false;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int blasbatchsize = 512;
     const int forceversion = 0;
@@ -157,7 +157,7 @@ struct sd_load_model_inputs
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int threads = 0;
     const int quant = 0;
@@ -199,7 +199,7 @@ struct whisper_load_model_inputs
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const bool quiet = false;
     const int debugmode = 0;
@@ -224,7 +224,7 @@ struct tts_load_model_inputs
     const char * cts_model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
@@ -252,7 +252,7 @@ struct embeddings_load_model_inputs
     const char * model_filename = nullptr;
    const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
```
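The same one-line rename appears in all five `*_load_model_inputs` structs because each one is mirrored by a `ctypes.Structure` in koboldcpp.py (see that file's diff below); field names, order, and types must stay in sync on both sides of the FFI boundary. A minimal sketch of the pairing, abridged to the fields around the rename:

```python
import ctypes

# Python-side mirror of the C struct in expose.h (abridged).
# The rename is layout-neutral: an int stays an int at the same
# offset, so only the names need to change together.
class load_model_inputs(ctypes.Structure):
    _fields_ = [
        ("clblast_info", ctypes.c_int),
        ("kcpp_main_gpu", ctypes.c_int),  # was ("cublas_info", ctypes.c_int)
        ("vulkan_info", ctypes.c_char_p),
    ]
```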

gpttype_adapter.cpp (8 additions & 8 deletions)

```diff
@@ -2020,16 +2020,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     //this is used for the mem_per_token eval, blas needs more RAM
     bool v3_use_scratch = ggml_v3_cpu_has_gpublas();
 
-    int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info;
+    int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
 
     printf("System Info: %s\n", kcpp_print_system_info());
     #if defined(GGML_USE_CUDA)
     if(file_format!=FileFormat::GGUF_GENERIC)
     {
-        if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
+        if(ggml_v3_cpu_has_gpublas() && kcpp_parseinfo_maindevice>0)
         {
-            printf("CUBLAS v3: Set main device to %d\n",cu_parseinfo_maindevice);
-            ggml_v3_cuda_set_main_device(cu_parseinfo_maindevice);
+            printf("CUBLAS v3: Set main device to %d\n",kcpp_parseinfo_maindevice);
+            ggml_v3_cuda_set_main_device(kcpp_parseinfo_maindevice);
         }
     }
 
@@ -2092,7 +2092,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.use_mmap = inputs.use_mmap;
     llama_ctx_params.use_mlock = inputs.use_mlock;
     llama_ctx_params.n_gpu_layers = inputs.gpulayers;
-    llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+    llama_ctx_params.main_gpu = kcpp_parseinfo_maindevice;
     llama_ctx_params.rope_freq_base = rope_freq_base;
     llama_ctx_params.rope_freq_scale = rope_freq_scale;
     llama_ctx_params.n_batch = kcpp_data->n_batch;
@@ -2178,9 +2178,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     #endif
     #if defined(GGML_USE_CUDA)
-    if(cu_parseinfo_maindevice>0)
+    if(kcpp_parseinfo_maindevice>0)
     {
-        printf("CUDA: Set main device to %d\n",cu_parseinfo_maindevice);
+        printf("CUDA: Set main device to %d\n",kcpp_parseinfo_maindevice);
     }
     printf("CUDA MMQ: %s\n",(inputs.use_mmq?"True":"False"));
     ggml_cuda_set_mul_mat_q(inputs.use_mmq);
@@ -2194,7 +2194,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         printf("Qwen2VL detected! Mrope will be used, and context shift will be disabled!\n");
         kcpp_data->use_contextshift = false;
     }
-    model_params.main_gpu = cu_parseinfo_maindevice;
+    model_params.main_gpu = kcpp_parseinfo_maindevice;
 
     #if defined(GGML_USE_CUDA)
     model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
```
koboldcpp.py (36 additions & 15 deletions)

```diff
@@ -178,7 +178,7 @@ class load_model_inputs(ctypes.Structure):
                 ("use_contextshift", ctypes.c_bool),
                 ("use_fastforward", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("blasbatchsize", ctypes.c_int),
                 ("forceversion", ctypes.c_int),
@@ -262,7 +262,7 @@ class sd_load_model_inputs(ctypes.Structure):
     _fields_ = [("model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("threads", ctypes.c_int),
                 ("quant", ctypes.c_int),
@@ -300,7 +300,7 @@ class whisper_load_model_inputs(ctypes.Structure):
     _fields_ = [("model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
@@ -321,7 +321,7 @@ class tts_load_model_inputs(ctypes.Structure):
                 ("cts_model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("gpulayers", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
@@ -345,7 +345,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
                 ("model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("gpulayers", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
```
```diff
@@ -566,32 +566,39 @@ def set_backend_props(inputs):
 
     # we must force an explicit tensor split
     # otherwise the default will divide equally and multigpu crap will slow it down badly
-    inputs.cublas_info = 0
+    inputs.kcpp_main_gpu = 0
+    if(args.maingpu is not None and args.maingpu>=0):
+        inputs.kcpp_main_gpu = args.maingpu
 
     if args.usecublas:
         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
         if not args.tensor_split:
             if (args.usecublas and "0" in args.usecublas):
                 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
                 os.environ["HIP_VISIBLE_DEVICES"] = "0"
+                inputs.kcpp_main_gpu = 0
             elif (args.usecublas and "1" in args.usecublas):
                 os.environ["CUDA_VISIBLE_DEVICES"] = "1"
                 os.environ["HIP_VISIBLE_DEVICES"] = "1"
+                inputs.kcpp_main_gpu = 0
             elif (args.usecublas and "2" in args.usecublas):
                 os.environ["CUDA_VISIBLE_DEVICES"] = "2"
                 os.environ["HIP_VISIBLE_DEVICES"] = "2"
+                inputs.kcpp_main_gpu = 0
             elif (args.usecublas and "3" in args.usecublas):
                 os.environ["CUDA_VISIBLE_DEVICES"] = "3"
                 os.environ["HIP_VISIBLE_DEVICES"] = "3"
+                inputs.kcpp_main_gpu = 0
         else:
-            if (args.usecublas and "0" in args.usecublas):
-                inputs.cublas_info = 0
-            elif (args.usecublas and "1" in args.usecublas):
-                inputs.cublas_info = 1
-            elif (args.usecublas and "2" in args.usecublas):
-                inputs.cublas_info = 2
-            elif (args.usecublas and "3" in args.usecublas):
-                inputs.cublas_info = 3
+            if(args.maingpu is None or args.maingpu<0):
+                if (args.usecublas and "0" in args.usecublas):
+                    inputs.kcpp_main_gpu = 0
+                elif (args.usecublas and "1" in args.usecublas):
+                    inputs.kcpp_main_gpu = 1
+                elif (args.usecublas and "2" in args.usecublas):
+                    inputs.kcpp_main_gpu = 2
+                elif (args.usecublas and "3" in args.usecublas):
+                    inputs.kcpp_main_gpu = 3
 
     if args.usevulkan: #is an empty array if using vulkan without defined gpu
         s = ""
```
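The precedence that falls out of this hunk: an explicit `--maingpu` is applied first; if a single CUDA/HIP device is then pinned through `CUDA_VISIBLE_DEVICES`, the only visible device has index 0, so `kcpp_main_gpu` is reset to 0 regardless; only in the tensor-split path does the `--usecublas` device digit act as a fallback, and only when `--maingpu` was not given. A condensed, standalone approximation of that logic (the helper name and parameters are hypothetical, not the real function):

```python
def pick_main_gpu(maingpu, usecublas_device, tensor_split):
    """Approximation of set_backend_props' main-GPU precedence.

    maingpu          -- value of --maingpu, or None/-1 when unset
    usecublas_device -- the '0'..'3' digit given to --usecublas, or None
    tensor_split     -- truthy when a multi-GPU tensor split is active
    """
    main_gpu = 0
    if maingpu is not None and maingpu >= 0:
        main_gpu = maingpu  # explicit flag applied first
    if usecublas_device is not None:
        if not tensor_split:
            # A single device is pinned via CUDA_VISIBLE_DEVICES, so the
            # only visible device is index 0 regardless of its bus ID.
            main_gpu = 0
        elif maingpu is None or maingpu < 0:
            # Multi-GPU split without --maingpu: fall back to the digit.
            main_gpu = int(usecublas_device)
    return main_gpu
```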
```diff
@@ -4201,6 +4208,7 @@ def hide_tooltip(event):
     version_var = ctk.StringVar(value="0")
     tensor_split_str_vars = ctk.StringVar(value="")
     rowsplit_var = ctk.IntVar()
+    maingpu_var = ctk.StringVar(value="")
 
     contextshift_var = ctk.IntVar(value=1)
     fastforward_var = ctk.IntVar(value=1)
@@ -4676,13 +4684,17 @@ def changerunmode(a,b,c):
             quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
             CUDA_gpu_selector_box.grid_remove()
             CUDA_quick_gpu_selector_box.grid_remove()
+            maingpu_label.grid_remove()
+            maingpu_entry.grid_remove()
             if gpu_choice_var.get()=="All":
                 gpu_choice_var.set("1")
         elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_selector_box.grid_remove()
             quick_gpu_selector_box.grid_remove()
             CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
             CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
+            maingpu_label.grid(row=10, column=0, padx = 8, pady=1, stick="nw")
+            maingpu_entry.grid(row=10, column=1, padx = 8, pady=1, stick="nw")
         else:
             quick_gpuname_label.grid_remove()
             gpuname_label.grid_remove()
@@ -4692,6 +4704,8 @@ def changerunmode(a,b,c):
             quick_gpu_selector_label.grid_remove()
             quick_gpu_selector_box.grid_remove()
             CUDA_quick_gpu_selector_box.grid_remove()
+            maingpu_label.grid_remove()
+            maingpu_entry.grid_remove()
 
         if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
@@ -4803,6 +4817,8 @@ def changerunmode(a,b,c):
     mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")
     splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.")
 
+    maingpu_entry,maingpu_label = makelabelentry(hardware_tab, "Main GPU:" , maingpu_var, 10, 50,tooltip="Only for multi-gpu, which GPU to set as main?\nIf left blank, uses default value.")
+
     # threads
     makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
```
```diff
@@ -5140,7 +5156,7 @@ def export_vars():
         else:
             args.draftgpusplit = [float(x) for x in tssv.split(" ")]
 
-
+    args.maingpu = -1 if maingpu_var.get()=="" else int(maingpu_var.get())
     args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
     args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
     args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
@@ -5330,6 +5346,10 @@ def import_vars(dict):
         gpulayers_var.set(dict["gpulayers"])
     else:
         gpulayers_var.set("0")
+    if "maingpu" in dict:
+        maingpu_var.set(dict["maingpu"])
+    else:
+        maingpu_var.set("")
     if "tensor_split" in dict and dict["tensor_split"]:
         tssep = ','.join(map(str, dict["tensor_split"]))
         tensor_split_str_vars.set(tssep)
```
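Both helpers treat an empty GUI field as the -1 "unset" sentinel, so a saved config round-trips cleanly and older configs without a `maingpu` key simply leave the box blank. A small standalone illustration (function names are hypothetical, not the real helpers):

```python
def export_maingpu(text: str) -> int:
    # Empty entry box -> -1, matching the --maingpu argparse default.
    return -1 if text == "" else int(text)

def import_maingpu(saved: dict) -> str:
    # Older saved configs have no "maingpu" key; fall back to blank.
    return str(saved["maingpu"]) if "maingpu" in saved else ""

assert export_maingpu("") == -1
assert import_maingpu({}) == ""
assert export_maingpu(import_maingpu({"maingpu": 1})) == 1
```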
```diff
@@ -7076,6 +7096,7 @@ def range_checker(arg: str):
     advparser = parser.add_argument_group('Advanced Commands')
     advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
     advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
+    advparser.add_argument("--maingpu", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1)
     advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
     advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
     advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
```
