Commit 9d0b834

CUDA: set compute parameters via command line arguments (#910)

Authored by ikawrakow (Iwan Kawrakow)

* cuda: set compute parameters via command line arguments
* Also llama-bench

Co-authored-by: Iwan Kawrakow <[email protected]>

1 parent 665434e

11 files changed: +144, -29 lines
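In short: the commit adds a new -cuda / --cuda-params flag that takes a comma-separated list of key=value pairs and threads it from the common argument parser (and llama-bench) through llama_context_params down to ggml_backend_cuda_init. How the CUDA backend interprets the individual keys is outside this diff.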

common/common.cpp

Lines changed: 9 additions & 2 deletions
@@ -1249,6 +1249,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "-cuda" || arg == "--cuda-params") {
+        CHECK_ARG
+        params.cuda_params = argv[i];
+        return true;
+    }
     if (arg == "--cpu-moe" || arg == "-cmoe") {
         params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps\\.weight"), ggml_backend_cpu_buffer_type()});
         return true;
@@ -2076,6 +2081,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "       --no-context-shift", "disable context-shift." });
     options.push_back({ "backend" });
     options.push_back({ "*", "       --rpc SERVERS",      "comma separated list of RPC servers" });
+    options.push_back({ "*", "-cuda, --cuda-params",      "comma separated list of CUDA parameters" });

     if (llama_supports_mlock()) {
         options.push_back({ "*", "       --mlock", "force system to keep model in RAM rather than swapping or compressing" });
@@ -2676,15 +2682,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     auto mparams = llama_model_params_from_gpt_params(params);

     llama_model * model = nullptr;
-
+
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
         model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
         model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
-
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -2914,6 +2920,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

     if (!params.offload_policy.empty()) cparams.offload_policy = (void *)&params.offload_policy;
+    if (!params.cuda_params.empty())    cparams.cuda_params    = (void *)params.cuda_params.data();

     return cparams;
 }

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -198,6 +198,7 @@ struct gpt_params {
     std::string logits_file = ""; // file for saving *all* logits
     std::string rpc_servers = ""; // comma separated list of RPC servers

+    std::string cuda_params = ""; // comma separated list of CUDA parameters, key1=value1,key2=value2

     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
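The comment above is the only statement of the parameter format in this commit. As an illustration only (a sketch, not the backend's actual parser, which is not part of this diff), splitting such a list into key/value pairs could look like:

#include <cstdio>
#include <map>
#include <sstream>
#include <string>

// Illustrative sketch: split "key1=value1,key2=value2" into key/value pairs.
// The real consumer of cuda_params is the CUDA backend; its parser is not
// shown in this commit.
static std::map<std::string, std::string> parse_kv_list(const std::string & s) {
    std::map<std::string, std::string> kv;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, ',')) {
        const size_t pos = item.find('=');
        if (pos == std::string::npos) continue; // skip malformed entries
        kv[item.substr(0, pos)] = item.substr(pos + 1);
    }
    return kv;
}

int main() {
    const auto kv = parse_kv_list("key1=value1,key2=value2");
    for (const auto & p : kv) {
        printf("%s = %s\n", p.first.c_str(), p.second.c_str());
    }
    return 0;
}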

examples/cvector-generator/pca.hpp

Lines changed: 1 addition & 1 deletion

@@ -66,7 +66,7 @@ struct pca_model {
     pca_model(struct ggml_tensor * t_input) {
 #ifdef GGML_USE_CUDA
         fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        backend = ggml_backend_cuda_init(0); // init device 0
+        backend = ggml_backend_cuda_init(0, nullptr); // init device 0
         if (!backend) {
             fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
         }

examples/llama-bench/llama-bench.cpp

Lines changed: 17 additions & 0 deletions
@@ -256,6 +256,7 @@ struct cmd_params {
     std::vector<bool> embeddings;
     std::vector<llama_model_tensor_buft_override> buft_overrides;
     ggml_numa_strategy numa;
+    std::string cuda_params;
     int reps;
     bool verbose;
     bool warmup;
@@ -295,6 +296,7 @@ static const cmd_params cmd_params_defaults = {
     /* embeddings     */ {false},
     /* buft_overrides */ {},
     /* numa           */ GGML_NUMA_STRATEGY_DISABLED,
+    /* cuda_params    */ {},
     /* reps           */ 5,
     /* verbose        */ false,
     /* warmup         */ true,
@@ -344,6 +346,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                         (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("  -w, --warmup <0|1>                    (default: %s)\n", cmd_params_defaults.warmup ? "1" : "0");
     printf("  -rtr, --run-time-repack <0|1>         (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
+    printf("  -cuda, --cuda-params <string>         (default: %s)\n", cmd_params_defaults.cuda_params.c_str());
     printf("  -mqkv, --merge-qkv                    (default: %s)\n", cmd_params_defaults.mqkv ? "1" : "0");
     printf("  -thp, --transparent-huge-pages <0|1>  (default: %s)\n", cmd_params_defaults.use_thp ? "1" : "0");
     printf("  -ot, --override-tensor pattern        (default: none)\n");
@@ -736,6 +739,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.repack = std::stoi(argv[i]);
+        } else if (arg == "-cuda" || arg == "--cuda-params") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cuda_params = argv[i];
         } else if (arg == "-mqkv" || arg == "--merge-qkv") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -852,6 +861,7 @@ struct cmd_params_instance {
     int attn_max_batch;
     Ser ser;
     std::vector<float> tensor_split;
+    std::string cuda_params;
     bool use_mmap;
     bool embeddings;
    bool repack = false;
@@ -914,6 +924,7 @@ struct cmd_params_instance {
         cparams.min_experts    = ser.first;
         cparams.thresh_experts = ser.second;
         cparams.embeddings     = embeddings;
+        cparams.cuda_params    = (void *)cuda_params.data();

         return cparams;
     }
@@ -965,6 +976,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     /* .attn_max_b   = */ amb,
     /* .ser          = */ ser,
     /* .tensor_split = */ ts,
+    /* .cuda_params  = */ params.cuda_params,
     /* .use_mmap     = */ mmp,
     /* .embeddings   = */ embd,
     /* .repack       = */ params.repack,

@@ -1003,6 +1015,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     /* .attn_max_b   = */ amb,
     /* .ser          = */ ser,
     /* .tensor_split = */ ts,
+    /* .cuda_params  = */ params.cuda_params,
     /* .use_mmap     = */ mmp,
     /* .embeddings   = */ embd,
     /* .repack       = */ params.repack,

@@ -1041,6 +1054,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     /* .attn_max_b   = */ amb,
     /* .ser          = */ ser,
     /* .tensor_split = */ ts,
+    /* .cuda_params  = */ params.cuda_params,
     /* .use_mmap     = */ mmp,
     /* .embeddings   = */ embd,
     /* .repack       = */ params.repack,

@@ -1079,6 +1093,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     /* .attn_max_b   = */ amb,
     /* .ser          = */ ser,
     /* .tensor_split = */ ts,
+    /* .cuda_params  = */ params.cuda_params,
     /* .use_mmap     = */ mmp,
     /* .embeddings   = */ embd,
     /* .repack       = */ params.repack,
@@ -1128,6 +1143,7 @@ struct test {
     int attn_max_batch;
     Ser ser;
     std::vector<float> tensor_split;
+    std::string cuda_params;
     bool use_mmap;
     bool embeddings;
     bool repack = false;
@@ -1166,6 +1182,7 @@ struct test {
         attn_max_batch = inst.attn_max_batch;
         ser            = inst.ser;
         tensor_split   = inst.tensor_split;
+        cuda_params    = inst.cuda_params;
         use_mmap       = inst.use_mmap;
         embeddings     = inst.embeddings;
         repack         = inst.repack;
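With this plumbing in place, llama-bench accepts the flag the same way the other tools do, e.g. llama-bench -m model.gguf -cuda key1=value1,key2=value2 (the keys are placeholders; which parameters the backend recognizes is not defined in this diff), and each test records the string alongside the other benchmark parameters.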

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES 16

 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params);

 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
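For callers outside the tree, the new second argument is either nullptr for default compute parameters (as in the pca.hpp change above) or a pointer to the comma-separated parameter string, mirroring what llama_context_params_from_gpt_params passes. A minimal sketch, assuming a placeholder key1=value1 string (the recognized keys are backend-defined and not shown in this diff):

#include <cstdio>
#include "ggml-cuda.h"

int main() {
    // nullptr would request default compute parameters (see the pca.hpp hunk
    // above); a non-null pointer is interpreted by the backend as a
    // comma-separated key=value list. "key1=value1" is a placeholder, not a
    // documented key.
    const char * cuda_params = "key1=value1";
    ggml_backend_t backend = ggml_backend_cuda_init(/*device=*/0, cuda_params);
    if (!backend) {
        fprintf(stderr, "ggml_backend_cuda_init() failed\n");
        return 1;
    }
    ggml_backend_free(backend); // declared in ggml-backend.h
    return 0;
}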
