11 changes: 9 additions & 2 deletions common/common.cpp
@@ -1249,6 +1249,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "-cuda" || arg == "--cuda-params") {
CHECK_ARG
params.cuda_params = argv[i];
return true;
}
if (arg == "--cpu-moe" || arg == "-cmoe") {
params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps\\.weight"), ggml_backend_cpu_buffer_type()});
return true;
@@ -2076,6 +2081,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-context-shift", "disable context-shift." });
options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
options.push_back({ "*", "-cuda, --cuda-params", "comma separate list of cuda parameters" });

if (llama_supports_mlock()) {
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
@@ -2676,15 +2682,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);

llama_model * model = nullptr;

if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}

if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
@@ -2914,6 +2920,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

if (!params.offload_policy.empty()) cparams.offload_policy = (void *)&params.offload_policy;
if (!params.cuda_params.empty()) cparams.cuda_params = (void *)params.cuda_params.data();

return cparams;
}
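The new setting travels as a plain `std::string` inside `gpt_params` and reaches the context as an opaque pointer (`cparams.cuda_params = (void *)params.cuda_params.data();` above), so the pointer borrows storage owned by `gpt_params` and stays null when no `-cuda` argument was given. A minimal sketch of how a consumer could recover the string; the helper is illustrative and not part of this change:

```cpp
#include <string>

// Illustrative only: common.cpp stores a NUL-terminated std::string buffer
// into cparams.cuda_params as a void *, or leaves the field nullptr when no
// CUDA parameters were supplied. A consumer can turn it back into a string.
static std::string cuda_params_to_string(const void * cuda_params) {
    return cuda_params ? std::string(static_cast<const char *>(cuda_params))
                       : std::string();
}
```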
1 change: 1 addition & 0 deletions common/common.h
@@ -198,6 +198,7 @@ struct gpt_params {
std::string logits_file = ""; // file for saving *all* logits
std::string rpc_servers = ""; // comma separated list of RPC servers

std::string cuda_params = ""; // comma separated list of CUDA parameters, e.g. key1=value1,key2=value2

std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
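The comment above fixes the expected format as `key1=value1,key2=value2`; which keys the CUDA backend actually understands is not part of this diff. A minimal parsing sketch, assuming keys and values contain no embedded commas or `=` (the function name is hypothetical):

```cpp
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Split "key1=value1,key2=value2" into (key, value) pairs.
// Entries without '=' are kept with an empty value.
static std::vector<std::pair<std::string, std::string>>
parse_kv_list(const std::string & s) {
    std::vector<std::pair<std::string, std::string>> out;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, ',')) {
        if (item.empty()) continue;
        const size_t eq = item.find('=');
        if (eq == std::string::npos) {
            out.emplace_back(item, "");
        } else {
            out.emplace_back(item.substr(0, eq), item.substr(eq + 1));
        }
    }
    return out;
}
```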
2 changes: 1 addition & 1 deletion examples/cvector-generator/pca.hpp
@@ -66,7 +66,7 @@ struct pca_model {
pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
backend = ggml_backend_cuda_init(0, nullptr); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
17 changes: 17 additions & 0 deletions examples/llama-bench/llama-bench.cpp
@@ -256,6 +256,7 @@ struct cmd_params {
std::vector<bool> embeddings;
std::vector<llama_model_tensor_buft_override> buft_overrides;
ggml_numa_strategy numa;
std::string cuda_params;
int reps;
bool verbose;
bool warmup;
@@ -295,6 +296,7 @@ static const cmd_params cmd_params_defaults = {
/* embeddings */ {false},
/* buft_overrides */ {},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* cuda_params */ {},
/* reps */ 5,
/* verbose */ false,
/* warmup */ true,
@@ -344,6 +346,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" -w, --warmup <0|1> (default: %s)\n", cmd_params_defaults.warmup ? "1" : "0");
printf(" -rtr, --run-time-repack <0|1> (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
printf(" -cuda, --cuda-params <string> (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
printf(" -mqkv, --merge-qkv (default: %s)\n", cmd_params_defaults.mqkv ? "1" : "0");
printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0");
printf(" -ot, --override-tensor pattern (default: none)\n");
@@ -736,6 +739,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.repack = std::stoi(argv[i]);
} else if (arg == "-cuda" || arg == "--cuda-params") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cuda_params = argv[i];
} else if (arg == "-mqkv" || arg == "--merge-qkv") {
if (++i >= argc) {
invalid_param = true;
@@ -852,6 +861,7 @@ struct cmd_params_instance {
int attn_max_batch;
Ser ser;
std::vector<float> tensor_split;
std::string cuda_params;
bool use_mmap;
bool embeddings;
bool repack = false;
@@ -914,6 +924,7 @@ struct cmd_params_instance {
cparams.min_experts = ser.first;
cparams.thresh_experts = ser.second;
cparams.embeddings = embeddings;
cparams.cuda_params = (void *)cuda_params.data();

return cparams;
}
@@ -965,6 +976,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .cuda_params = */ params.cuda_params,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
@@ -1003,6 +1015,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .cuda_params = */ params.cuda_params,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
@@ -1041,6 +1054,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .cuda_params = */ params.cuda_params,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
@@ -1079,6 +1093,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .cuda_params = */ params.cuda_params,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
@@ -1128,6 +1143,7 @@ struct test {
int attn_max_batch;
Ser ser;
std::vector<float> tensor_split;
std::string cuda_params;
bool use_mmap;
bool embeddings;
bool repack = false;
@@ -1166,6 +1182,7 @@ struct test {
attn_max_batch = inst.attn_max_batch;
ser = inst.ser;
tensor_split = inst.tensor_split;
cuda_params = inst.cuda_params;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
repack = inst.repack;
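Unlike `common/common.cpp`, which only sets `cparams.cuda_params` when the string is non-empty, the bench code above always forwards `cuda_params.data()`. A sketch of the stricter variant, assuming the backend treats a null pointer as "use defaults" (the helper name is hypothetical):

```cpp
#include <string>
#include "llama.h"

// Mirror the guard in common/common.cpp: only hand the backend a pointer when
// CUDA parameters were actually provided. The pointer borrows the caller's
// string, so that string must outlive context creation.
static void set_cuda_params(llama_context_params & cparams, const std::string & cuda_params) {
    cparams.cuda_params = cuda_params.empty() ? nullptr
                                              : (void *) cuda_params.data();
}
```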
2 changes: 1 addition & 1 deletion ggml/include/ggml-cuda.h
@@ -21,7 +21,7 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params);

GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

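The added `params` argument is an opaque, backend-defined blob; existing callers such as `pca.hpp` above keep the previous behavior by passing `nullptr`. A minimal usage sketch, assuming the pointer is interpreted as the same NUL-terminated `key1=value1,key2=value2` string that the common code forwards (the concrete keys are placeholders, not documented by this diff):

```cpp
#include "ggml-cuda.h"
#include <cstdio>

int main() {
    // Previous behavior: no extra parameters, the backend uses its defaults.
    ggml_backend_t plain = ggml_backend_cuda_init(0, nullptr);

    // New path: forward a comma-separated key=value list. "some_key=1" is a
    // placeholder; the keys understood by the backend are not part of this diff.
    const char * cuda_params = "some_key=1";
    ggml_backend_t tuned = ggml_backend_cuda_init(0, cuda_params);

    if (!plain || !tuned) {
        fprintf(stderr, "ggml_backend_cuda_init() failed\n");
    }

    if (tuned) ggml_backend_free(tuned);
    if (plain) ggml_backend_free(plain);
    return 0;
}
```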