Skip to content

Commit 3d13f64

Browse files
committed
* llama-bench: add --devices support
- Support --devices, matching the llama-server option
- Allow benchmarking different device combinations
- Include --list-devices, like llama-server, for convenience
1 parent 8ff2060 commit 3d13f64

File tree

1 file changed

+137
-10
lines changed

1 file changed

+137
-10
lines changed

tools/llama-bench/llama-bench.cpp

Lines changed: 137 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,86 @@ static std::string get_gpu_info() {
135135
return join(gpu_list, ", ");
136136
}
137137

138+
static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
139+
std::vector<ggml_backend_dev_t> devices;
140+
std::string trimmed = string_strip(value);
141+
if (trimmed.empty()) {
142+
throw std::invalid_argument("no devices specified");
143+
}
144+
if (trimmed == "auto") {
145+
return devices;
146+
}
147+
148+
auto dev_names = string_split<std::string>(trimmed, '/');
149+
if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
150+
devices.push_back(nullptr);
151+
return devices;
152+
}
153+
154+
for (auto & name : dev_names) {
155+
std::string dev_name = string_strip(name);
156+
if (dev_name.empty()) {
157+
throw std::invalid_argument("invalid device specification");
158+
}
159+
auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
160+
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
161+
throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
162+
}
163+
devices.push_back(dev);
164+
}
165+
166+
devices.push_back(nullptr);
167+
return devices;
168+
}
169+
170+
[[noreturn]] static void print_available_devices_and_exit() {
171+
std::vector<ggml_backend_dev_t> devices;
172+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
173+
auto * dev = ggml_backend_dev_get(i);
174+
auto ty = ggml_backend_dev_type(dev);
175+
if (ty == GGML_BACKEND_DEVICE_TYPE_CPU) {
176+
continue;
177+
}
178+
devices.push_back(dev);
179+
}
180+
181+
printf("Available devices:\n");
182+
if (devices.empty()) {
183+
printf(" (none)\n");
184+
}
185+
for (auto * dev : devices) {
186+
size_t free = 0;
187+
size_t total = 0;
188+
ggml_backend_dev_memory(dev, &free, &total);
189+
printf(" %s: %s (%zu MiB, %zu MiB free)\n",
190+
ggml_backend_dev_name(dev),
191+
ggml_backend_dev_description(dev),
192+
total / 1024 / 1024,
193+
free / 1024 / 1024);
194+
}
195+
exit(0);
196+
}
197+
198+
static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
199+
if (devices.empty()) {
200+
return "auto";
201+
}
202+
203+
if (devices.size() == 1 && devices[0] == nullptr) {
204+
return "none";
205+
}
206+
207+
std::vector<std::string> names;
208+
for (auto * dev : devices) {
209+
if (dev == nullptr) {
210+
break;
211+
}
212+
names.push_back(ggml_backend_dev_name(dev));
213+
}
214+
215+
return join(names, "/");
216+
}
217+
138218
// command line params
139219
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
140220

@@ -256,6 +336,7 @@ struct cmd_params {
256336
std::vector<int> main_gpu;
257337
std::vector<bool> no_kv_offload;
258338
std::vector<bool> flash_attn;
339+
std::vector<std::vector<ggml_backend_dev_t>> devices;
259340
std::vector<std::vector<float>> tensor_split;
260341
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
261342
std::vector<bool> use_mmap;
@@ -293,6 +374,7 @@ static const cmd_params cmd_params_defaults = {
293374
/* main_gpu */ { 0 },
294375
/* no_kv_offload */ { false },
295376
/* flash_attn */ { false },
377+
/* devices */ { std::vector<ggml_backend_dev_t>() },
296378
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
297379
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
298380
/* use_mmap */ { true },
@@ -325,6 +407,7 @@ static void print_usage(int /* argc */, char ** argv) {
325407
output_format_str(cmd_params_defaults.output_format));
326408
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
327409
output_format_str(cmd_params_defaults.output_format_stderr));
410+
printf(" --list-devices list available devices and exit\n");
328411
printf(" -v, --verbose verbose output\n");
329412
printf(" --progress print test progress indicators\n");
330413
printf(" --no-warmup skip warmup runs before benchmarking\n");
@@ -369,6 +452,7 @@ static void print_usage(int /* argc */, char ** argv) {
369452
join(cmd_params_defaults.no_kv_offload, ",").c_str());
370453
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
371454
join(cmd_params_defaults.flash_attn, ",").c_str());
455+
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
372456
printf(" -mmp, --mmap <0|1> (default: %s)\n",
373457
join(cmd_params_defaults.use_mmap, ",").c_str());
374458
printf(" -embd, --embeddings <0|1> (default: %s)\n",
@@ -533,6 +617,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
533617
break;
534618
}
535619
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
620+
} else if (arg == "-dev" || arg == "--device") {
621+
if (++i >= argc) {
622+
invalid_param = true;
623+
break;
624+
}
625+
auto combos = string_split<std::string>(argv[i], split_delim);
626+
for (const auto & combo : combos) {
627+
try {
628+
params.devices.push_back(parse_devices_arg(combo));
629+
} catch (const std::exception & e) {
630+
fprintf(stderr, "error: %s\\n", e.what());
631+
invalid_param = true;
632+
break;
633+
}
634+
}
635+
if (invalid_param) {
636+
break;
637+
}
638+
} else if (arg == "--list-devices") {
639+
print_available_devices_and_exit();
536640
} else if (arg == "-t" || arg == "--threads") {
537641
if (++i >= argc) {
538642
invalid_param = true;
@@ -870,6 +974,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
870974
if (params.flash_attn.empty()) {
871975
params.flash_attn = cmd_params_defaults.flash_attn;
872976
}
977+
if (params.devices.empty()) {
978+
params.devices = cmd_params_defaults.devices;
979+
}
873980
if (params.tensor_split.empty()) {
874981
params.tensor_split = cmd_params_defaults.tensor_split;
875982
}
@@ -921,6 +1028,7 @@ struct cmd_params_instance {
9211028
int main_gpu;
9221029
bool no_kv_offload;
9231030
bool flash_attn;
1031+
std::vector<ggml_backend_dev_t> devices;
9241032
std::vector<float> tensor_split;
9251033
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
9261034
bool use_mmap;
@@ -931,7 +1039,9 @@ struct cmd_params_instance {
9311039
llama_model_params mparams = llama_model_default_params();
9321040

9331041
mparams.n_gpu_layers = n_gpu_layers;
934-
if (!rpc_servers_str.empty()) {
1042+
if (!devices.empty()) {
1043+
mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
1044+
} else if (!rpc_servers_str.empty()) {
9351045
auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
9361046

9371047
// add RPC devices
@@ -948,13 +1058,13 @@ struct cmd_params_instance {
9481058
fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
9491059
exit(1);
9501060
}
951-
static std::vector<ggml_backend_dev_t> devices;
952-
devices.clear();
1061+
static std::vector<ggml_backend_dev_t> rpc_devices;
1062+
rpc_devices.clear();
9531063
// RPC devices should always come first for performance reasons
9541064
for (const std::string & server : rpc_servers) {
9551065
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
9561066
if (dev) {
957-
devices.push_back(dev);
1067+
rpc_devices.push_back(dev);
9581068
} else {
9591069
fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
9601070
exit(1);
@@ -971,16 +1081,16 @@ struct cmd_params_instance {
9711081
break;
9721082

9731083
case GGML_BACKEND_DEVICE_TYPE_GPU:
974-
devices.push_back(dev);
1084+
rpc_devices.push_back(dev);
9751085
break;
9761086

9771087
case GGML_BACKEND_DEVICE_TYPE_IGPU:
9781088
// iGPUs are not used when there are RPC servers
9791089
break;
9801090
}
9811091
}
982-
devices.push_back(nullptr);
983-
mparams.devices = devices.data();
1092+
rpc_devices.push_back(nullptr);
1093+
mparams.devices = rpc_devices.data();
9841094
}
9851095
}
9861096
mparams.split_mode = split_mode;
@@ -1031,6 +1141,7 @@ struct cmd_params_instance {
10311141
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
10321142
rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
10331143
main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
1144+
devices == other.devices &&
10341145
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
10351146
}
10361147

@@ -1063,6 +1174,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10631174
for (const auto & rpc : params.rpc_servers)
10641175
for (const auto & sm : params.split_mode)
10651176
for (const auto & mg : params.main_gpu)
1177+
for (const auto & devs : params.devices)
10661178
for (const auto & ts : params.tensor_split)
10671179
for (const auto & ot : params.tensor_buft_overrides)
10681180
for (const auto & mmp : params.use_mmap)
@@ -1103,6 +1215,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
11031215
/* .main_gpu = */ mg,
11041216
/* .no_kv_offload= */ nkvo,
11051217
/* .flash_attn = */ fa,
1218+
/* .devices = */ devs,
11061219
/* .tensor_split = */ ts,
11071220
/* .tensor_buft_overrides = */ ot,
11081221
/* .use_mmap = */ mmp,
@@ -1136,6 +1249,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
11361249
/* .main_gpu = */ mg,
11371250
/* .no_kv_offload= */ nkvo,
11381251
/* .flash_attn = */ fa,
1252+
/* .devices = */ devs,
11391253
/* .tensor_split = */ ts,
11401254
/* .tensor_buft_overrides = */ ot,
11411255
/* .use_mmap = */ mmp,
@@ -1169,6 +1283,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
11691283
/* .main_gpu = */ mg,
11701284
/* .no_kv_offload= */ nkvo,
11711285
/* .flash_attn = */ fa,
1286+
/* .devices = */ devs,
11721287
/* .tensor_split = */ ts,
11731288
/* .tensor_buft_overrides = */ ot,
11741289
/* .use_mmap = */ mmp,
@@ -1206,6 +1321,7 @@ struct test {
12061321
int main_gpu;
12071322
bool no_kv_offload;
12081323
bool flash_attn;
1324+
std::vector<ggml_backend_dev_t> devices;
12091325
std::vector<float> tensor_split;
12101326
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
12111327
bool use_mmap;
@@ -1241,6 +1357,7 @@ struct test {
12411357
main_gpu = inst.main_gpu;
12421358
no_kv_offload = inst.no_kv_offload;
12431359
flash_attn = inst.flash_attn;
1360+
devices = inst.devices;
12441361
tensor_split = inst.tensor_split;
12451362
tensor_buft_overrides = inst.tensor_buft_overrides;
12461363
use_mmap = inst.use_mmap;
@@ -1292,9 +1409,9 @@ struct test {
12921409
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
12931410
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
12941411
"main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
1295-
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen",
1296-
"n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts",
1297-
"stddev_ts"
1412+
"devices", "use_mmap", "embeddings", "no_op_offload", "n_prompt",
1413+
"n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns",
1414+
"avg_ts", "stddev_ts"
12981415
};
12991416
return fields;
13001417
}
@@ -1378,6 +1495,7 @@ struct test {
13781495
std::to_string(main_gpu),
13791496
std::to_string(no_kv_offload),
13801497
std::to_string(flash_attn),
1498+
devices_to_string(devices),
13811499
tensor_split_str,
13821500
tensor_buft_overrides_str,
13831501
std::to_string(use_mmap),
@@ -1559,6 +1677,9 @@ struct markdown_printer : public printer {
15591677
if (field == "flash_attn") {
15601678
return 2;
15611679
}
1680+
if (field == "devices") {
1681+
return -12;
1682+
}
15621683
if (field == "use_mmap") {
15631684
return 4;
15641685
}
@@ -1602,6 +1723,9 @@ struct markdown_printer : public printer {
16021723
if (field == "no_op_offload") {
16031724
return "nopo";
16041725
}
1726+
if (field == "devices") {
1727+
return "dev";
1728+
}
16051729
if (field == "tensor_split") {
16061730
return "ts";
16071731
}
@@ -1661,6 +1785,9 @@ struct markdown_printer : public printer {
16611785
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
16621786
fields.emplace_back("flash_attn");
16631787
}
1788+
if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
1789+
fields.emplace_back("devices");
1790+
}
16641791
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
16651792
fields.emplace_back("tensor_split");
16661793
}

0 commit comments

Comments
 (0)