Commit ac7e8b9

llama-bench: rpc device simplification
* rpc servers unify with other devices earlier, simplifying code
* --list-devices made stateless and simpler
* various cleanup
1 parent 2565dfa commit ac7e8b9
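
The unification the message refers to rests on one fact: once register_rpc_device_list() has registered an RPC server with ggml, its devices show up in the ordinary backend-device enumeration like any local GPU, so llama-bench no longer needs to carry rpc_servers / rpc_device_sets state around. A minimal sketch of that enumeration, using the same ggml-backend calls as the new --list-devices handler below (the helper name is illustrative, not part of the commit):

    // Sketch: walk every backend device ggml knows about -- after --rpc has been
    // parsed, the RPC devices are already in this list alongside the local GPUs.
    #include "ggml-backend.h"
    #include <cstdio>

    static void print_non_cpu_devices() {  // illustrative helper, not from the commit
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                continue; // skip the always-present CPU device
            }
            size_t free = 0, total = 0;
            ggml_backend_dev_memory(dev, &free, &total);
            printf("%s: %s (%zu MiB, %zu MiB free)\n",
                   ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                   total / 1024 / 1024, free / 1024 / 1024);
        }
    }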

File tree

2 files changed: +25 / -96 lines changed

    tools/llama-bench/README.md
    tools/llama-bench/llama-bench.cpp

tools/llama-bench/README.md

Lines changed: 1 addition & 1 deletion
@@ -33,6 +33,7 @@ options:
   --list-devices                            list available devices and exit
   -v, --verbose                             verbose output
   --progress                                print test progress indicators
+  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)
 
 test parameters:
   -m, --model <filename>                    (default: models/7B/ggml-model-q4_0.gguf)
@@ -50,7 +51,6 @@ test parameters:
   --poll <0...100>                          (default: 50)
   -ngl, --n-gpu-layers <n>                  (default: 99)
   -ncmoe, --n-cpu-moe <n>                   (default: 0)
-  -rpc, --rpc <rpc_servers>                 (default: none)
   -sm, --split-mode <none|layer|row>        (default: layer)
   -mg, --main-gpu <i>                       (default: 0)
   -nkvo, --no-kv-offload <0|1>              (default: 0)
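
Because --list-devices now does its work inline during argument parsing and exits immediately, -rpc has to appear before it on the command line for the remote devices to show up in the listing. A hypothetical invocation (the host and port are placeholders, not from this commit):

    llama-bench --rpc 192.168.1.10:50052 --list-devices

This registers the RPC server's devices first, then prints them alongside the local GPUs and exits.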

tools/llama-bench/llama-bench.cpp

Lines changed: 24 additions & 95 deletions
@@ -211,34 +211,6 @@ static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::strin
     return devices;
 }
 
-[[noreturn]] static void print_available_devices_and_exit() {
-    std::vector<ggml_backend_dev_t> devices;
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        auto * dev = ggml_backend_dev_get(i);
-        auto ty = ggml_backend_dev_type(dev);
-        if (ty == GGML_BACKEND_DEVICE_TYPE_CPU) {
-            continue;
-        }
-        devices.push_back(dev);
-    }
-
-    printf("Available devices:\n");
-    if (devices.empty()) {
-        printf("  (none)\n");
-    }
-    for (auto * dev : devices) {
-        size_t free = 0;
-        size_t total = 0;
-        ggml_backend_dev_memory(dev, &free, &total);
-        printf("  %s: %s (%zu MiB, %zu MiB free)\n",
-               ggml_backend_dev_name(dev),
-               ggml_backend_dev_description(dev),
-               total / 1024 / 1024,
-               free / 1024 / 1024);
-    }
-    exit(0);
-}
-
 static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
     if (devices.empty()) {
         return "auto";
@@ -375,8 +347,6 @@ struct cmd_params {
     std::vector<int> poll;
     std::vector<int> n_gpu_layers;
     std::vector<int> n_cpu_moe;
-    std::vector<std::string> rpc_servers;
-    std::vector<std::vector<ggml_backend_dev_t>> rpc_device_sets;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
@@ -396,7 +366,6 @@ struct cmd_params {
     bool no_warmup;
     output_formats output_format;
     output_formats output_format_stderr;
-    bool list_devices;
 };
 
 static const cmd_params cmd_params_defaults = {
@@ -415,13 +384,11 @@ static const cmd_params cmd_params_defaults = {
     /* poll                 */ { 50 },
     /* n_gpu_layers         */ { 99 },
     /* n_cpu_moe            */ { 0 },
-    /* rpc_servers          */ { "" },
-    /* rpc_device_sets      */ { std::vector<ggml_backend_dev_t>() },
     /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
     /* main_gpu             */ { 0 },
     /* no_kv_offload        */ { false },
     /* flash_attn           */ { false },
-    /* devices              */ { std::vector<ggml_backend_dev_t>() },
+    /* devices              */ { {} },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
     /* use_mmap             */ { true },
@@ -436,7 +403,6 @@ static const cmd_params cmd_params_defaults = {
     /* no_warmup            */ false,
     /* output_format        */ MARKDOWN,
     /* output_format_stderr */ NONE,
-    /* list_devices         */ false,
 };
 
 static void print_usage(int /* argc */, char ** argv) {
@@ -459,6 +425,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                             verbose output\n");
     printf("  --progress                                print test progress indicators\n");
     printf("  --no-warmup                               skip warmup runs before benchmarking\n");
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)\n");
+    }
     printf("\n");
     printf("test parameters:\n");
     printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
@@ -488,10 +457,6 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -ncmoe, --n-cpu-moe <n>                   (default: %s)\n",
            join(cmd_params_defaults.n_cpu_moe, ",").c_str());
-    if (llama_supports_rpc()) {
-        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n",
-               join(cmd_params_defaults.rpc_servers, ",").c_str());
-    }
     printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                       (default: %s)\n",
@@ -561,7 +526,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.delay = cmd_params_defaults.delay;
     params.progress = cmd_params_defaults.progress;
     params.no_warmup = cmd_params_defaults.no_warmup;
-    params.list_devices = cmd_params_defaults.list_devices;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -676,7 +640,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 try {
                     params.devices.push_back(parse_devices_arg(combo));
                 } catch (const std::exception & e) {
-                    fprintf(stderr, "error: %s\\n", e.what());
+                    fprintf(stderr, "error: %s\n", e.what());
                     invalid_param = true;
                     break;
                 }
@@ -685,7 +649,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
         } else if (arg == "--list-devices") {
-            params.list_devices = true;
+            std::vector<ggml_backend_dev_t> devices;
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
+                }
+            }
+            printf("Available devices:\n");
+            if (devices.empty()) {
+                printf("  (none)\n");
+            }
+            for (auto * dev : devices) {
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
+            exit(0);
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -734,9 +714,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
            }
            try {
-                auto devices = register_rpc_device_list(argv[i]);
-                params.rpc_servers.push_back(argv[i]);
-                params.rpc_device_sets.push_back(devices);
+                register_rpc_device_list(argv[i]);
            } catch (const std::exception & e) {
                fprintf(stderr, "error: %s\n", e.what());
                invalid_param = true;
@@ -1016,12 +994,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_cpu_moe.empty()) {
         params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
     }
-    if (params.rpc_servers.empty()) {
-        params.rpc_servers = cmd_params_defaults.rpc_servers;
-    }
-    if (params.rpc_device_sets.empty()) {
-        params.rpc_device_sets = cmd_params_defaults.rpc_device_sets;
-    }
     if (params.split_mode.empty()) {
         params.split_mode = cmd_params_defaults.split_mode;
     }
@@ -1083,8 +1055,6 @@ struct cmd_params_instance {
     int poll;
     int n_gpu_layers;
     int n_cpu_moe;
-    std::string rpc_servers_str;
-    std::vector<ggml_backend_dev_t> rpc_devices;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
@@ -1102,24 +1072,6 @@ struct cmd_params_instance {
         mparams.n_gpu_layers = n_gpu_layers;
         if (!devices.empty()) {
             mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
-        } else if (!rpc_devices.empty()) {
-            static std::vector<ggml_backend_dev_t> merged_devices;
-            merged_devices.clear();
-            merged_devices.insert(merged_devices.end(), rpc_devices.begin(), rpc_devices.end());
-
-            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                auto dev_type = ggml_backend_dev_type(dev);
-                if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
-                    continue;
-                }
-                if (std::find(merged_devices.begin(), merged_devices.end(), dev) == merged_devices.end()) {
-                    merged_devices.push_back(dev);
-                }
-            }
-
-            merged_devices.push_back(nullptr);
-            mparams.devices = merged_devices.data();
         }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
@@ -1167,7 +1119,7 @@ struct cmd_params_instance {
 
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
-               rpc_servers_str == other.rpc_servers_str && rpc_devices == other.rpc_devices && split_mode == other.split_mode &&
+               split_mode == other.split_mode &&
               main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
               devices == other.devices &&
               vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
@@ -1199,7 +1151,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & ncmoe : params.n_cpu_moe)
-    for (size_t rpc_idx = 0; rpc_idx < params.rpc_servers.size(); ++rpc_idx)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & devs : params.devices)
@@ -1219,9 +1170,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & cs : params.cpu_strict)
     for (const auto & nd : params.n_depth)
     for (const auto & pl : params.poll) {
-        const auto & rpc = params.rpc_servers[rpc_idx];
-        const auto & rpc_set = params.rpc_device_sets[rpc_idx];
-
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -1241,8 +1189,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll          = */ pl,
                 /* .n_gpu_layers  = */ nl,
                 /* .n_cpu_moe     = */ ncmoe,
-                /* .rpc_servers   = */ rpc,
-                /* .rpc_devices   = */ rpc_set,
                 /* .split_mode    = */ sm,
                 /* .main_gpu      = */ mg,
                 /* .no_kv_offload = */ nkvo,
@@ -1276,8 +1222,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll          = */ pl,
                 /* .n_gpu_layers  = */ nl,
                 /* .n_cpu_moe     = */ ncmoe,
-                /* .rpc_servers   = */ rpc,
-                /* .rpc_devices   = */ rpc_set,
                 /* .split_mode    = */ sm,
                 /* .main_gpu      = */ mg,
                 /* .no_kv_offload = */ nkvo,
@@ -1311,8 +1255,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll          = */ pl,
                 /* .n_gpu_layers  = */ nl,
                 /* .n_cpu_moe     = */ ncmoe,
-                /* .rpc_servers   = */ rpc,
-                /* .rpc_devices   = */ rpc_set,
                 /* .split_mode    = */ sm,
                 /* .main_gpu      = */ mg,
                 /* .no_kv_offload = */ nkvo,
@@ -2050,19 +1992,6 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
-    if (params.list_devices) {
-        for (const auto & rpc : params.rpc_servers) {
-            if (!rpc.empty()) {
-                try {
-                    register_rpc_device_list(rpc);
-                } catch (const std::exception & e) {
-                    fprintf(stderr, "warning: %s\n", e.what());
-                }
-            }
-        }
-        print_available_devices_and_exit();
-    }
-
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!cpu_dev) {
         fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
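
On the model-parameter side, the deleted merged_devices block is not replaced by anything: when no explicit device list was given, mparams.devices is simply left at its default and llama.cpp falls back to every registered backend device, RPC ones included. A sketch of the resulting logic, as a free function standing in for the cmd_params_instance member (hypothetical name; it assumes llama.cpp's behaviour that a null devices pointer means "use all available devices", which is what the deleted code re-implemented by hand):

    #include "llama.h"
    #include <vector>

    // Illustrative stand-in for the member that builds llama_model_params in llama-bench.
    static llama_model_params make_mparams(const std::vector<ggml_backend_dev_t> & devices,
                                           int n_gpu_layers) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = n_gpu_layers;
        if (!devices.empty()) {
            // explicit --devices selection: pass the user-chosen list through
            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
        }
        // otherwise leave mparams.devices at its default: RPC devices registered via
        // register_rpc_device_list() are already in ggml's global registry, so the
        // old manual merge into a static merged_devices vector is no longer needed
        return mparams;
    }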
