From 3d13f64c87b410413e841ce948bf0ca09008d386 Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Tue, 16 Sep 2025 13:17:23 -0700
Subject: [PATCH 1/7] llama-bench: add --device support

- Support --device the same way llama-server does
- Allow benchmarking different device combinations
- Include --list-devices like llama-server for convenience
---
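Example usage (illustrative only: the model path and device names here are
hypothetical, run --list-devices to see the names your build exposes):

  # list the non-CPU devices this build can see
  llama-bench --list-devices

  # benchmark CPU-only, a single GPU, and a two-GPU split in one run;
  # combinations are separated by ',' and devices within a combination by '/'
  llama-bench -m model.gguf -dev none,Vulkan0,Vulkan0/Vulkan1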
 tools/llama-bench/llama-bench.cpp | 147 ++++++++++++++++++++++++++++--
 1 file changed, 137 insertions(+), 10 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index ad47bf144f002..2352d98cf2ebb 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -135,6 +135,86 @@ static std::string get_gpu_info() {
     return join(gpu_list, ", ");
 }
 
+static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    std::string trimmed = string_strip(value);
+    if (trimmed.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (trimmed == "auto") {
+        return devices;
+    }
+
+    auto dev_names = string_split<std::string>(trimmed, '/');
+    if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
+        devices.push_back(nullptr);
+        return devices;
+    }
+
+    for (auto & name : dev_names) {
+        std::string dev_name = string_strip(name);
+        if (dev_name.empty()) {
+            throw std::invalid_argument("invalid device specification");
+        }
+        auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
+        if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
+        }
+        devices.push_back(dev);
+    }
+
+    devices.push_back(nullptr);
+    return devices;
+}
+
+[[noreturn]] static void print_available_devices_and_exit() {
+    std::vector<ggml_backend_dev_t> devices;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto ty = ggml_backend_dev_type(dev);
+        if (ty == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            continue;
+        }
+        devices.push_back(dev);
+    }
+
+    printf("Available devices:\n");
+    if (devices.empty()) {
+        printf("  (none)\n");
+    }
+    for (auto * dev : devices) {
+        size_t free = 0;
+        size_t total = 0;
+        ggml_backend_dev_memory(dev, &free, &total);
+        printf("  %s: %s (%zu MiB, %zu MiB free)\n",
+               ggml_backend_dev_name(dev),
+               ggml_backend_dev_description(dev),
+               total / 1024 / 1024,
+               free / 1024 / 1024);
+    }
+    exit(0);
+}
+
+static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
+    if (devices.empty()) {
+        return "auto";
+    }
+
+    if (devices.size() == 1 && devices[0] == nullptr) {
+        return "none";
+    }
+
+    std::vector<std::string> names;
+    for (auto * dev : devices) {
+        if (dev == nullptr) {
+            break;
+        }
+        names.push_back(ggml_backend_dev_name(dev));
+    }
+
+    return join(names, "/");
+}
+
 // command line params
 
 enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
@@ -256,6 +336,7 @@ struct cmd_params {
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
     std::vector<bool> flash_attn;
+    std::vector<std::vector<ggml_backend_dev_t>> devices;
     std::vector<std::vector<float>> tensor_split;
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool> use_mmap;
@@ -293,6 +374,7 @@ static const cmd_params cmd_params_defaults = {
     /* main_gpu             */ { 0 },
     /* no_kv_offload        */ { false },
     /* flash_attn           */ { false },
+    /* devices              */ { std::vector<ggml_backend_dev_t>() },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
     /* use_mmap             */ { true },
@@ -325,6 +407,7 @@ static void print_usage(int /* argc */, char ** argv) {
            output_format_str(cmd_params_defaults.output_format));
     printf("  -oe, --output-err <csv|json|jsonl|md|sql>         output format printed to stderr (default: %s)\n",
            output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  --list-devices                                    list available devices and exit\n");
     printf("  -v, --verbose                                     verbose output\n");
     printf("  --progress                                        print test progress indicators\n");
     printf("  --no-warmup                                       skip warmup runs before benchmarking\n");
@@ -369,6 +452,7 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -fa, --flash-attn <0|1>                           (default: %s)\n",
            join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -dev, --device <none|dev1/dev2/..>                (default: auto)\n");
     printf("  -mmp, --mmap <0|1>                                (default: %s)\n",
            join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -embd, --embeddings <0|1>                         (default: %s)\n",
@@ -533,6 +617,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+        } else if (arg == "-dev" || arg == "--device") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto combos = string_split<std::string>(argv[i], split_delim);
+            for (const auto & combo : combos) {
+                try {
+                    params.devices.push_back(parse_devices_arg(combo));
+                } catch (const std::exception & e) {
+                    fprintf(stderr, "error: %s\\n", e.what());
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                break;
+            }
+        } else if (arg == "--list-devices") {
+            print_available_devices_and_exit();
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -870,6 +974,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty()) {
         params.flash_attn = cmd_params_defaults.flash_attn;
     }
+    if (params.devices.empty()) {
+        params.devices = cmd_params_defaults.devices;
+    }
     if (params.tensor_split.empty()) {
         params.tensor_split = cmd_params_defaults.tensor_split;
     }
@@ -921,6 +1028,7 @@ struct cmd_params_instance {
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -931,7 +1039,9 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers_str.empty()) {
+        if (!devices.empty()) {
+            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
+        } else if (!rpc_servers_str.empty()) {
             auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
 
             // add RPC devices
@@ -948,13 +1058,13 @@ struct cmd_params_instance {
                 fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
                 exit(1);
             }
-            static std::vector<ggml_backend_dev_t> devices;
-            devices.clear();
+            static std::vector<ggml_backend_dev_t> rpc_devices;
+            rpc_devices.clear();
             // RPC devices should always come first for performance reasons
             for (const std::string & server : rpc_servers) {
                 ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
                 if (dev) {
-                    devices.push_back(dev);
+                    rpc_devices.push_back(dev);
                 } else {
                     fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
                     exit(1);
@@ -971,7 +1081,7 @@ struct cmd_params_instance {
                         break;
 
                     case GGML_BACKEND_DEVICE_TYPE_GPU:
-                        devices.push_back(dev);
+                        rpc_devices.push_back(dev);
                         break;
 
                     case GGML_BACKEND_DEVICE_TYPE_IGPU:
@@ -979,8 +1089,8 @@ struct cmd_params_instance {
                         break;
                 }
             }
-            devices.push_back(nullptr);
-            mparams.devices = devices.data();
+            rpc_devices.push_back(nullptr);
+            mparams.devices = rpc_devices.data();
         }
     }
     mparams.split_mode = split_mode;
@@ -1031,6 +1141,7 @@ struct cmd_params_instance {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
                rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
                main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+               devices == other.devices &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 
@@ -1063,6 +1174,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
+    for (const auto & devs : params.devices)
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
@@ -1103,6 +1215,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1136,6 +1249,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1169,6 +1283,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1206,6 +1321,7 @@ struct test {
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -1241,6 +1357,7 @@ struct test {
         main_gpu = inst.main_gpu;
         no_kv_offload = inst.no_kv_offload;
         flash_attn = inst.flash_attn;
+        devices = inst.devices;
        tensor_split = inst.tensor_split;
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
@@ -1292,9 +1409,9 @@ struct test {
             "n_ubatch",       "n_threads",     "cpu_mask",       "cpu_strict",   "poll",
             "type_k",         "type_v",        "n_gpu_layers",   "n_cpu_moe",    "split_mode",
             "main_gpu",       "no_kv_offload", "flash_attn",     "tensor_split", "tensor_buft_overrides",
-            "use_mmap",       "embeddings",    "no_op_offload",  "n_prompt",     "n_gen",
-            "n_depth",        "test_time",     "avg_ns",         "stddev_ns",    "avg_ts",
-            "stddev_ts"
+            "devices",        "use_mmap",      "embeddings",     "no_op_offload", "n_prompt",
+            "n_gen",          "n_depth",       "test_time",      "avg_ns",       "stddev_ns",
+            "avg_ts",         "stddev_ts"
         };
         return fields;
     }
@@ -1378,6 +1495,7 @@ struct test {
             std::to_string(main_gpu),
             std::to_string(no_kv_offload),
             std::to_string(flash_attn),
+            devices_to_string(devices),
             tensor_split_str,
             tensor_buft_overrides_str,
             std::to_string(use_mmap),
@@ -1559,6 +1677,9 @@ struct markdown_printer : public printer {
         if (field == "flash_attn") {
             return 2;
         }
+        if (field == "devices") {
+            return -12;
+        }
         if (field == "use_mmap") {
             return 4;
         }
@@ -1602,6 +1723,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "devices") {
+            return "dev";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -1661,6 +1785,9 @@ struct markdown_printer : public printer {
         if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
             fields.emplace_back("flash_attn");
         }
+        if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
+            fields.emplace_back("devices");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }

From 6d3f8ca50d117ff30302aca51cf4be8fad6946a6 Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Tue, 16 Sep 2025 14:24:33 -0700
Subject: [PATCH 2/7] fix: field display ordering restored

---
 tools/llama-bench/llama-bench.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 2352d98cf2ebb..b58b13c158558 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -1404,14 +1404,14 @@ struct test {
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit",   "build_number",  "cpu_info",       "gpu_info",     "backends",
-            "model_filename", "model_type",    "model_size",     "model_n_params", "n_batch",
-            "n_ubatch",       "n_threads",     "cpu_mask",       "cpu_strict",   "poll",
-            "type_k",         "type_v",        "n_gpu_layers",   "n_cpu_moe",    "split_mode",
-            "main_gpu",       "no_kv_offload", "flash_attn",     "tensor_split", "tensor_buft_overrides",
-            "devices",        "use_mmap",      "embeddings",     "no_op_offload", "n_prompt",
-            "n_gen",          "n_depth",       "test_time",      "avg_ns",       "stddev_ns",
-            "avg_ts",         "stddev_ts"
+            "build_commit",          "build_number",  "cpu_info",     "gpu_info",       "backends",
+            "model_filename",        "model_type",    "model_size",   "model_n_params", "n_batch",
+            "n_ubatch",              "n_threads",     "cpu_mask",     "cpu_strict",     "poll",
+            "type_k",                "type_v",        "n_gpu_layers", "n_cpu_moe",      "split_mode",
+            "main_gpu",              "no_kv_offload", "flash_attn",   "devices",        "tensor_split",
+            "tensor_buft_overrides", "use_mmap",      "embeddings",   "no_op_offload",
+            "n_prompt",              "n_gen",         "n_depth",      "test_time",      "avg_ns",
+            "stddev_ns",             "avg_ts",        "stddev_ts"
         };
         return fields;
     }

From 8baa81bb03dffa5ea5543e08e981f388dd1ead9f Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Tue, 16 Sep 2025 15:10:11 -0700
Subject: [PATCH 3/7] fix: integrate the RPC devices

- aims to mimic the server as much as possible
---
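Example: -rpc now registers the given servers at argument-parse time, and each
test instance merges those RPC devices ahead of the local GPUs, as the old
per-instance code did. Illustrative run against a hypothetical endpoint:

  llama-bench -m model.gguf -rpc 192.168.1.2:50052 -ngl 99

If -dev is also given it takes precedence: to_llama_mparams() checks the
explicit device list before falling back to the merged RPC + local GPU list.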
 tools/llama-bench/llama-bench.cpp | 121 ++++++++++++++++++------------
 1 file changed, 71 insertions(+), 50 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index b58b13c158558..c40c342e6fb76 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -167,6 +167,36 @@ static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & val
     return devices;
 }
 
+static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::string & servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+
+    auto * rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+
+    using add_rpc_device_fn = ggml_backend_dev_t (*)(const char * endpoint);
+    auto * ggml_backend_rpc_add_device_fn = (add_rpc_device_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+
+    std::vector<ggml_backend_dev_t> devices;
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+        if (!dev) {
+            throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
+        }
+        ggml_backend_device_register(dev);
+        devices.push_back(dev);
+    }
+
+    return devices;
+}
+
 [[noreturn]] static void print_available_devices_and_exit() {
     std::vector<ggml_backend_dev_t> devices;
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
@@ -332,6 +362,7 @@ struct cmd_params {
     std::vector<int> n_gpu_layers;
     std::vector<int> n_cpu_moe;
     std::vector<std::string> rpc_servers;
+    std::vector<std::vector<ggml_backend_dev_t>> rpc_device_sets;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
@@ -370,6 +401,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_gpu_layers         */ { 99 },
     /* n_cpu_moe            */ { 0 },
     /* rpc_servers          */ { "" },
+    /* rpc_device_sets      */ { std::vector<ggml_backend_dev_t>() },
     /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
     /* main_gpu             */ { 0 },
     /* no_kv_offload        */ { false },
@@ -684,7 +716,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.rpc_servers.push_back(argv[i]);
+            ggml_backend_load_all();
+            try {
+                auto devices = register_rpc_device_list(argv[i]);
+                params.rpc_servers.push_back(argv[i]);
+                params.rpc_device_sets.push_back(devices);
+            } catch (const std::exception & e) {
+                fprintf(stderr, "error: %s\n", e.what());
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -962,6 +1003,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.rpc_servers.empty()) {
         params.rpc_servers = cmd_params_defaults.rpc_servers;
     }
+    if (params.rpc_device_sets.empty()) {
+        params.rpc_device_sets = cmd_params_defaults.rpc_device_sets;
+    }
+    if (params.rpc_device_sets.size() < params.rpc_servers.size()) {
+        params.rpc_device_sets.resize(params.rpc_servers.size());
+    }
     if (params.split_mode.empty()) {
         params.split_mode = cmd_params_defaults.split_mode;
     }
@@ -1024,6 +1071,7 @@ struct cmd_params_instance {
     int n_gpu_layers;
     int n_cpu_moe;
     std::string rpc_servers_str;
+    std::vector<ggml_backend_dev_t> rpc_devices;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
@@ -1041,57 +1089,24 @@ struct cmd_params_instance {
         mparams.n_gpu_layers = n_gpu_layers;
         if (!devices.empty()) {
             mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
-        } else if (!rpc_servers_str.empty()) {
-            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
-
-            // add RPC devices
-            if (!rpc_servers.empty()) {
-                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-                if (!rpc_reg) {
-                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
-                    exit(1);
-                }
+        } else if (!rpc_devices.empty()) {
+            static std::vector<ggml_backend_dev_t> merged_devices;
+            merged_devices.clear();
+            merged_devices.insert(merged_devices.end(), rpc_devices.begin(), rpc_devices.end());
 
-                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-                if (!ggml_backend_rpc_add_device_fn) {
-                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
-                    exit(1);
-                }
-                static std::vector<ggml_backend_dev_t> rpc_devices;
-                rpc_devices.clear();
-                // RPC devices should always come first for performance reasons
-                for (const std::string & server : rpc_servers) {
-                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-                    if (dev) {
-                        rpc_devices.push_back(dev);
-                    } else {
-                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                        exit(1);
-                    }
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                auto dev_type = ggml_backend_dev_type(dev);
+                if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+                    continue;
                 }
-                // FIXME: use llama.cpp device selection logic
-                // add local GPU devices if any
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                    switch (ggml_backend_dev_type(dev)) {
-                        case GGML_BACKEND_DEVICE_TYPE_CPU:
-                        case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                            // skip CPU backends since they are handled separately
-                            break;
-
-                        case GGML_BACKEND_DEVICE_TYPE_GPU:
-                            rpc_devices.push_back(dev);
-                            break;
-
-                        case GGML_BACKEND_DEVICE_TYPE_IGPU:
-                            // iGPUs are not used when there are RPC servers
-                            break;
-                    }
+                if (std::find(merged_devices.begin(), merged_devices.end(), dev) == merged_devices.end()) {
+                    merged_devices.push_back(dev);
                 }
-                rpc_devices.push_back(nullptr);
-                mparams.devices = rpc_devices.data();
             }
+
+            merged_devices.push_back(nullptr);
+            mparams.devices = merged_devices.data();
         }
         mparams.split_mode = split_mode;
@@ -1139,7 +1154,7 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
-               rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
+               rpc_servers_str == other.rpc_servers_str && rpc_devices == other.rpc_devices && split_mode == other.split_mode &&
                main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
                devices == other.devices &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
@@ -1171,7 +1186,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & ncmoe : params.n_cpu_moe)
-    for (const auto & rpc : params.rpc_servers)
+    for (size_t rpc_idx = 0; rpc_idx < params.rpc_servers.size(); ++rpc_idx)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & devs : params.devices)
@@ -1191,6 +1206,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & cs : params.cpu_strict)
     for (const auto & nd : params.n_depth)
     for (const auto & pl : params.poll) {
+        const auto & rpc = params.rpc_servers[rpc_idx];
+        const auto & rpc_set = params.rpc_device_sets[rpc_idx];
+
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -1211,6 +1229,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
                 /* .rpc_servers  = */ rpc,
+                /* .rpc_devices  = */ rpc_set,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -1245,6 +1264,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
                 /* .rpc_servers  = */ rpc,
+                /* .rpc_devices  = */ rpc_set,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -1279,6 +1299,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
                 /* .rpc_servers  = */ rpc,
+                /* .rpc_devices  = */ rpc_set,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,

From 96f259bb6da4dc56d565f394ffe7aa0d353519b3 Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Tue, 16 Sep 2025 18:39:07 -0700
Subject: [PATCH 4/7] cleanup: defaults for list-devices

- handle duplicate device listing with RPC
---
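Example: RPC devices are now registered once under their "RPC[<endpoint>]"
name and reused when the same server is listed again, and --list-devices is
deferred until after parsing so it also shows devices registered via -rpc,
regardless of flag order. Illustrative, with a hypothetical endpoint:

  llama-bench -rpc 192.168.1.2:50052 --list-devices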
 tools/llama-bench/llama-bench.cpp | 42 +++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index c40c342e6fb76..065f22a257789 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -17,6 +17,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <unordered_set>
 
 #include "common.h"
 #include "ggml.h"
@@ -184,13 +185,26 @@ static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::strin
         throw std::invalid_argument("failed to find RPC device add function");
     }
 
+    static std::unordered_set<std::string> registered;
     std::vector<ggml_backend_dev_t> devices;
     for (const auto & server : rpc_servers) {
-        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+        ggml_backend_dev_t dev = nullptr;
+
+        std::string name = string_format("RPC[%s]", server.c_str());
+
+        if (registered.find(server) != registered.end()) {
+            dev = ggml_backend_dev_by_name(name.c_str());
+        }
+
         if (!dev) {
-            throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
+            dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (!dev) {
+                throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
+            }
+            ggml_backend_device_register(dev);
+            registered.insert(server);
         }
-        ggml_backend_device_register(dev);
+
         devices.push_back(dev);
     }
 
@@ -382,6 +396,7 @@ struct cmd_params {
     bool no_warmup;
     output_formats output_format;
     output_formats output_format_stderr;
+    bool list_devices;
 };
 
 static const cmd_params cmd_params_defaults = {
@@ -421,6 +436,7 @@ static const cmd_params cmd_params_defaults = {
     /* no_warmup            */ false,
     /* output_format        */ MARKDOWN,
     /* output_format_stderr */ NONE,
+    /* list_devices         */ false,
 };
 
 static void print_usage(int /* argc */, char ** argv) {
@@ -545,6 +561,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.delay = cmd_params_defaults.delay;
     params.progress = cmd_params_defaults.progress;
     params.no_warmup = cmd_params_defaults.no_warmup;
+    params.list_devices = cmd_params_defaults.list_devices;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -668,7 +685,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
         } else if (arg == "--list-devices") {
-            print_available_devices_and_exit();
+            params.list_devices = true;
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1006,9 +1023,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.rpc_device_sets.empty()) {
         params.rpc_device_sets = cmd_params_defaults.rpc_device_sets;
     }
-    if (params.rpc_device_sets.size() < params.rpc_servers.size()) {
-        params.rpc_device_sets.resize(params.rpc_servers.size());
-    }
     if (params.split_mode.empty()) {
         params.split_mode = cmd_params_defaults.split_mode;
     }
@@ -2037,6 +2051,20 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
+    if (params.list_devices) {
+        ggml_backend_load_all();
+        for (const auto & rpc : params.rpc_servers) {
+            if (!rpc.empty()) {
+                try {
+                    register_rpc_device_list(rpc);
+                } catch (const std::exception & e) {
+                    fprintf(stderr, "warning: %s\n", e.what());
+                }
+            }
+        }
+        print_available_devices_and_exit();
+    }
+
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!cpu_dev) {
         fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);

From 667663a72d22f7573998c57f51b44bbd34f44882 Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Tue, 16 Sep 2025 18:55:00 -0700
Subject: [PATCH 5/7] cleanup: remove duplicate device load calls

---
 tools/llama-bench/llama-bench.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 065f22a257789..826f967837132 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -733,7 +733,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            ggml_backend_load_all();
             try {
                 auto devices = register_rpc_device_list(argv[i]);
                 params.rpc_servers.push_back(argv[i]);
@@ -2052,7 +2051,6 @@ int main(int argc, char ** argv) {
     cmd_params params = parse_cmd_params(argc, argv);
 
     if (params.list_devices) {
-        ggml_backend_load_all();
         for (const auto & rpc : params.rpc_servers) {
             if (!rpc.empty()) {
                 try {

From 2565dfa258667b2f8f016cffcd29833a628e0fa8 Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Tue, 16 Sep 2025 19:19:49 -0700
Subject: [PATCH 6/7] docs: update llama-bench

- also document the recently added n-cpu-moe option while in there
---
 tools/llama-bench/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md
index bf7fd29c8c55f..4d48cdc687f8b 100644
--- a/tools/llama-bench/README.md
+++ b/tools/llama-bench/README.md
@@ -30,6 +30,7 @@ options:
   --delay <0...N>                       (seconds) delay between each test (default: 0)
   -o, --output <csv|json|jsonl|md|sql>  output format printed to stdout (default: md)
   -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
+  --list-devices                        list available devices and exit
   -v, --verbose                         verbose output
   --progress                            print test progress indicators
 
@@ -48,11 +49,13 @@ test parameters:
   --cpu-strict <0|1>                    (default: 0)
   --poll <0...100>                      (default: 50)
   -ngl, --n-gpu-layers <n>              (default: 99)
+  -ncmoe, --n-cpu-moe <n>               (default: 0)
   -rpc, --rpc <rpc_servers>             (default: none)
   -sm, --split-mode <none|layer|row>    (default: layer)
   -mg, --main-gpu <i>                   (default: 0)
   -nkvo, --no-kv-offload <0|1>          (default: 0)
   -fa, --flash-attn <0|1>               (default: 0)
+  -dev, --device <none|dev1/dev2/..>    (default: auto)
   -mmp, --mmap <0|1>                    (default: 1)
   -embd, --embeddings <0|1>             (default: 0)
   -ts, --tensor-split <ts0/ts1/..>      (default: 0)

From ac7e8b9ebc0e0c93c7d54d4c12cce44350866c20 Mon Sep 17 00:00:00 2001
From: Scott Sweeney <1149151+ssweens@users.noreply.github.com>
Date: Thu, 18 Sep 2025 20:31:38 -0700
Subject: [PATCH 7/7] llama-bench: rpc device simplification

* RPC servers unify with other devices earlier, simplifying the code
* --list-devices made stateless and simpler
* various cleanup
---
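Example: -rpc is now purely a device-registration option; registered RPC
devices appear in --list-devices and are selected with -dev like any local
device. Note that --list-devices now prints and exits while arguments are
being parsed, so -rpc has to come before it. Illustrative, with a
hypothetical endpoint and local device name (the RPC[...] naming comes from
register_rpc_device_list above):

  llama-bench -rpc 192.168.1.2:50052 --list-devices
  llama-bench -m model.gguf -rpc 192.168.1.2:50052 -dev RPC[192.168.1.2:50052]/CUDA0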
 tools/llama-bench/README.md       |   2 +-
 tools/llama-bench/llama-bench.cpp | 119 ++++++------------------------
 2 files changed, 25 insertions(+), 96 deletions(-)

diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md
index 4d48cdc687f8b..ead4da45e2957 100644
--- a/tools/llama-bench/README.md
+++ b/tools/llama-bench/README.md
@@ -33,6 +33,7 @@ options:
   --list-devices                        list available devices and exit
   -v, --verbose                         verbose output
   --progress                            print test progress indicators
+  -rpc, --rpc <rpc_servers>             register RPC devices (comma separated)
 
 test parameters:
   -m, --model <filename>                (default: models/7B/ggml-model-q4_0.gguf)
@@ -50,7 +51,6 @@ test parameters:
   --poll <0...100>                      (default: 50)
   -ngl, --n-gpu-layers <n>              (default: 99)
   -ncmoe, --n-cpu-moe <n>               (default: 0)
-  -rpc, --rpc <rpc_servers>             (default: none)
   -sm, --split-mode <none|layer|row>    (default: layer)
   -mg, --main-gpu <i>                   (default: 0)
   -nkvo, --no-kv-offload <0|1>          (default: 0)
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 826f967837132..275ba367c02f1 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -211,34 +211,6 @@ static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::strin
     return devices;
 }
 
-[[noreturn]] static void print_available_devices_and_exit() {
-    std::vector<ggml_backend_dev_t> devices;
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        auto * dev = ggml_backend_dev_get(i);
-        auto ty = ggml_backend_dev_type(dev);
-        if (ty == GGML_BACKEND_DEVICE_TYPE_CPU) {
-            continue;
-        }
-        devices.push_back(dev);
-    }
-
-    printf("Available devices:\n");
-    if (devices.empty()) {
-        printf("  (none)\n");
-    }
-    for (auto * dev : devices) {
-        size_t free = 0;
-        size_t total = 0;
-        ggml_backend_dev_memory(dev, &free, &total);
-        printf("  %s: %s (%zu MiB, %zu MiB free)\n",
-               ggml_backend_dev_name(dev),
-               ggml_backend_dev_description(dev),
-               total / 1024 / 1024,
-               free / 1024 / 1024);
-    }
-    exit(0);
-}
-
 static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
     if (devices.empty()) {
         return "auto";
@@ -375,8 +347,6 @@ struct cmd_params {
     std::vector<int> poll;
     std::vector<int> n_gpu_layers;
     std::vector<int> n_cpu_moe;
-    std::vector<std::string> rpc_servers;
-    std::vector<std::vector<ggml_backend_dev_t>> rpc_device_sets;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
@@ -396,7 +366,6 @@ struct cmd_params {
     bool no_warmup;
     output_formats output_format;
     output_formats output_format_stderr;
-    bool list_devices;
 };
 
@@ -415,13 +384,11 @@ static const cmd_params cmd_params_defaults = {
     /* poll                 */ { 50 },
     /* n_gpu_layers         */ { 99 },
     /* n_cpu_moe            */ { 0 },
-    /* rpc_servers          */ { "" },
-    /* rpc_device_sets      */ { std::vector<ggml_backend_dev_t>() },
     /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
     /* main_gpu             */ { 0 },
     /* no_kv_offload        */ { false },
     /* flash_attn           */ { false },
-    /* devices              */ { std::vector<ggml_backend_dev_t>() },
+    /* devices              */ { {} },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
     /* use_mmap             */ { true },
@@ -436,7 +403,6 @@ static const cmd_params cmd_params_defaults = {
     /* no_warmup            */ false,
     /* output_format        */ MARKDOWN,
     /* output_format_stderr */ NONE,
-    /* list_devices         */ false,
 };
 
 static void print_usage(int /* argc */, char ** argv) {
@@ -459,6 +425,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                                     verbose output\n");
     printf("  --progress                                        print test progress indicators\n");
     printf("  --no-warmup                                       skip warmup runs before benchmarking\n");
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                         register RPC devices (comma separated)\n");
+    }
     printf("\n");
     printf("test parameters:\n");
     printf("  -m, --model <filename>                            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
@@ -488,10 +457,6 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -ncmoe, --n-cpu-moe <n>                           (default: %s)\n",
            join(cmd_params_defaults.n_cpu_moe, ",").c_str());
-    if (llama_supports_rpc()) {
-        printf("  -rpc, --rpc <rpc_servers>                         (default: %s)\n",
-               join(cmd_params_defaults.rpc_servers, ",").c_str());
-    }
     printf("  -sm, --split-mode <none|layer|row>                (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                               (default: %s)\n",
@@ -561,7 +526,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.delay = cmd_params_defaults.delay;
     params.progress = cmd_params_defaults.progress;
     params.no_warmup = cmd_params_defaults.no_warmup;
-    params.list_devices = cmd_params_defaults.list_devices;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -676,7 +640,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 try {
                     params.devices.push_back(parse_devices_arg(combo));
                 } catch (const std::exception & e) {
-                    fprintf(stderr, "error: %s\\n", e.what());
+                    fprintf(stderr, "error: %s\n", e.what());
                     invalid_param = true;
                     break;
                 }
@@ -685,7 +649,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
         } else if (arg == "--list-devices") {
-            params.list_devices = true;
+            std::vector<ggml_backend_dev_t> devices;
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
+                }
+            }
+            printf("Available devices:\n");
+            if (devices.empty()) {
+                printf("  (none)\n");
+            }
+            for (auto * dev : devices) {
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
+            exit(0);
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -734,9 +714,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
             try {
-                auto devices = register_rpc_device_list(argv[i]);
-                params.rpc_servers.push_back(argv[i]);
-                params.rpc_device_sets.push_back(devices);
+                register_rpc_device_list(argv[i]);
             } catch (const std::exception & e) {
                 fprintf(stderr, "error: %s\n", e.what());
                 invalid_param = true;
@@ -1016,12 +994,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_cpu_moe.empty()) {
         params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
     }
-    if (params.rpc_servers.empty()) {
-        params.rpc_servers = cmd_params_defaults.rpc_servers;
-    }
-    if (params.rpc_device_sets.empty()) {
-        params.rpc_device_sets = cmd_params_defaults.rpc_device_sets;
-    }
     if (params.split_mode.empty()) {
         params.split_mode = cmd_params_defaults.split_mode;
     }
@@ -1083,8 +1055,6 @@ struct cmd_params_instance {
     int poll;
     int n_gpu_layers;
     int n_cpu_moe;
-    std::string rpc_servers_str;
-    std::vector<ggml_backend_dev_t> rpc_devices;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
@@ -1102,24 +1072,6 @@ struct cmd_params_instance {
         mparams.n_gpu_layers = n_gpu_layers;
         if (!devices.empty()) {
             mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
-        } else if (!rpc_devices.empty()) {
-            static std::vector<ggml_backend_dev_t> merged_devices;
-            merged_devices.clear();
-            merged_devices.insert(merged_devices.end(), rpc_devices.begin(), rpc_devices.end());
-
-            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                auto dev_type = ggml_backend_dev_type(dev);
-                if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
-                    continue;
-                }
-                if (std::find(merged_devices.begin(), merged_devices.end(), dev) == merged_devices.end()) {
-                    merged_devices.push_back(dev);
-                }
-            }
-
-            merged_devices.push_back(nullptr);
-            mparams.devices = merged_devices.data();
         }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
@@ -1167,7 +1119,7 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
-               rpc_servers_str == other.rpc_servers_str && rpc_devices == other.rpc_devices && split_mode == other.split_mode &&
+               split_mode == other.split_mode &&
               main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
               devices == other.devices &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
@@ -1199,7 +1151,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & ncmoe : params.n_cpu_moe)
-    for (size_t rpc_idx = 0; rpc_idx < params.rpc_servers.size(); ++rpc_idx)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & devs : params.devices)
@@ -1219,9 +1170,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & cs : params.cpu_strict)
     for (const auto & nd : params.n_depth)
     for (const auto & pl : params.poll) {
-        const auto & rpc = params.rpc_servers[rpc_idx];
-        const auto & rpc_set = params.rpc_device_sets[rpc_idx];
-
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -1241,8 +1189,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
-                /* .rpc_servers  = */ rpc,
-                /* .rpc_devices  = */ rpc_set,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -1276,8 +1222,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
-                /* .rpc_servers  = */ rpc,
-                /* .rpc_devices  = */ rpc_set,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -1311,8 +1255,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
-                /* .rpc_servers  = */ rpc,
-                /* .rpc_devices  = */ rpc_set,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -2050,19 +1992,6 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
-    if (params.list_devices) {
-        for (const auto & rpc : params.rpc_servers) {
-            if (!rpc.empty()) {
-                try {
-                    register_rpc_device_list(rpc);
-                } catch (const std::exception & e) {
-                    fprintf(stderr, "warning: %s\n", e.what());
-                }
-            }
-        }
-        print_available_devices_and_exit();
-    }
-
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!cpu_dev) {
         fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);