Commit 7e3e0d9

add --rpc-layers flag to explicitly set RPC layers
The current setup does not allow precise control over how many layers are placed on the local GPUs versus the remote RPC-connected server(s). This adds an additional --rpc-layers flag (-nrl) that lets the user explicitly set the number of layers to offload to the RPC end.
1 parent 325afb3 commit 7e3e0d9

File tree

5 files changed, +30 -1 lines changed


common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -1489,6 +1489,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(common_arg(
+        {"-nrl", "--rpc-layers", "--n-rpc-layers"}, "N",
+        "number of layers to store on remote RPC devices",
+        [](common_params & params, int value) {
+            params.n_rpc_layers = value;
+        }
+    ).set_env("LLAMA_ARG_N_RPC_LAYERS"));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"

common/common.cpp

Lines changed: 3 additions & 0 deletions
@@ -1086,6 +1086,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    if (params.n_rpc_layers != -1) {
+        mparams.n_rpc_layers = params.n_rpc_layers;
+    }
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -217,6 +217,7 @@ struct common_params {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_rpc_layers = -1; // number of layers to store on RPC devices (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -280,6 +280,7 @@ extern "C" {
        ggml_backend_dev_t * devices;
 
        int32_t n_gpu_layers; // number of layers to store in VRAM
+       int32_t n_rpc_layers; // number of layers to delegate to RPC connected devices
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
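
For context, a minimal consumer-side sketch of the new field. This is not part of the commit: it assumes the current llama_model_load_from_file / llama_model_free API and a hypothetical model path, and n_rpc_layers only takes effect if an RPC device ends up in the model's device list.

#include "llama.h"

int main() {
    // n_rpc_layers defaults to 0 (see llama_model_default_params in the llama-model.cpp diff below)
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 16; // layers offloaded to local GPU backends
    mparams.n_rpc_layers = 8;  // layers delegated to RPC devices (new field)

    // "model.gguf" is a placeholder path
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}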

src/llama-model.cpp

Lines changed: 18 additions & 1 deletion
@@ -1256,16 +1256,22 @@ void llama_model::load_vocab(llama_model_loader & ml) {
 bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto & split_mode = params.split_mode;
     const auto & n_gpu_layers = params.n_gpu_layers;
+    const auto & n_rpc_layers = params.n_rpc_layers;
     const auto & use_mlock = params.use_mlock;
     const auto & tensor_split = params.tensor_split;
 
     const int n_layer = hparams.n_layer;
 
     const bool use_mmap_buffer = true;
 
+    ggml_backend_dev_t rpc_dev = nullptr;
+
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
     for (auto * dev : devices) {
+        if (n_rpc_layers > 0 && rpc_dev == nullptr && std::string::npos != std::string(ggml_backend_dev_name(dev)).find("RPC[")) {
+            rpc_dev = dev;
+        }
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
         buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
@@ -1279,6 +1285,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         // default split, by free memory
         for (size_t i = 0; i < n_devices(); ++i) {
             ggml_backend_dev_t dev = devices[i];
+            if (dev == rpc_dev) {
+                // handled separately
+                splits[i] = 0;
+                continue;
+            }
             size_t total;
             size_t free;
             ggml_backend_dev_memory(dev, &free, &total);
@@ -1300,12 +1311,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+    const int i_rpc_start = std::max(i_gpu_start - n_rpc_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+        if (il < i_rpc_start || (il - i_gpu_start) >= act_gpu_layers) {
             LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
+        if (il < i_gpu_start) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(rpc_dev));
+            return {rpc_dev, &pimpl->gpu_buft_list.at(rpc_dev)};
+        }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
         LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
@@ -3760,6 +3776,7 @@ struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
+        /*.n_rpc_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
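
The get_layer_buft_list changes above split the layer range three ways: [0, i_rpc_start) stays on the CPU, [i_rpc_start, i_gpu_start) goes to the RPC device, and [i_gpu_start, n_layer) is split across the local GPUs by free memory. A small standalone sketch, not part of the commit, with hypothetical values (32 layers, -ngl 16, -nrl 8) and ignoring the act_gpu_layers clamp for the output layer:

#include <algorithm>
#include <cstdio>

// Standalone sketch of the layer partitioning done in llama_model::load_tensors().
// Hypothetical values: 32 layers, 16 offloaded to local GPUs (-ngl 16),
// 8 delegated to the RPC device (-nrl 8).
int main() {
    const int n_layer      = 32;
    const int n_gpu_layers = 16;
    const int n_rpc_layers = 8;

    // same arithmetic as in the patch
    const int i_gpu_start = std::max(n_layer - n_gpu_layers, 0);      // layers >= 16 go to local GPUs
    const int i_rpc_start = std::max(i_gpu_start - n_rpc_layers, 0);  // layers 8..15 go to the RPC device

    for (int il = 0; il < n_layer; ++il) {
        const char * where = (il < i_rpc_start) ? "CPU"
                           : (il < i_gpu_start) ? "RPC"
                           :                      "local GPU";
        printf("layer %2d -> %s\n", il, where);
    }
    return 0;
}

With these values, layers 0-7 stay on the CPU, layers 8-15 are delegated to the RPC device, and layers 16-31 are distributed across the local GPUs according to the free-memory split.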
