@@ -3418,22 +3418,25 @@ struct llama_lora_adapter {
     }
 };
 
-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
+static int llama_get_device_count(const llama_model & model) {
+    int count = (int) model.devices.size();
 
-    count = model.devices.size();
+#if defined(GGML_USE_RPC)
+    count += (int) model.rpc_servers.size();
+#endif
 
-#if defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
+#if defined(GGML_USE_METAL)
+    count += 1;
+#elif defined(GGML_USE_SYCL)
+    count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
+    count += ggml_backend_vk_get_device_count();
 #elif defined(GGML_USE_CANN)
-    return ggml_backend_cann_get_device_count();
-#endif
-#if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
+    count += ggml_backend_cann_get_device_count();
 #endif
+
     return count;
+
     GGML_UNUSED(model);
 }
 
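For context, the refactored counter accumulates devices from three sources: the backend registry, the configured RPC servers, and the compiled-in backend (a constant 1 for Metal, a per-backend device query otherwise). A minimal, self-contained sketch of the same accumulation pattern follows; model_stub and device_count_sketch are hypothetical stand-ins, not the real llama_model or llama_get_device_count.

#include <string>
#include <vector>

// hypothetical stand-in for llama_model: only the fields the counter reads
struct model_stub {
    std::vector<void *>      devices;     // devices discovered through the backend registry
    std::vector<std::string> rpc_servers; // RPC endpoints, counted in addition to local devices
};

// same accumulation order as the patched counter: registry devices first,
// then RPC servers, then whatever the compiled-in backend reports
static int device_count_sketch(const model_stub & model, int backend_device_count) {
    int count = (int) model.devices.size();
    count += (int) model.rpc_servers.size();
    count += backend_device_count; // e.g. 1 for Metal, a per-backend query for SYCL/Vulkan/CANN
    return count;
}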
@@ -3482,12 +3485,13 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
         const char * endpoint = model.rpc_servers[device].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
-    device = device - rpc_count;
+    device -= rpc_count;
 #endif
 
     if (device < (int)model.devices.size()) {
-        buft = ggml_backend_dev_buffer_type(model.devices[device]);
+        return ggml_backend_dev_buffer_type(model.devices[device]);
     }
+    device -= (int)model.devices.size();
 
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
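The buffer-type lookup above resolves a flat device index by peeling off one sub-range at a time: handle the index if it falls in the current range, otherwise subtract that range's size and fall through to the next one. Below is a standalone sketch of that pattern; the names and the RPC-first range order are illustrative assumptions, since the real order depends on which backends are compiled in.

#include <cstdio>

// returns a label for the range that a flat device index falls into
static const char * pick_range(int device, int rpc_count, int registry_count) {
    if (device < rpc_count) {
        return "rpc";          // index falls inside the RPC range
    }
    device -= rpc_count;       // re-base into the next range

    if (device < registry_count) {
        return "registry";     // index falls inside the registry-device range
    }
    device -= registry_count;  // re-base again for the backend-specific range

    return "backend-specific"; // Metal / SYCL / Vulkan / ... fallback
}

int main() {
    // with 2 RPC servers and 3 registry devices, global index 4 maps to the last registry device
    std::printf("%s\n", pick_range(4, 2, 3));
}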
@@ -6965,6 +6969,13 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
 
+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
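The added guard rejects an out-of-range main_gpu before any tensors are allocated, skipping the check for LLAMA_SPLIT_MODE_LAYER, which does not use main_gpu. A compilable sketch of the same check is shown below; format_msg is a plain snprintf wrapper standing in for llama.cpp's internal format() helper.

#include <cstdio>
#include <stdexcept>
#include <string>

// build the error message the same way the patch does, without llama.cpp's format()
static std::string format_msg(int main_gpu, int n_devices) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), "invalid value for main_gpu: %d (available devices: %d)", main_gpu, n_devices);
    return std::string(buf);
}

static void check_main_gpu(int main_gpu, int n_devices, bool split_by_layer) {
    // only split modes that actually use main_gpu are validated,
    // mirroring the split_mode != LLAMA_SPLIT_MODE_LAYER condition above
    if (n_devices > 0 && !split_by_layer && (main_gpu < 0 || main_gpu >= n_devices)) {
        throw std::runtime_error(format_msg(main_gpu, n_devices));
    }
}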
@@ -19291,30 +19302,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
-        }
-#endif
-
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE, only the main GPU backend is used
-            ggml_backend_dev_t main_dev = model->devices[model->main_gpu];
-            ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             for (auto * dev : model->devices) {
@@ -19327,6 +19328,26 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }
+
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
+#endif
 
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
@@ -19345,7 +19366,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19366,9 +19387,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19387,7 +19408,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19396,29 +19417,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-        // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-        for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-            ggml_backend_t backend = ggml_backend_cann_init(device);
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cann_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-    }
 #endif
 
 #ifdef GGML_USE_BLAS
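Across the hunks above, main_gpu starts as a global index and is re-based after each group of backends is initialized (registry devices, then RPC servers), so the remaining backend-specific init calls such as ggml_backend_vk_init() or ggml_backend_sycl_init() receive an index local to their own range. The example below shows only that arithmetic; the counts are assumed for illustration and no real backends are touched.

#include <cstdio>

int main() {
    int main_gpu       = 5; // global index chosen by the user, assumed for the example
    int registry_count = 3; // backends created from model->devices
    int rpc_count      = 2; // backends created from model->rpc_servers

    // after the registry-backed devices are initialized, shift the index past them
    if (main_gpu >= registry_count) {
        main_gpu -= registry_count;
    }
    // after the RPC backends are initialized, shift again
    if (main_gpu >= rpc_count) {
        main_gpu -= rpc_count;
    }

    // whatever remains is the index handed to the backend-specific init call
    std::printf("backend-local main_gpu: %d\n", main_gpu); // prints 0
}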