 //  - Creates n_parallel (--parallel) contexts per model
 //  - Runs inference in parallel on each context
 
+#include <array>
 #include <thread>
 #include <vector>
 #include <atomic>
@@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
     cparams.n_seq_max = 1;
 
     int dev_count = ggml_backend_dev_count();
-    int gpu_dev_count = 0;
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
     for (int i = 0; i < dev_count; ++i) {
         auto * dev = ggml_backend_dev_get(i);
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_dev_count++;
+            gpus.push_back({dev, nullptr});
         }
     }
+    const int gpu_dev_count = (int)gpus.size();
     const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
     // const int num_models = std::max(1, gpu_dev_count);
     const int num_contexts = std::max(1, params.n_parallel);
@@ -58,12 +60,12 @@ int main(int argc, char ** argv) {
 
         if (m < gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
-            mparams.main_gpu = m;
+            mparams.devices = gpus[m].data();
         } else if (m == gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
             mparams.main_gpu = -1; // CPU model
         } else {
-            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
         }
 
         llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
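For reference, a minimal standalone sketch (not part of the patch) of the device-selection pattern the diff introduces: each GPU gets its own single-entry, NULL-terminated device list (the trailing nullptr in the std::array is the terminator that llama_model_params::devices expects), and one model instance is loaded per GPU with LLAMA_SPLIT_MODE_NONE. The command-line handling and logging here are illustrative only.

#include "llama.h"
#include "ggml-backend.h"

#include <array>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    // one {device, nullptr} pair per GPU: llama_model_params::devices takes a
    // NULL-terminated list, so the trailing nullptr acts as the terminator
    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            gpus.push_back({dev, nullptr});
        }
    }

    // load one full copy of the model on each GPU
    std::vector<llama_model *> models;
    for (auto & gpu : gpus) {
        llama_model_params mparams = llama_model_default_params();
        mparams.split_mode = LLAMA_SPLIT_MODE_NONE; // no layer split: whole model on one device
        mparams.devices    = gpu.data();            // restrict offloading to this single GPU
        if (llama_model * model = llama_model_load_from_file(argv[1], mparams)) {
            models.push_back(model);
        }
    }

    fprintf(stderr, "loaded %zu model instance(s) on %zu GPU(s)\n", models.size(), gpus.size());

    for (llama_model * model : models) {
        llama_model_free(model);
    }
    llama_backend_free();
    return 0;
}

Storing {dev, nullptr} pairs rather than bare device handles is what lets gpus[m].data() be passed directly as the per-model device list.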
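And a sketch of the "n_parallel contexts per model, run in parallel" behaviour described in the header comment, assuming the current llama.cpp context and vocab API (llama_init_from_model, llama_model_get_vocab, llama_vocab_bos, llama_batch_get_one); the single-token decode is a placeholder for real per-context work, not the example's actual workload.

#include "llama.h"

#include <thread>
#include <vector>

// create one context on the given model and decode a single BOS token on it
static void run_one_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max = 1;

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (!ctx) {
        return;
    }

    llama_token bos = llama_vocab_bos(llama_model_get_vocab(model));
    llama_decode(ctx, llama_batch_get_one(&bos, 1));

    llama_free(ctx);
}

// one thread per context, all contexts sharing the same model
static void run_parallel(llama_model * model, int n_parallel) {
    std::vector<std::thread> workers;
    for (int i = 0; i < n_parallel; ++i) {
        workers.emplace_back(run_one_context, model);
    }
    for (auto & t : workers) {
        t.join();
    }
}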