@@ -52,6 +52,7 @@ thread_local int is_main_thread = 0;
5252// Command line parameter defaults
5353static int nThreads = 1 ;
5454static int nGpus = 1 ;
55+ static int nGpusVisible;
5556static size_t minBytes = 32 *1024 *1024 ;
5657static size_t maxBytes = 32 *1024 *1024 ;
5758static size_t stepBytes = 1 *1024 *1024 ;
@@ -432,7 +433,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
432433 int device;
433434 int rank = ((args->proc *args->nThreads + args->thread )*args->nGpus + i);
434435 NCCLCHECK (ncclCommCuDevice (args->comms [i], &device));
435- CUDACHECK (cudaSetDevice (device));
436+ CUDACHECK (cudaSetDevice (device % nGpusVisible ));
436437 void *data = in_place ? ((void *)((uintptr_t )args->recvbuffs [i] + args->recvInplaceOffset *rank)) : args->recvbuffs [i];
437438 TESTCHECK (CheckDelta (data , args->expected [i], count, type, args->deltaHost ));
438439 maxDelta = std::max (*(args->deltaHost ), maxDelta);
@@ -788,7 +789,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
788789 // will be done on the current GPU (by default : 0) and if the GPUs are in
789790 // exclusive mode those operations will fail.
790791 int gpuid = args->localRank *args->nThreads *args->nGpus + args->thread *args->nGpus ;
791- CUDACHECK (cudaSetDevice (gpuid));
792+ CUDACHECK (cudaSetDevice (gpuid % nGpusVisible ));
792793 TESTCHECK (ncclTestEngine.runTest (args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
793794 return testSuccess;
794795}
@@ -805,7 +806,7 @@ testResult_t threadInit(struct threadArgs* args) {
805806 for (int i=0 ; i<args->nGpus ; i++) {
806807 int rank = args->proc *args->nThreads *args->nGpus + args->thread *args->nGpus + i;
807808 int gpuid = args->localRank *args->nThreads *args->nGpus + args->thread *args->nGpus + i;
808- CUDACHECK (cudaSetDevice (gpuid));
809+ CUDACHECK (cudaSetDevice (gpuid % nGpusVisible ));
809810 NCCLCHECK (ncclCommInitRank (args->comms +i, nranks, args->ncclId , rank));
810811 }
811812 NCCLCHECK (ncclGroupEnd ());
@@ -886,6 +887,8 @@ int main(int argc, char* argv[]) {
886887 {}
887888 };
888889
890+ CUDACHECK (cudaGetDeviceCount (&nGpusVisible));
891+
889892 while (1 ) {
890893 int c;
891894 c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:" , longopts, &longindex);
@@ -899,6 +902,10 @@ int main(int argc, char* argv[]) {
899902 break ;
900903 case ' g' :
901904 nGpus = strtol (optarg, NULL , 0 );
905+ if (nGpus > nGpusVisible) {
906+ fprintf (stderr, " invalid number of GPUs specified (%d), only for %d GPUs exist\n " , nGpus, nGpusVisible);
907+ return -1 ;
908+ }
902909 break ;
903910 case ' b' :
904911 parsed = parsesize (optarg);
@@ -1042,7 +1049,7 @@ testResult_t run() {
10421049 int cudaDev = localRank*nThreads*nGpus+i;
10431050 int rank = proc*nThreads*nGpus+i;
10441051 cudaDeviceProp prop;
1045- CUDACHECK (cudaGetDeviceProperties (&prop, cudaDev));
1052+ CUDACHECK (cudaGetDeviceProperties (&prop, cudaDev % nGpus ));
10461053 len += snprintf (line+len, MAX_LINE-len, " # Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n " ,
10471054 rank, getpid (), hostname, cudaDev, prop.pciBusID , prop.name );
10481055 maxMem = std::min (maxMem, prop.totalGlobalMem );
@@ -1086,7 +1093,7 @@ testResult_t run() {
10861093 ncclTestEngine.getBuffSize (&sendBytes, &recvBytes, (size_t )maxBytes, (size_t )nProcs*nGpus*nThreads);
10871094
10881095 for (int i=0 ; i<nGpus*nThreads; i++) {
1089- CUDACHECK (cudaSetDevice (localRank*nThreads*nGpus+i));
1096+ CUDACHECK (cudaSetDevice (( localRank*nThreads*nGpus+i) % nGpusVisible ));
10901097 TESTCHECK (AllocateBuffs (sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t )maxBytes, nProcs*nThreads*nGpus));
10911098 CUDACHECK (cudaStreamCreateWithFlags (streams+i, cudaStreamNonBlocking));
10921099 }
@@ -1101,7 +1108,7 @@ testResult_t run() {
11011108 } else {
11021109 NCCLCHECK (ncclGroupStart ());
11031110 for (int i=0 ; i<nGpus*nThreads; i++) {
1104- CUDACHECK (cudaSetDevice (localRank*nThreads*nGpus+i));
1111+ CUDACHECK (cudaSetDevice (( localRank*nThreads*nGpus+i) % nGpusVisible ));
11051112 NCCLCHECK (ncclCommInitRank (comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
11061113 }
11071114 NCCLCHECK (ncclGroupEnd ());
0 commit comments