Skip to content

Commit 1c00538

Browse files
Greg Inozemtsevryanamazon
authored andcommitted
Support setting CUDA_VISIBLE_DEVICES env variable
1 parent 8274cb4 commit 1c00538

File tree

11 files changed

+43
-16
lines changed

11 files changed

+43
-16
lines changed

src/all_gather.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
3333
int nranks = args->nProcs*args->nThreads*args->nGpus;
34+
int nGpusVisible;
35+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3436

3537
for (int i=0; i<args->nGpus; i++) {
3638
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
37-
CUDACHECK(cudaSetDevice(gpuid));
39+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3840
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3941
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4042
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];

src/all_reduce.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
3333
int nranks = args->nProcs*args->nThreads*args->nGpus;
34+
int nGpusVisible;
35+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3436

3537
for (int i=0; i<args->nGpus; i++) {
3638
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
37-
CUDACHECK(cudaSetDevice(gpuid));
39+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3840
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3941
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4042
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];

src/alltoall.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,13 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
3333
int nranks = args->nProcs*args->nThreads*args->nGpus;
34+
int nGpusVisible;
35+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3436

3537
for (int i=0; i<args->nGpus; i++) {
3638
char* str = getenv("NCCL_TESTS_DEVICE");
3739
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
38-
CUDACHECK(cudaSetDevice(gpuid));
40+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3941
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
4042
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4143
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];

src/broadcast.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@ void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
3030
testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
33+
int nGpusVisible;
34+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3335

3436
for (int i=0; i<args->nGpus; i++) {
3537
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
36-
CUDACHECK(cudaSetDevice(gpuid));
38+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3739
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3840
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
3941
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];

src/common.cu

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ thread_local int is_main_thread = 0;
5252
// Command line parameter defaults
5353
static int nThreads = 1;
5454
static int nGpus = 1;
55+
static int nGpusVisible;
5556
static size_t minBytes = 32*1024*1024;
5657
static size_t maxBytes = 32*1024*1024;
5758
static size_t stepBytes = 1*1024*1024;
@@ -432,7 +433,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
432433
int device;
433434
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
434435
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
435-
CUDACHECK(cudaSetDevice(device));
436+
CUDACHECK(cudaSetDevice(device % nGpusVisible));
436437
void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
437438
TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost));
438439
maxDelta = std::max(*(args->deltaHost), maxDelta);
@@ -788,7 +789,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
788789
// will be done on the current GPU (by default : 0) and if the GPUs are in
789790
// exclusive mode those operations will fail.
790791
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
791-
CUDACHECK(cudaSetDevice(gpuid));
792+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
792793
TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
793794
return testSuccess;
794795
}
@@ -805,7 +806,7 @@ testResult_t threadInit(struct threadArgs* args) {
805806
for (int i=0; i<args->nGpus; i++) {
806807
int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
807808
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
808-
CUDACHECK(cudaSetDevice(gpuid));
809+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
809810
NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
810811
}
811812
NCCLCHECK(ncclGroupEnd());
@@ -886,6 +887,8 @@ int main(int argc, char* argv[]) {
886887
{}
887888
};
888889

890+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
891+
889892
while(1) {
890893
int c;
891894
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
@@ -899,6 +902,10 @@ int main(int argc, char* argv[]) {
899902
break;
900903
case 'g':
901904
nGpus = strtol(optarg, NULL, 0);
905+
if (nGpus > nGpusVisible) {
906+
fprintf(stderr, "invalid number of GPUs specified (%d), only for %d GPUs exist\n", nGpus, nGpusVisible);
907+
return -1;
908+
}
902909
break;
903910
case 'b':
904911
parsed = parsesize(optarg);
@@ -1042,7 +1049,7 @@ testResult_t run() {
10421049
int cudaDev = localRank*nThreads*nGpus+i;
10431050
int rank = proc*nThreads*nGpus+i;
10441051
cudaDeviceProp prop;
1045-
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
1052+
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev % nGpus));
10461053
len += snprintf(line+len, MAX_LINE-len, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
10471054
rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
10481055
maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -1086,7 +1093,7 @@ testResult_t run() {
10861093
ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);
10871094

10881095
for (int i=0; i<nGpus*nThreads; i++) {
1089-
CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
1096+
CUDACHECK(cudaSetDevice((localRank*nThreads*nGpus+i) % nGpusVisible));
10901097
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus));
10911098
CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
10921099
}
@@ -1101,7 +1108,7 @@ testResult_t run() {
11011108
} else {
11021109
NCCLCHECK(ncclGroupStart());
11031110
for (int i=0; i<nGpus*nThreads; i++) {
1104-
CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
1111+
CUDACHECK(cudaSetDevice((localRank*nThreads*nGpus+i) % nGpusVisible));
11051112
NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
11061113
}
11071114
NCCLCHECK(ncclGroupEnd());

src/gather.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
3333
int nranks = args->nProcs*args->nThreads*args->nGpus;
34+
int nGpusVisible;
35+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3436

3537
for (int i=0; i<args->nGpus; i++) {
3638
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
37-
CUDACHECK(cudaSetDevice(gpuid));
39+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3840
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3941
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4042
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];

src/hypercube.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
3434
size_t sendcount = args->sendBytes / wordSize(type);
3535
size_t recvcount = args->expectedBytes / wordSize(type);
3636
int nranks = args->nProcs*args->nThreads*args->nGpus;
37+
int nGpusVisible;
38+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3739

3840
for (int i=0; i<args->nGpus; i++) {
3941
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
40-
CUDACHECK(cudaSetDevice(gpuid));
42+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
4143
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
4244
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4345
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];

src/reduce.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
3333
int nranks = args->nProcs*args->nThreads*args->nGpus;
34+
int nGpusVisible;
35+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3436

3537
for (int i=0; i<args->nGpus; i++) {
3638
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
37-
CUDACHECK(cudaSetDevice(gpuid));
39+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3840
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3941
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4042
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];

src/reduce_scatter.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,12 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
3333
int nranks = args->nProcs*args->nThreads*args->nGpus;
34+
int nGpusVisible;
35+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3436

3537
for (int i=0; i<args->nGpus; i++) {
3638
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
37-
CUDACHECK(cudaSetDevice(gpuid));
39+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3840
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3941
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
4042
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];

src/scatter.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@ void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *param
3030
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
3131
size_t sendcount = args->sendBytes / wordSize(type);
3232
size_t recvcount = args->expectedBytes / wordSize(type);
33+
int nGpusVisible;
34+
CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
3335

3436
for (int i=0; i<args->nGpus; i++) {
3537
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
36-
CUDACHECK(cudaSetDevice(gpuid));
38+
CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
3739
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
3840
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
3941
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];

0 commit comments

Comments
 (0)