-
Notifications
You must be signed in to change notification settings - Fork 358
Expand file tree
/
Copy pathhypercube.cu
More file actions
121 lines (103 loc) · 4.23 KB
/
hypercube.cu
File metadata and controls
121 lines (103 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "cuda_runtime.h"
#include "common.h"
// Granularity to which the per-rank count is rounded down.
#define ALIGN 4

// Derives the per-rank and total counts for the HyperCube test from a
// requested total `count` spread across `nranks` ranks.
//
// The per-rank count is the largest multiple of ALIGN such that
// perRank * nranks <= count. Outputs:
//   *sendcount         = perRank
//   *recvcount         = perRank * nranks
//   *sendInplaceOffset = perRank
//   *recvInplaceOffset = 0
//   *paramcount        = perRank
// `nranks` must be non-zero (it is used as a divisor).
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  // Number of whole ALIGN-sized groups every rank can receive.
  const size_t groupsPerRank = count/(ALIGN*nranks);
  const size_t perRank = groupsPerRank*ALIGN;

  *sendcount = perRank;
  *recvcount = perRank*nranks;
  *sendInplaceOffset = perRank;
  *recvInplaceOffset = 0;
  *paramcount = perRank;
}
// Prepares the send, receive and expected buffers of every GPU driven by the
// calling thread for one repetition of the HyperCube test.
//
// args     - per-thread test state (buffers, byte sizes, process/thread/GPU layout).
// type     - element type; used to convert args->sendBytes into an element count.
// op, root - not used by this function.
// rep      - repetition index; mixed into the value passed to InitData (33*rep + rank).
// in_place - non-zero: the send data is written into this rank's slot of the
//            receive buffer instead of the separate send buffer.
//
// Returns testSuccess, or whatever the CUDACHECK/TESTCHECK macros propagate on failure.
testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  int nGpusVisible;
  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

  for (int i=0; i<args->nGpus; i++) {
    // Local device index for this (localRank, thread, i), wrapped onto the
    // devices that are actually visible.
    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));

    // Global rank of this GPU across all processes and threads.
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);

    // Clear the whole receive buffer before filling in this rank's contribution.
    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));

    // In-place runs read the send data from slot `rank` of the receive buffer.
    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
    // NOTE(review): 33*rep + rank is presumably the seed of InitData's pattern - confirm in InitData.
    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));

    // Expected result: slot j holds what rank j generates for this repetition.
    for (int j=0; j<nranks; j++) {
      TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
    }
    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
}
// Converts a timed run into bandwidth figures.
//
// The volume is count * typesize * (nranks - 1), evaluated in integer
// arithmetic, then divided by 1e9 and by the elapsed time `sec`.
// *algBw receives that value; *busBw receives the same value scaled by a
// correction factor, which is 1 for this test.
void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const size_t volume = count * typesize * (nranks - 1);
  const double rate = (double)volume / 1.0E9 / sec;
  const double busFactor = 1;

  *algBw = rate;
  *busBw = rate * busFactor;
}
// Enqueues a hypercube all-gather on `stream`, built from ncclSend/ncclRecv pairs.
//
// sendbuff - this rank's contribution (`count` elements of `type`).
// recvbuff - destination, addressed as consecutive slots of count*wordSize(type)
//            bytes, one per rank.
// op, root - not used by this function.
//
// The rank's own data is first copied into its slot of recvbuff (skipped when
// sendbuff already points there). Then, for mask = 1, 2, 4, ... < nRanks, the
// rank exchanges `mask` consecutive slots with peer rank^mask: it sends the
// slots starting at rank & ~(mask-1) and receives the slots starting at that
// index XOR mask.
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));

  char* const src = (char*)sendbuff;
  char* const dst = (char*)recvbuff;
  const size_t slotBytes = count * wordSize(type);

  // Place the local contribution into its own slot unless it is already there.
  char* const ownSlot = dst + rank*slotBytes;
  if (ownSlot != src) {
    CUDACHECK(cudaMemcpyAsync(ownSlot, src, slotBytes, cudaMemcpyDeviceToDevice, stream));
  }

  // Hypercube AllGather: the exchanged block doubles at every step.
  for (int mask = 1; mask < nRanks; mask <<= 1) {
    const int peer = rank ^ mask;
    const int sendSlot = rank & ~(mask-1);  // first slot of the block gathered so far
    const int recvSlot = sendSlot ^ mask;   // first slot of the peer's block
    const size_t blockElems = count*mask;

    NCCLCHECK(ncclGroupStart());
    NCCLCHECK(ncclSend(dst + sendSlot*slotBytes, blockElems, type, peer, comm, stream));
    NCCLCHECK(ncclRecv(dst + recvSlot*slotBytes, blockElems, type, peer, comm, stream));
    NCCLCHECK(ncclGroupEnd());
  }
  return testSuccess;
}
// Descriptor tying the HyperCube callbacks together. The members are given
// positionally: a label string followed by the four HyperCube functions
// (the field names of struct testColl are declared outside this file).
// HyperCubeRunTest installs this descriptor through args->collTest.
struct testColl hyperCubeTest = {
"HyperCube",
HyperCubeGetCollByteCount,  // count/offset computation
HyperCubeInitData,          // buffer initialisation
HyperCubeGetBw,             // bandwidth computation
HyperCubeRunColl            // enqueues the send/recv exchange
};
// Reports the send and receive counts for a given total `count` and rank
// count by delegating to HyperCubeGetCollByteCount.
//
// sendcount, recvcount - outputs, filled by HyperCubeGetCollByteCount.
// count, nranks        - forwarded unchanged.
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  // Outputs HyperCubeGetCollByteCount requires but this caller discards.
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
// Runs the timed HyperCube test for one data type, or for every entry of
// test_types when `type` is -1.
//
// args           - per-thread test state; its collTest is pointed at hyperCubeTest.
// type, typeName - data type to run and its display name (used when type != -1).
// root, op, opName - not used; TimeTest is always called with (ncclRedOp_t)0, "" and -1.
//
// When the total rank count (nProcs * nThreads * nGpus) is zero or not a
// power of two, a message is printed and nothing is timed.
// Returns testSuccess, or whatever TESTCHECK propagates on failure.
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &hyperCubeTest;

  // Guard: only power-of-two rank counts are run.
  const int totalRanks = args->nProcs*args->nThreads*args->nGpus;
  const bool isPowerOfTwo = totalRanks != 0 && (totalRanks & (totalRanks - 1)) == 0;
  if (!isPowerOfTwo) {
    printf("nRanks %d is not a power of 2, skipping\n", totalRanks);
    return testSuccess;
  }

  // Default to the full type table; narrow to the single requested type if one was given.
  ncclDataType_t *typesToRun = test_types;
  const char **namesToRun = test_typenames;
  int numTypes = test_typenum;
  if ((int)type != -1) {
    typesToRun = &type;
    namesToRun = &typeName;
    numTypes = 1;
  }

  for (int t = 0; t < numTypes; ++t) {
    TESTCHECK(TimeTest(args, typesToRun[t], namesToRun[t], (ncclRedOp_t)0, "", -1));
  }
  return testSuccess;
}
// Engine descriptor for this test, initialised positionally with the
// buffer-size function and the test runner (field names of struct testEngine
// are declared outside this file).
struct testEngine hyperCubeEngine = {
HyperCubeGetBuffSize,
HyperCubeRunTest
};
// GCC-style weak alias: defines ncclTestEngine as a weak symbol referring to
// hyperCubeEngine. NOTE(review): presumably the shared test driver reads
// ncclTestEngine to find the engine - confirm in the common code.
#pragma weak ncclTestEngine=hyperCubeEngine