
Commit 7287630

Repair nccl op test (#8575)
* fix nccl op unit test
* fix build error
* format code
* refine nccl related unit test
* fix build error
* add setGPUData
* clean up
* follow comments
* rm test_nccl.cu
* follow comment
* rm wait
1 parent: ada82a3

6 files changed: +69 −261 lines

cmake/generic.cmake

Lines changed: 4 additions & 4 deletions
@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
   cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   add_executable(${TARGET_NAME} ${cc_test_SRCS})
   # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-  target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+  target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
   if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
     list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
   endif()
-  add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+  add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
   add_test(NAME ${TARGET_NAME}
            COMMAND ${TARGET_NAME} ${cc_test_ARGS}
            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
endfunction(nv_test)
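Note on the change above: adding glog to the default link line means any test built through cc_test or nv_test can use glog's logging macros without naming glog in its own DEPS; the reworked nccl_op_test below relies on this for its LOG(WARNING) guard. A minimal sketch of such a test source, for illustration only (LinkSmoke is a hypothetical name, not part of this commit):

#include <glog/logging.h>
#include <gtest/gtest.h>

TEST(LinkSmoke, GlogResolvesAtLinkTime) {
  // With glog on the default cc_test/nv_test link line, this symbol
  // resolves without a per-target glog dependency.
  LOG(WARNING) << "logging works in test binaries";
}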

paddle/fluid/operators/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
@@ -222,8 +222,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
-if(WITH_GPU)
-  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
-endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
+nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)

paddle/fluid/operators/nccl_op.cc

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

 namespace paddle {
 namespace operators {

paddle/fluid/operators/nccl_op_test.cu.cc

Lines changed: 64 additions & 99 deletions
@@ -14,19 +14,15 @@ limitations under the License. */

 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <algorithm>
 #include <memory>
 #include <mutex>
 #include <thread>
-#include <utility>
 #include <vector>

-#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-static std::vector<int> gpu_list;
-
 // test data amount
-const f::DDim kDims = {100, 100};
+const f::DDim kDims = {20, 20};

 // nccl op common tester, init communicator.
 class NCCLTester : public ::testing::Test {
  public:
   virtual void SetUp() override {
+    int count = p::GetCUDADeviceCount();
+    if (count <= 1) {
+      LOG(WARNING)
+          << "Cannot test multi-gpu nccl, because the CUDA device count is "
+          << count;
+      exit(0);
+    }
+    for (int i = 0; i < count; ++i) {
+      gpu_list_.emplace_back(i);
+    }
+
     paddle::platform::CPUPlace cpu_place;
-    for (size_t i = 0; i < gpu_list.size(); ++i) {
+    for (size_t i = 0; i < gpu_list_.size(); ++i) {
       p::CUDAPlace place(i);
-      dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
+      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
     }

     NCCLInitOp();
   }

   virtual void TearDown() override {
-    for (auto &device_context : dev_ctxs) {
+    for (auto &device_context : dev_ctxs_) {
       delete device_context;
     }
   }
@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
     std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

     op1->SetType("ncclInit");
+    op1->SetInput("parallel_scopes", {"p_scopes"});
     op1->SetOutput("Communicator", {"comm"});
-    op1->SetAttr("gpus", {gpu_list});

-    auto *var = g_scope.Var("comm");
+    auto *var = g_scope_.Var("comm");
     var->GetMutable<p::Communicator>();

+    auto *scope_var = g_scope_.Var("p_scopes");
+    auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
+    (*p_scopes).resize(gpu_list_.size());
+
     auto op = f::OpRegistry::CreateOp(*op1);
     VLOG(1) << "invoke NCCLInitOp.";
-    op->Run(g_scope, cpu_place);
+    op->Run(g_scope_, cpu_place);
     VLOG(1) << "NCCLInitOp finished.";
   }

+  int GetGPUData(int gpu_id) { return gpu_id + 42; }
+
   template <class T>
   void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
-    std::unique_lock<std::mutex> lk(mu);
+    std::unique_lock<std::mutex> lk(mu_);
     const f::OpDesc *op1 = &op_desc;

     p::CUDAPlace place(gpu_id);
-    auto &ctx = dev_ctxs.at(gpu_id);
+    auto &ctx = dev_ctxs_.at(gpu_id);

     auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
     auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

     if (!send_tensor->numel()) {
-      send_tensor->Resize(kDims);
       send_tensor->mutable_data<T>(kDims, place);

-      std::vector<T> send_vector(f::product(kDims), gpu_id);
+      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
       paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
-      ctx->Wait();
       VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
     }

@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
   }

  public:
-  std::vector<p::DeviceContext *> dev_ctxs;
-  f::Scope g_scope;
-  std::mutex mu;
+  std::vector<p::DeviceContext *> dev_ctxs_;
+  f::Scope g_scope_;
+  std::mutex mu_;
+  std::vector<int> gpu_list_;
 };

 // ncclInitOp with desc
-TEST(NCCL, ncclInitOp) {
-  std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
-
-  op_desc->SetType("ncclInit");
-  op_desc->SetOutput("Communicator", {"x1"});
-  op_desc->SetAttr("gpus", {gpu_list});
-
-  f::Scope g_scope;
-  paddle::platform::CPUPlace cpu_place;
-
-  auto *var = g_scope.Var("x1");
-  var->GetMutable<p::Communicator>();
-
-  auto op = f::OpRegistry::CreateOp(*op_desc);
-  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, cpu_place);
-  VLOG(1) << "NCCLInitOp finished.";
-}
+TEST_F(NCCLTester, ncclInitOp) {}

 // ncclAllReduceOp with desc
 TEST_F(NCCLTester, ncclAllReduceOp) {
@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {

   std::vector<std::thread> ths;

-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }

-  for (size_t i = 0; i < gpu_list.size(); ++i) {
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
     ths[i].join();
   }

-  // check results
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+  float expected_result = 0.0;
+  for (int gpu_id : gpu_list_) {
+    expected_result = expected_result + GetGPUData(gpu_id);
+  }

   for (size_t i = 0; i < dev_scopes.size(); ++i) {
     p::CPUPlace cpu_place;
-    p::CUDAPlace gpu_place(gpu_list[i]);
+    p::CUDAPlace gpu_place(gpu_list_[i]);

     auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
     auto *rt = recv_tensor.data<float>();
@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     auto *ct = result_tensor->mutable_data<float>(cpu_place);

     paddle::memory::Copy(
-        cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
+        cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
         recv_tensor.numel() * sizeof(float),
-        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+        static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());

     for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], result, 1e-5);
+      ASSERT_NEAR(ct[j], expected_result, 1e-5);
     }
   }
 }
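For concreteness, the expected-value logic above can be checked in isolation. GetGPUData offsets each device's fill value by 42, so GPU 0 no longer contributes all zeros and a collective that silently leaves the receive buffer untouched can no longer pass by accident. A standalone sketch of the arithmetic the assertions perform, assuming a hypothetical two-GPU machine (not part of the commit):

#include <cassert>
#include <vector>

// Mirrors NCCLTester::GetGPUData from the test file above.
int GetGPUData(int gpu_id) { return gpu_id + 42; }

int main() {
  std::vector<int> gpu_list = {0, 1};  // hypothetical two-GPU host

  // ncclAllReduce (sum): every device should end up holding 42 + 43.
  float expected_result = 0.0f;
  for (int gpu_id : gpu_list) expected_result += GetGPUData(gpu_id);
  assert(expected_result == 85.0f);

  // ncclBcast from root 0: every device should hold the root's fill value.
  assert(GetGPUData(/*kRoot=*/0) == 42);
  return 0;
}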
@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {

   std::vector<std::thread> ths;

-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }

-  for (size_t i = 0; i < gpu_list.size(); ++i) {
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
     ths[i].join();
   }

-  // check results on
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+  float expected_result = 0.0;
+  for (int gpu_id : gpu_list_) {
+    expected_result = expected_result + GetGPUData(gpu_id);
+  }

   p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list[kRoot]);
+  p::CUDAPlace gpu_place(gpu_list_[kRoot]);

   auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
   auto *ct = result_tensor->mutable_data<float>(cpu_place);

   paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
       recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
+      static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());

   for (int64_t j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], result, 1e-5);
+    ASSERT_NEAR(ct[j], expected_result, 1e-5);
   }
 }

@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {

   std::vector<std::thread> ths;

-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }

-  for (size_t i = 0; i < gpu_list.size(); ++i) {
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
     ths[i].join();
   }

   const int idx = 1;
-  // check results on
-  float result = kRoot;
+  float result = GetGPUData(kRoot);

   p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list[idx]);
+  p::CUDAPlace gpu_place(gpu_list_[idx]);

   auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
   auto *ct = result_tensor->mutable_data<float>(cpu_place);

   paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
       recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
+      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

   for (int64_t j = 0; j < f::product(kDims); ++j) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }
-
-int main(int argc, char **argv) {
-  // FIXME(tonyyang-svail):
-  // Due to the driver issue on our CI, disable for now
-  return 0;
-  const int dev_count = p::GetCUDADeviceCount();
-  if (dev_count <= 1) {
-    LOG(WARNING)
-        << "Cannot test multi-gpu nccl, because the CUDA device count is "
-        << dev_count;
-    return 0;
-  }
-
-  std::vector<paddle::platform::Place> places;
-
-  places.emplace_back(paddle::platform::CPUPlace());
-  int count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; ++i) {
-    places.emplace_back(paddle::platform::CUDAPlace(i));
-    gpu_list.emplace_back(i);
-  }
-
-  VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Init(places);
-
-  testing::InitGoogleTest(&argc, argv);
-
-  // device context should be release before scope.
-  // otherwise driver will down.
-  return RUN_ALL_TESTS();
-}
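The deleted main() above had been short-circuited to return 0, which disabled every test in the binary; its device-count guard now runs in NCCLTester::SetUp, and the entry point comes from paddle_gtest_main, which cc_test/nv_test link by default (see the generic.cmake change). A minimal sketch of what such a shared entry point presumably looks like — an assumption about paddle_gtest_main, not code from this commit:

#include <gtest/gtest.h>

// One shared main() for every test binary, so individual test files no
// longer carry (or accidentally disable) their own entry point.
int main(int argc, char **argv) {
  testing::InitGoogleTest(&argc, argv);
  // Framework-wide setup (flags, device contexts) would be initialized here.
  return RUN_ALL_TESTS();
}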

paddle/fluid/platform/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_

 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
-nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

 cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
