Skip to content

Commit eb83abe

Browse files
authored
Add DGC (Deep Gradient Compression) interface. (#15841)
1 parent b1d2605 commit eb83abe

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+1363
-100
lines changed

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,12 @@ if(WITH_GPU)
193193
include(tensorrt)
194194
include(anakin_subgraph)
195195
endif()
196+
197+
if(WITH_GPU AND NOT WIN32)
198+
message(STATUS "add dgc lib.")
199+
include(external/dgc)
200+
endif()
201+
196202
if(WITH_MKL OR WITH_MKLML)
197203
include(external/anakin)
198204
elseif()

cmake/external/dgc.cmake

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
INCLUDE(ExternalProject)
16+
17+
SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc")
18+
SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
19+
SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
20+
SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
21+
INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
22+
23+
ExternalProject_Add(
24+
extern_dgc
25+
${EXTERNAL_PROJECT_LOG_ARGS}
26+
GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
27+
GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
28+
SOURCE_DIR "${DGC_SOURCES_DIR}"
29+
CONFIGURE_COMMAND ""
30+
BUILD_COMMAND cd collective && make -j
31+
INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc
32+
&& cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
33+
&& cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
34+
BUILD_IN_SOURCE 1
35+
)
36+
37+
ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL)
38+
SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
39+
ADD_DEPENDENCIES(dgc extern_dgc)
40+
41+
LIST(APPEND external_project_dependencies dgc)
42+

cmake/inference_lib.cmake

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32)
131131
)
132132
endif ()
133133

134+
if (WITH_GPU AND NOT WIN32)
135+
set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc")
136+
copy(dgc_lib
137+
SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include
138+
DSTS ${dgc_dir} ${dgc_dir}
139+
DEPS dgc)
140+
endif()
141+
142+
134143
if (WITH_MKLDNN)
135144
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
136145
copy(mkldnn_lib

cmake/operators.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ function(op_library TARGET)
110110
# Define operators that don't need pybind here.
111111
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
112112
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
113-
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
113+
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op")
114114
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
115115
set(pybind_flag 1)
116116
endif()

paddle/fluid/API.spec

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,11 @@ paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['sel
483483
paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
484484
paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
485485
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
486+
paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
487+
paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
488+
paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
489+
paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
490+
paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
486491
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
487492
paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
488493
paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ endif()
2323

2424
if(WITH_GPU)
2525
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
26-
dynload_cuda variable_visitor)
26+
dynload_cuda variable_visitor dgc)
2727
nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
2828
dynload_cuda variable_visitor)
2929
if(WITH_DISTRIBUTE)

paddle/fluid/framework/details/all_reduce_deps_pass.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
8686
}
8787
}
8888

89-
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
89+
VLOG(10) << "dist_ops size:" << dist_ops.size()
90+
<< ", outputs size:" << vars.size() << ", ops size:" << ops.size();
9091

9192
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
9293
OpHandleBase* op2) {
@@ -99,6 +100,10 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
99100
auto l_it = vars.find(i0->name());
100101
auto r_it = vars.find(i1->name());
101102

103+
PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
104+
"can't find var's name %s and %s in opdesc", i0->name(),
105+
i1->name());
106+
102107
if (l_it->second < r_it->second) return true;
103108

104109
if (l_it->second == r_it->second) {

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 197 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@
1616
#include "paddle/fluid/framework/details/container_cast.h"
1717
#include "paddle/fluid/framework/details/reduce_and_gather.h"
1818
#include "paddle/fluid/framework/details/variable_visitor.h"
19+
#include "paddle/fluid/framework/operator.h"
20+
21+
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
22+
#include "dgc/dgc.h"
23+
#endif
24+
25+
#include "paddle/fluid/platform/gpu_info.h"
1926
#include "paddle/fluid/platform/profiler.h"
2027

2128
// asynchronous nccl allreduce or synchronous issue:
@@ -33,11 +40,14 @@ namespace details {
3340
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
3441
const std::vector<Scope *> &local_scopes,
3542
const std::vector<platform::Place> &places,
36-
const platform::NCCLContextMap *ctxs)
43+
const platform::NCCLContextMap *ctxs,
44+
bool is_encoded, int nranks)
3745
: OpHandleBase(node),
3846
local_scopes_(local_scopes),
3947
places_(places),
40-
nccl_ctxs_(ctxs) {
48+
nccl_ctxs_(ctxs),
49+
is_encoded_(is_encoded),
50+
nranks_(nranks) {
4151
if (nccl_ctxs_) {
4252
for (auto &p : places_) {
4353
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
@@ -51,7 +61,185 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
5161
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
5262
#endif
5363

64+
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
65+
void AllReduceOpHandle::RunImplEncoded() {
66+
platform::RecordEvent record_event(Name());
67+
68+
WaitInputVarGenerated();
69+
70+
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
71+
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
72+
PADDLE_ENFORCE_EQ(
73+
in_var_handles.size(), places_.size(),
74+
"The NoDummyInputSize should be equal to the number of places.");
75+
PADDLE_ENFORCE_EQ(
76+
in_var_handles.size(), out_var_handles.size(),
77+
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
78+
79+
std::vector<const LoDTensor *> ins;
80+
std::vector<LoDTensor *> outs;
81+
int k = -1;
82+
for (size_t i = 0; i < local_scopes_.size(); ++i) {
83+
auto &local_scope =
84+
local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
85+
auto original_name =
86+
paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
87+
auto encode_var_name = original_name + g_dgc_encoded;
88+
auto *in_var = local_scope->FindVar(encode_var_name);
89+
PADDLE_ENFORCE_NOT_NULL(in_var);
90+
auto &in = in_var->Get<LoDTensor>();
91+
ins.emplace_back(&in);
92+
93+
auto *out = local_scope->FindVar(out_var_handles[i]->name())
94+
->GetMutable<LoDTensor>();
95+
outs.emplace_back(out);
96+
97+
if (k < 0) {
98+
k = GetKValue(in_var_handles[i]->name());
99+
}
100+
}
101+
102+
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
103+
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
104+
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
105+
106+
int dtype = -1;
107+
size_t in_numel = 0;
108+
size_t out_numel = 0;
109+
PADDLE_ENFORCE(nranks_ > 1);
110+
std::vector<std::function<void()>> all_reduce_calls;
111+
112+
for (size_t i = 0; i < local_scopes_.size(); ++i) {
113+
auto &place = places_[i];
114+
auto &in = *ins[i];
115+
void *in_tensor_buf = const_cast<void *>(in.data<void>());
116+
117+
auto &out = *outs[i];
118+
float *out_tensor_buf = out.data<float>();
119+
120+
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
121+
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
122+
PADDLE_ENFORCE(in_numel % 2 == 0);
123+
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
124+
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
125+
126+
int dev_id = boost::get<platform::CUDAPlace>(place).device;
127+
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
128+
auto stream = nccl_ctx.stream();
129+
auto comm = nccl_ctx.comm_;
130+
131+
auto &allocator =
132+
platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
133+
int encode_size = 2 * k * sizeof(int);
134+
// dgc use ncclAllGather to get all the encoded data
135+
// so the buffer need nranks.
136+
int buf_size = nranks_ * encode_size;
137+
auto tmp_ious_data = allocator.Allocate(buf_size);
138+
void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
139+
140+
VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
141+
<< ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
142+
<< ", k:" << k << ", place:" << place << ", dtype:" << dtype;
143+
144+
all_reduce_calls.emplace_back([=] {
145+
PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
146+
in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
147+
stream));
148+
});
149+
}
150+
151+
this->RunAndRecordEvent([&] {
152+
if (all_reduce_calls.size() == 1UL) {
153+
// Do not use NCCLGroup when manage NCCL by per thread per device
154+
all_reduce_calls[0]();
155+
} else {
156+
platform::NCCLGroupGuard guard;
157+
for (auto &call : all_reduce_calls) {
158+
call();
159+
}
160+
}
161+
});
162+
163+
if (FLAGS_sync_nccl_allreduce) {
164+
for (auto &p : places_) {
165+
int dev_id = boost::get<platform::CUDAPlace>(p).device;
166+
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
167+
auto stream = nccl_ctx.stream();
168+
cudaError_t e_sync = cudaStreamSynchronize(stream);
169+
if (e_sync != 0) {
170+
LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
171+
}
172+
173+
cudaError_t e_get = cudaGetLastError();
174+
if (e_get != 0) {
175+
LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get)
176+
<< " errno:" << e_get;
177+
}
178+
}
179+
}
180+
}
181+
182+
int AllReduceOpHandle::GetKValue(const std::string &grad_name) {
183+
auto original_name = paddle::framework::GradOriginalVarName(grad_name);
184+
auto var_name = original_name + g_dgc_k;
185+
PADDLE_ENFORCE(local_scopes_.size() > 0);
186+
187+
auto *scope = local_scopes_[0];
188+
auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
189+
auto var = local_scope->FindVar(var_name);
190+
PADDLE_ENFORCE_NOT_NULL(var);
191+
auto tensor = var->Get<LoDTensor>().data<float>();
192+
return *tensor;
193+
}
194+
#endif
195+
196+
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
197+
bool AllReduceOpHandle::IsEncoded() {
198+
if (!is_encoded_) {
199+
return false;
200+
}
201+
auto counter_name = g_dgc_counter_name;
202+
auto step_name = g_dgc_rampup_begin_step;
203+
PADDLE_ENFORCE(local_scopes_.size() > 0);
204+
205+
auto *scope = local_scopes_[0];
206+
auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
207+
auto count_var = local_scope->FindVar(counter_name);
208+
auto step_var = local_scope->FindVar(step_name);
209+
if (count_var == nullptr || step_var == nullptr) {
210+
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
211+
step_var);
212+
}
213+
214+
float count = *count_var->Get<LoDTensor>().data<float>();
215+
float step = *step_var->Get<LoDTensor>().data<float>();
216+
if (static_cast<int>(count) < static_cast<int>(step)) {
217+
VLOG(10) << "in all_reduce currentstep:" << count
218+
<< " < rampup_begin_step:" << step
219+
<< " so not use sparse all reduce";
220+
return false;
221+
}
222+
223+
return true;
224+
}
225+
#else
226+
bool AllReduceOpHandle::IsEncoded() { return false; }
227+
#endif
228+
54229
void AllReduceOpHandle::RunImpl() {
230+
if (!IsEncoded()) {
231+
RunImplNormal();
232+
return;
233+
}
234+
235+
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
236+
RunImplEncoded();
237+
#else
238+
PADDLE_THROW("Not compiled with CUDA");
239+
#endif
240+
}
241+
242+
void AllReduceOpHandle::RunImplNormal() {
55243
platform::RecordEvent record_event(Name());
56244

57245
WaitInputVarGenerated();
@@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() {
72260
auto &lod_tensor =
73261
local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
74262
lod_tensors.emplace_back(&lod_tensor);
263+
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
264+
<< ", out_name:" << out_var_handles[i]->name();
75265
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
76266
"The name of input and output should be equal.");
77267
}
@@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() {
99289
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
100290
auto stream = nccl_ctx.stream();
101291
auto comm = nccl_ctx.comm_;
292+
293+
VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel
294+
<< ", dev_id:" << dev_id << ", dtype:" << dtype
295+
<< ", place:" << p;
296+
102297
all_reduce_calls.emplace_back([=] {
103298
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
104299
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
105300
comm, stream));
106301
});
107302
}
108-
109303
this->RunAndRecordEvent([&] {
110304
if (all_reduce_calls.size() == 1UL) {
111305
// Do not use NCCLGroup when manage NCCL by per thread per device

0 commit comments

Comments
 (0)