Skip to content

Commit e39adc8

Browse files
committed
add reduce op handle
1 parent 494c262 commit e39adc8

File tree

7 files changed

+630
-27
lines changed

7 files changed

+630
-27
lines changed

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,18 @@ else()
1717
endif()
1818
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
1919
scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
20+
2021
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
2122
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
2223
simple_threadpool device_context)
2324

2425
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
2526
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
27+
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
2628

2729
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
2830
device_context broadcast_op_handle)
2931
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
3032
device_context gather_op_handle)
33+
cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
34+
device_context reduce_op_handle)

paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
// limitations under the License.
1414

1515
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
16-
1716
#include <algorithm>
17+
#include "paddle/fluid/framework/details/reduce_util.h"
1818

1919
namespace paddle {
2020
namespace framework {
@@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
2929
}
3030
}
3131

32-
struct ReduceLoDTensor {
33-
const std::vector<LoDTensor> &src_tensors_;
34-
LoDTensor &dst_tensor_;
35-
36-
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
37-
: src_tensors_(src), dst_tensor_(*dst) {}
38-
39-
template <typename T>
40-
void operator()() const {
41-
PADDLE_ENFORCE(!src_tensors_.empty());
42-
auto &t0 = src_tensors_[0];
43-
PADDLE_ENFORCE_NE(t0.numel(), 0);
44-
dst_tensor_.Resize(t0.dims());
45-
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
46-
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
47-
48-
for (size_t i = 1; i < src_tensors_.size(); ++i) {
49-
auto &t = src_tensors_[i];
50-
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
51-
PADDLE_ENFORCE_EQ(t.type(), t0.type());
52-
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
53-
[](T a, T b) -> T { return a + b; });
54-
}
55-
}
56-
};
57-
5832
void NCCLAllReduceOpHandle::RunImpl() {
5933
if (inputs_.size() == 1) {
6034
return; // No need to all reduce when GPU count = 1;
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
#include <algorithm>
17+
#include <map>
18+
#include <vector>
19+
#include "paddle/fluid/framework/details/reduce_and_gather.h"
20+
#include "paddle/fluid/framework/lod_tensor.h"
21+
#include "paddle/fluid/framework/selected_rows.h"
22+
namespace paddle {
23+
namespace framework {
24+
namespace details {
25+
26+
struct ReduceLoDTensor {
27+
const std::vector<LoDTensor> &src_tensors_;
28+
LoDTensor &dst_tensor_;
29+
30+
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
31+
: src_tensors_(src), dst_tensor_(*dst) {}
32+
33+
template <typename T>
34+
void operator()() const {
35+
PADDLE_ENFORCE(!src_tensors_.empty());
36+
auto &t0 = src_tensors_[0];
37+
PADDLE_ENFORCE_NE(t0.numel(), 0);
38+
dst_tensor_.Resize(t0.dims());
39+
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
40+
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
41+
42+
for (size_t i = 1; i < src_tensors_.size(); ++i) {
43+
auto &t = src_tensors_[i];
44+
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
45+
PADDLE_ENFORCE_EQ(t.type(), t0.type());
46+
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
47+
[](T a, T b) -> T { return a + b; });
48+
}
49+
}
50+
};
51+
52+
inline void GatherSelectedRows(
53+
const std::vector<const SelectedRows *> &src_selecte_rows_,
54+
const std::vector<platform::Place> &in_places,
55+
const std::unordered_map<platform::Place, platform::DeviceContext *,
56+
platform::PlaceHash> &dev_ctxes,
57+
const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
58+
PADDLE_ENFORCE(!src_selecte_rows_.empty());
59+
60+
std::vector<Tensor> in_tensors;
61+
std::vector<int64_t> out_rows;
62+
63+
for (auto in_sr_ptr : src_selecte_rows_) {
64+
auto &in_sr = *in_sr_ptr;
65+
in_tensors.emplace_back(in_sr.value());
66+
out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
67+
}
68+
69+
auto &pre_in = src_selecte_rows_[0];
70+
71+
auto &dst_tensor = *dst_selecte_rows;
72+
dst_tensor.set_height(pre_in->height());
73+
dst_tensor.set_rows(out_rows);
74+
size_t rows = out_rows.size();
75+
DDim out_dim = pre_in->GetCompleteDims();
76+
out_dim[0] = static_cast<int64_t>(rows);
77+
dst_tensor.mutable_value()->Resize(out_dim);
78+
dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
79+
Tensor *out_tensor = dst_tensor.mutable_value();
80+
81+
// copy
82+
int s = 0, e = 0;
83+
for (size_t j = 0; j < in_tensors.size(); ++j) {
84+
e += in_tensors[j].dims()[0];
85+
auto sub_out = out_tensor->Slice(s, e);
86+
paddle::framework::TensorCopy(in_tensors[j], out_place,
87+
*(dev_ctxes.at(in_places[j])), &sub_out);
88+
s = e;
89+
}
90+
}
91+
92+
} // namespace details
93+
} // namespace framework
94+
} // namespace paddle
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/fluid/framework/details/reduce_op_handle.h"
16+
#include "paddle/fluid/framework/details/gather_op_handle.h"
17+
#include "paddle/fluid/framework/details/reduce_and_gather.h"
18+
#include "paddle/fluid/platform/nccl_helper.h"
19+
20+
namespace paddle {
21+
namespace framework {
22+
namespace details {
23+
24+
std::vector<VarHandle *> GetValidVarHandle(
25+
const std::vector<VarHandleBase *> &inputs) {
26+
std::vector<VarHandle *> in_var_handles;
27+
for (auto *in : inputs) {
28+
auto *in_handle = dynamic_cast<VarHandle *>(in);
29+
if (in_handle) {
30+
in_var_handles.push_back(in_handle);
31+
}
32+
}
33+
return in_var_handles;
34+
}
35+
36+
void ReduceOpHandle::RunImpl() {
37+
// the input and output may have dummy var.
38+
std::vector<VarHandle *> in_var_handles = GetValidVarHandle(inputs_);
39+
std::vector<VarHandle *> out_var_handles = GetValidVarHandle(outputs_);
40+
41+
PADDLE_ENFORCE_EQ(
42+
in_var_handles.size(), places_.size(),
43+
"The number of output should equal to the number of places.");
44+
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
45+
"The number of output should be one.");
46+
47+
// Wait input done, this Wait is asynchronous operation
48+
if (in_var_handles[0]->generated_op_) {
49+
for (auto *in : in_var_handles) {
50+
auto &in_p = in->place_;
51+
in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in_p]);
52+
}
53+
}
54+
55+
// check in the same place
56+
auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
57+
auto pre_place = in_0_handle->place_;
58+
59+
std::vector<platform::Place> in_places;
60+
for (auto *in_handle : in_var_handles) {
61+
auto in_p = in_handle->place_;
62+
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
63+
"Places must be all on CPU or all on CUDA.");
64+
in_places.emplace_back(in_p);
65+
}
66+
67+
auto out_var = local_scopes_[out_var_handles[0]->scope_idx_]->FindVar(
68+
out_var_handles[0]->name_);
69+
70+
auto pre_in_var =
71+
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
72+
73+
if (pre_in_var->IsType<framework::SelectedRows>()) {
74+
auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
75+
std::vector<const SelectedRows *> in_selected_rows;
76+
77+
for (auto *in_handle : in_var_handles) {
78+
auto in_var =
79+
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
80+
auto &in_sr = in_var->Get<framework::SelectedRows>();
81+
82+
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
83+
"The type of input is not consistent.");
84+
85+
in_selected_rows.emplace_back(&in_sr);
86+
}
87+
auto trg = out_var->GetMutable<framework::SelectedRows>();
88+
GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_,
89+
out_var_handles[0]->place_, trg);
90+
} else {
91+
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
92+
std::vector<LoDTensor> lod_tensors;
93+
94+
// can be refined
95+
for (auto *in_handle : in_var_handles) {
96+
auto in_var =
97+
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
98+
auto &in_sr = in_var->Get<framework::LoDTensor>();
99+
100+
PADDLE_ENFORCE_EQ(in_sr.type(), pre_in.type(),
101+
"The type of input is not consistent.");
102+
103+
lod_tensors.emplace_back(in_sr);
104+
}
105+
106+
auto trg = out_var->GetMutable<framework::LoDTensor>();
107+
trg->Resize(pre_in.dims());
108+
trg->mutable_data(out_var_handles[0]->place_, pre_in.type());
109+
110+
if (paddle::platform::is_cpu_place(pre_place)) {
111+
ReduceLoDTensor func(lod_tensors, trg);
112+
VisitDataType(ToDataType(lod_tensors[0].type()), func);
113+
114+
} else if (paddle::platform::is_gpu_place(pre_place)) {
115+
#ifdef PADDLE_WITH_CUDA
116+
auto out_p = out_var_handles[0]->place_;
117+
int root = boost::get<platform::CUDAPlace>(out_p).device;
118+
119+
std::vector<std::function<void()>> all_reduce_calls;
120+
for (size_t i = 0; i < local_scopes_.size(); ++i) {
121+
auto &p = in_places[i];
122+
auto &lod_tensor = lod_tensors[i];
123+
int dev_id = boost::get<platform::CUDAPlace>(p).device;
124+
auto &nccl_ctx = nccl_ctxs_.at(dev_id);
125+
auto stream = nccl_ctx.stream();
126+
auto comm = nccl_ctx.comm_;
127+
128+
void *buffer = const_cast<void *>(lod_tensor.data<void>());
129+
void *recvbuffer = nullptr;
130+
if (root == dev_id) {
131+
recvbuffer = trg->mutable_data(out_var_handles[0]->place_);
132+
}
133+
134+
all_reduce_calls.emplace_back([=] {
135+
PADDLE_ENFORCE(platform::dynload::ncclReduce(
136+
buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()),
137+
platform::ToNCCLDataType(lod_tensor.type()), ncclSum, root, comm,
138+
stream));
139+
});
140+
}
141+
142+
platform::NCCLGroupGuard guard;
143+
for (auto &call : all_reduce_calls) {
144+
call();
145+
}
146+
#else
147+
PADDLE_THROW("CUDA is not support.");
148+
#endif
149+
} else {
150+
PADDLE_THROW("Error");
151+
}
152+
}
153+
}
154+
// Returns the fixed name of this op handle.
std::string ReduceOpHandle::Name() const {
  return "reduce";
}
155+
} // namespace details
156+
} // namespace framework
157+
} // namespace paddle
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include <map>
18+
#include <string>
19+
#include <vector>
20+
21+
#include "paddle/fluid/framework/details/op_handle_base.h"
22+
#include "paddle/fluid/framework/lod_tensor.h"
23+
#include "paddle/fluid/framework/scope.h"
24+
#include "paddle/fluid/framework/selected_rows.h"
25+
#include "paddle/fluid/platform/device_context.h"
26+
#include "paddle/fluid/platform/nccl_helper.h"
27+
28+
namespace paddle {
29+
namespace framework {
30+
namespace details {
31+
32+
struct ReduceOpHandle : public OpHandleBase {
33+
const std::vector<Scope *> &local_scopes_;
34+
const std::vector<platform::Place> &places_;
35+
36+
#ifdef PADDLE_WITH_CUDA
37+
const platform::NCCLContextMap &nccl_ctxs_;
38+
ReduceOpHandle(const std::vector<Scope *> &local_scopes,
39+
const std::vector<platform::Place> &places,
40+
const platform::NCCLContextMap &nccl_ctxs)
41+
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
42+
for (auto &p_ctx : nccl_ctxs_.contexts_) {
43+
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
44+
}
45+
}
46+
#else
47+
ReduceOpHandle(const std::vector<Scope *> &local_scopes,
48+
const std::vector<platform::Place> &places)
49+
: local_scopes_(local_scopes), places_(places) {}
50+
#endif
51+
52+
std::string Name() const override;
53+
54+
bool IsMultiDeviceTransfer() override { return false; };
55+
56+
protected:
57+
void RunImpl() override;
58+
};
59+
60+
} // namespace details
61+
} // namespace framework
62+
} // namespace paddle

0 commit comments

Comments
 (0)