Commit 9f32b61

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into seq_expand_op

2 parents: fab6f30 + 0049ce0

21 files changed: +1316 / -23 lines

paddle/framework/op_registry.h

Lines changed: 4 additions & 0 deletions
@@ -228,6 +228,10 @@ class OpKernelRegistrar : public Registrar {
   USE_OP_ITSELF(op_type);               \
   USE_OP_DEVICE_KERNEL(op_type, CPU);
 
+#define USE_GPU_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);        \
+  USE_OP_DEVICE_KERNEL(op_type, GPU)
+
 #define USE_OP(op_type) \
   USE_OP_ITSELF(op_type); \
   USE_OP_KERNEL(op_type)
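
The new USE_GPU_ONLY_OP macro declares an operator whose only kernel registration is the GPU one. A minimal usage sketch (the concrete line below is exactly what the operators/CMakeLists.txt change in this commit appends to the generated pybind source; any other placement is an assumption for illustration):

    // In a translation unit linked against the operator library, e.g. the
    // auto-generated pybind registration file:
    USE_GPU_ONLY_OP(ncclAllReduce);  // expands to USE_OP_ITSELF(ncclAllReduce);
                                     // USE_OP_DEVICE_KERNEL(ncclAllReduce, GPU)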

paddle/framework/operator.h

Lines changed: 12 additions & 1 deletion
@@ -122,7 +122,7 @@ class OperatorBase {
  protected:
   std::string type_;
   // NOTE: in case of OpGrad, inputs_ contains:
-  // I (Inputs)opear
+  // I (Inputs)
   // O (Outputs)
   // OG (Output Gradients)
   VariableNameMap inputs_;
@@ -287,6 +287,16 @@ class ExecutionContext {
     return device_context_;
   }
 
+  //! Get actual name vector for this input.
+  const std::vector<std::string>& Inputs(const std::string& name) const {
+    return op_.Inputs(name);
+  }
+
+  //! Get actual name vector for this output.
+  const std::vector<std::string>& Outputs(const std::string& name) const {
+    return op_.Outputs(name);
+  }
+
 #ifdef PADDLE_WITH_CUDA
   const platform::CUDADeviceContext& cuda_device_context() const {
     PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
@@ -398,6 +408,7 @@ class OperatorWithKernel : public OperatorBase {
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
   virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
+    VLOG(3) << "Default IndicateDataType " << this->Type();
     auto& scope = ctx.scope();
     int data_type = -1;
     for (auto& input : this->inputs_) {
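
The two accessors added to ExecutionContext expose the actual variable names bound to a named input or output slot of the running op instance. A hypothetical kernel fragment (assumed here for illustration, not code from this commit) showing how they might be used:

    // Inside some OpKernel<T>::Compute(const framework::ExecutionContext& ctx):
    const std::vector<std::string>& in_names = ctx.Inputs("X");      // variables wired to "X"
    const std::vector<std::string>& out_names = ctx.Outputs("Out");  // variables wired to "Out"
    PADDLE_ENFORCE_EQ(in_names.size(), out_names.size(),
                      "every input variable needs a matching output variable");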

paddle/framework/tensor.h

Lines changed: 7 additions & 2 deletions
@@ -126,11 +126,16 @@ class Tensor {
   inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
   platform::Place place() const {
-    PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
+    PADDLE_ENFORCE_NOT_NULL(
+        holder_, "Tensor not initialized yet when Tensor::place() is called.");
     return holder_->place();
   }
 
-  std::type_index type() const { return holder_->type(); }
+  std::type_index type() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        holder_, "Tensor not initialized yet when Tensor::type() is called.");
+    return holder_->type();
+  }
 
   size_t memory_size() const;
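
A small illustration (assumed for clarity, not code from this commit) of the case the added holder_ check guards against:

    framework::Tensor t;  // freshly constructed: no memory allocated, holder_ is null
    // t.type();          // before this change: null dereference; now: a clear enforce message
    t.mutable_data<float>(framework::make_ddim({2, 3}), platform::CPUPlace());
    std::type_index ti = t.type();  // valid once memory exists; typeid(float) here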

paddle/operators/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
@@ -90,6 +90,13 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
   endif()
 
+  # nccl_op contains several operators
+  if ("${TARGET}" STREQUAL "nccl_op")
+    set(pybind_flag 1)
+    # It's enough to just adding one operator to pybind
+    file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+  endif()
+
   # reduce_op contains several operators
   if ("${TARGET}" STREQUAL "reduce_op")
     set(pybind_flag 1)
@@ -121,6 +128,7 @@ function(op_library TARGET)
 endfunction()
 
 add_subdirectory(math)
+add_subdirectory(nccl)
 
 set(DEPS_OPS
   recurrent_op
@@ -130,6 +138,7 @@ set(DEPS_OPS
   sum_op
   pool_op
   pool_with_index_op
+  nccl_op
   sequence_conv_op
   lstm_op)
 
@@ -142,6 +151,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
+if(WITH_GPU)
+  op_library(nccl_op DEPS nccl_common)
+endif()
 op_library(sequence_conv_op DEPS context_project)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 
@@ -157,4 +169,8 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array)
+
+if(WITH_GPU)
+  nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
+endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)

paddle/operators/batch_norm_op.cc

Lines changed: 27 additions & 3 deletions
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
@@ -64,6 +65,9 @@ class BatchNormOp : public framework::OperatorWithKernel {
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                              : x_dims[x_dims.size() - 1]);
 
+    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+                   "Input x must have 3 to 5 dimensions.");
+
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -108,10 +112,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "Store the global Variance when training");
     AddOutput("SavedMean",
               "Mean of the current mini batch, "
-              "will apply to output when training");
+              "will apply to output when training")
+        .AsIntermediate();
     AddOutput("SavedVariance",
               "Variance of the current mini batch, "
-              "will apply to output when training");
+              "will apply to output when training")
+        .AsIntermediate();
     AddComment(R"DOC(
 https://arxiv.org/pdf/1502.03167.pdf
@@ -135,7 +141,6 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-
     PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
                    "The Input dim size should be between 3 and 5");
     const int N = x_dims[0];
@@ -289,6 +294,25 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
     ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
   }
+
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext &ctx) const override {
+    VLOG(3) << "IndicateDataType " << this->Type();
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::ToDataType(t->type());
+  }
 };
 
 template <typename T>

paddle/operators/nccl/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+if(WITH_GPU)
+  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
+endif()

paddle/operators/nccl/nccl_gpu_common.cc

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace platform {}  // namespace platform
+}  // namespace paddle

paddle/operators/nccl/nccl_gpu_common.h

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/dynload/nccl.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace platform {
+
+constexpr int kInvalidGPUId = -1;
+
+struct Communicator {
+  std::vector<ncclComm_t> comms_;
+  std::unordered_map<int, int> comm_id_map_;
+
+  Communicator() {}
+
+  int GetCommId(int device_id) const { return comm_id_map_.at(device_id); }
+
+  void InitAll(const std::vector<int>& gpus) {
+    comms_.resize(gpus.size());
+    for (size_t i = 0; i < gpus.size(); ++i) {
+      comm_id_map_[gpus[i]] = i;
+    }
+    PADDLE_ENFORCE(
+        dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
+  }
+
+  ~Communicator() {
+    for (size_t i = 0; i < comms_.size(); ++i) {
+      // FIXME(dzh) : PADDLE_ENFORCE return void
+      dynload::ncclCommDestroy(comms_[i]);
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(Communicator);
+};
+
+}  // namespace platform
+}  // namespace paddle
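
A short usage sketch for the new Communicator (hypothetical caller code assumed for illustration; the real callers are presumably the nccl operators added elsewhere in this commit):

    // Create one NCCL communicator per participating device, then map a
    // device id back to its index within comms_.
    std::vector<int> gpus = {0, 1, 2, 3};  // device ids taking part in the allreduce
    paddle::platform::Communicator comm;
    comm.InitAll(gpus);                    // ncclCommInitAll wrapped in PADDLE_ENFORCE
    int idx = comm.GetCommId(2);           // index of device 2 inside comms_
    ncclComm_t handle = comm.comms_[idx];  // raw NCCL handle for that device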
