Commit d307b5e

Merge remote-tracking branch 'upstream/develop' into elementwise_add_fp16

2 parents: 3da094f + 5271c32

12 files changed (+217, -64 lines)

doc/design/distributed_lookup_table_design.md renamed to doc/fluid/design/dist_train/distributed_lookup_table_design.md

2 additions, 2 deletions
@@ -26,7 +26,7 @@ lookup of rows.
 The following figure illustrates the multiplication of x with two
 non-zero elements, or say, two symbols, and a lookup table W:
 
-![lookup table](./lookup_table.png)
+![lookup table](./src/lookup_table.png)
 
 ### The Backward Algorithm
 
@@ -42,7 +42,7 @@ or some more sophisticated algorithms that rely on both W' and W:
 $$W = f(W, W')$$
 
 The following figure illustrates the backward pass of the lookup
-operator: ![lookup table training](./lookup_table_training.png)
+operator: ![lookup table training](./src/lookup_table_training.png)
 
 ## Distributed Storage Service
 
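For a concrete sense of the $f$ in this hunk's context: the design leaves the optimizer unspecified, and the simplest instance is plain SGD with learning rate $\eta$,

$$W = W - \eta \cdot W'$$

while more sophisticated optimizers (momentum, Adagrad) also keep per-parameter state derived from both $W$ and $W'$ when computing the update.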
doc/fluid/design/motivation/fluid.md

1 addition, 1 deletion
@@ -103,7 +103,7 @@ In computability theory, a system of data-manipulation rules, such as a programm
 
 There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 
-There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
 
 Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md).
 
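To make the corrected `Executor` link concrete, here is a minimal sketch of running a `ProgramDesc` from C++. The `Run(program, scope, block_id)` overload is an assumption based on the executor.h of this era; treat it as illustrative, not authoritative.

// Minimal sketch (not from this commit): interpret block 0 of a ProgramDesc
// on CPU. Verify Executor::Run's signature against
// paddle/fluid/framework/executor.h before relying on it.
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

void RunOnCPU(const paddle::framework::ProgramDesc& program) {
  paddle::platform::CPUPlace place;             // device the ops run on
  paddle::framework::Scope scope;               // owns the program's variables
  paddle::framework::Executor executor(place);  // the "interpreter"
  executor.Run(program, &scope, 0 /*block_id*/);
}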
paddle/fluid/operators/dropout_op.cc

3 additions, 6 deletions
@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
   }
 };
 
-template <typename AttrType>
 class DropoutOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
-            ops::DropoutOpGrad<float>);
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
+            ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
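Dropping `AttrType` removes dead generality: `dropout_prob` is always a `float` attribute (the CUDA hunk below reads it with `context.Attr<float>(...)`), so only the element type `T` needs to vary. A standalone sketch of that separation, not Paddle code, including the cast that matters once `T` is a 16-bit float type:

// The probability attribute stays a float while the element type T varies.
// At inference time dropout only rescales the input by (1 - p).
#include <cstdio>
#include <vector>

template <typename T>
void ScaleForInference(std::vector<T>* data, float dropout_prob) {
  // Convert the float attribute into T once; the arithmetic then stays in T,
  // which matters for 16-bit float types without implicit float conversions.
  const T scale = static_cast<T>(1.0f - dropout_prob);
  for (T& v : *data) v = v * scale;
}

int main() {
  std::vector<float> x = {1.f, 2.f, 4.f};
  ScaleForInference(&x, 0.5f);
  std::printf("%g %g %g\n", x[0], x[1], x[2]);  // prints: 0.5 1 2
}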

paddle/fluid/operators/dropout_op.cu

14 additions, 13 deletions
@@ -18,17 +18,18 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T, typename AttrType>
+template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                 T* mask_data, T* dst) {
   thrust::minstd_rand rng;
   rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
 
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
@@ -44,14 +45,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
 class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
     auto* y = context.Output<Tensor>("Out");
     y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");
 
     auto X = EigenMatrix<T>::Reshape(*x, 1);
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
@@ -70,11 +71,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 
       int threads = 512;
       int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
-                                     context.cuda_device_context().stream()>>>(
+      RandomGenerator<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
     }
   }
 };
@@ -83,9 +84,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    dropout,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
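Two details carry the fp16 support here. First, registering both a `float` and a `plat::float16` kernel under `dropout` lets the runtime dispatch on the input tensor's data type. Second, `static_cast<T>(1.0f - dropout_prob)` is needed because Eigen tensor expressions do not mix scalar types: with `T = float16`, multiplying by a plain `float` scalar would not compile. A minimal sketch of the latter point using Eigen's own `half` type (an illustrative stand-in; Paddle uses `platform::float16` here):

// Why the scalar must be cast to the element type. Requires Eigen's
// unsupported Tensor module.
#include <unsupported/Eigen/CXX11/Tensor>

void Scale(Eigen::Tensor<Eigen::half, 1>& y,
           const Eigen::Tensor<Eigen::half, 1>& x, float dropout_prob) {
  // y = x * (1.0f - dropout_prob);   // ill-formed: float scalar, half tensor
  y = x * static_cast<Eigen::half>(1.0f - dropout_prob);  // scalar in T
}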

paddle/fluid/operators/dropout_op.h

1 addition, 1 deletion
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {

paddle/fluid/operators/math/CMakeLists.txt

1 addition, 1 deletion
@@ -43,7 +43,7 @@ math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
-math_library(softmax)
+math_library(softmax DEPS math_function)
 math_library(unpooling)
 math_library(vol2col)
 

paddle/fluid/operators/reader/create_double_buffer_reader_op.cc

11 additions, 4 deletions
@@ -48,20 +48,24 @@ class DoubleBufferReader : public framework::DecoratedReader {
 
   void start_thread() {
     buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
-    std::thread prefetch([this] { PrefetchThreadFunc(); });
-    prefetch.detach();
+    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
-  ~DoubleBufferReader() { buffer_->Close(); }
+  ~DoubleBufferReader() {
+    buffer_->Close();
+    prefetcher_.join();
+    delete buffer_;
+  }
 
   bool HasNext() const override;
 
  private:
   void PrefetchThreadFunc();
 
+  std::thread prefetcher_;
   framework::Channel<Item>* buffer_;
   platform::Place place_;
   std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
@@ -134,6 +138,8 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
 void DoubleBufferReader::ReInit() {
   reader_->ReInit();
   buffer_->Close();
+  prefetcher_.join();
+  delete buffer_;
   start_thread();
 }
 
@@ -159,11 +165,12 @@ void DoubleBufferReader::PrefetchThreadFunc() {
 
     if (!buffer_->Send(&batch)) {
       VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread terminates.";
+                 "prefetch thread will terminate.";
       break;
     }
   }
   buffer_->Close();
+  VLOG(5) << "Prefetch thread terminates.";
 }
 
 bool DoubleBufferReader::HasNext() const {
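The substance of this change is a thread-lifetime fix. The prefetch thread used to be detached, so destroying the reader could free `buffer_` while the thread was still producing into it. Keeping the thread as a `prefetcher_` member and joining it after closing the channel, both in the destructor and in `ReInit()`, guarantees the worker has exited before its state is torn down. A standalone sketch of the same pattern, not Paddle code, with the channel replaced by an atomic flag for brevity:

// Join-on-teardown: signal the worker to stop, then join before the members
// it uses are destroyed. Detaching instead leaves a window in which the
// worker touches freed memory.
#include <atomic>
#include <thread>

class Prefetcher {
 public:
  Prefetcher() : worker_([this] { Loop(); }) {}
  ~Prefetcher() {
    stop_ = true;    // plays the role of buffer_->Close(): unblocks the worker
    worker_.join();  // only now is it safe to destroy what the worker reads
  }

 private:
  void Loop() {
    while (!stop_) { /* produce items into a buffer */ }
  }

  std::atomic<bool> stop_{false};  // declared before worker_, so it is
  std::thread worker_;             // initialized before the thread starts
};

int main() { Prefetcher p; }  // destruction is race-free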

paddle/fluid/operators/reader/create_shuffle_reader_op.cc

3 additions, 1 deletion
@@ -34,6 +34,9 @@ class ShuffleReader : public framework::DecoratedReader {
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!HasNext()) {
+      PADDLE_THROW("There is no next data!");
+    }
     if (iteration_pos_ >= buffer_.size()) {
       VLOG(10) << "Resetting shuffle buffer";
       ReadIntoBuffers();
@@ -50,7 +53,6 @@ class ShuffleReader : public framework::DecoratedReader {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
     iteration_pos_ = 0;
-    PADDLE_ENFORCE(reader_->HasNext());
     for (size_t i = 0; i < buffer_size_; ++i) {
       if (!reader_->HasNext()) {
         break;
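This moves the exhaustion check to the public entry point: `ReadNext()` now throws as soon as there is no data, instead of `ReadIntoBuffers()` asserting mid-refill, and the refill loop is left free to fill a partial final buffer. A standalone sketch of the same fail-fast contract, not Paddle code:

// Fail fast at the API boundary: callers either check HasNext() or get a
// clear error, rather than tripping an assertion deep in the refill path.
#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <vector>

class Reader {
 public:
  bool HasNext() const { return pos_ < data_.size(); }
  int ReadNext() {
    if (!HasNext()) throw std::runtime_error("There is no next data!");
    return data_[pos_++];
  }

 private:
  std::vector<int> data_{1, 2, 3};
  std::size_t pos_ = 0;
};

int main() {
  Reader r;
  while (r.HasNext()) std::printf("%d\n", r.ReadNext());  // prints 1 2 3
}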
