
Commit 29ad979

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_tensorrt_softmax
2 parents 641f32d + 772ceee commit 29ad979

File tree: 20 files changed, +365 / -64 lines

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -175,6 +175,7 @@ include(external/any) # download libn::any
 include(external/eigen) # download eigen3
 include(external/pybind11) # download pybind11
 include(external/cares)
+include(external/cub)

 if(WITH_DISTRIBUTE)
   if(WITH_GRPC)

cmake/external/cub.cmake

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+  return()
+endif()
+
+include(ExternalProject)
+
+set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
+set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
+
+include_directories(${CUB_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_cub
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
+  GIT_TAG "v1.8.0"
+  PREFIX ${CUB_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(cub STATIC ${dummyfile})
+else()
+  add_library(cub INTERFACE)
+endif()
+
+add_dependencies(cub extern_cub)
+
+LIST(APPEND externl_project_dependencies cub)

paddle/fluid/API.spec

Lines changed: 1 addition & 0 deletions
@@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

paddle/fluid/framework/details/multi_devices_graph_builder.cc

Lines changed: 8 additions & 6 deletions
@@ -275,7 +275,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       if (strategy_.gradient_scale_ !=
           BuildStrategy::GradientScaleStrategy::kCustomized) {
         // TODO(paddle-dev): Why is there no input for this op_handle?
-        CreateScaleLossGradOp(&result);
+        auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
+        CreateScaleLossGradOp(&result, loss_grad_name);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
   return got == sharded_var_device.end() ? -1 : got->second;
 }

-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
+    ir::Graph *result, const std::string &loss_grad_name) const {
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
@@ -558,10 +560,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
     // loss->pending_ops_.emplace_back(op_handle);
     // op_handle->inputs_.emplace_back(loss);

-    CreateOpOutput(result, op_handle,
-                   result->CreateEmptyNode(GradVarName(loss_var_name_),
-                                           ir::Node::Type::kVariable),
-                   places_[i], i);
+    CreateOpOutput(
+        result, op_handle,
+        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
+        places_[i], i);
   }
 }


paddle/fluid/framework/details/multi_devices_graph_builder.h

Lines changed: 3 additions & 1 deletion
@@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;

-  void CreateScaleLossGradOp(ir::Graph *result) const;
+  void CreateScaleLossGradOp(ir::Graph *result,
+                             const std::string &loss_grad_name) const;
+
   VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
   void CreateComputationalOp(ir::Graph *result, ir::Node *node,
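
Note: the point of threading loss_grad_name through CreateScaleLossGradOp is that the gradient variable name recorded as the scale-loss op's first output can differ from one re-derived from loss_var_name_. A standalone sketch of the two lookups, with purely illustrative names and a stand-in GradVarName (the suffix convention here is an assumption, not Paddle's API):

#include <iostream>
#include <string>
#include <vector>

// Stand-in for a gradient-name helper; the "@GRAD" suffix is assumed
// only for illustration.
std::string GradVarName(const std::string& var) { return var + "@GRAD"; }

// Toy op description holding the output argument names the op declares.
struct ToyOpDesc {
  std::vector<std::string> output_argument_names;
};

int main() {
  const std::string loss_var_name = "mean_0.tmp_0";
  const ToyOpDesc scale_loss_op{{"custom_loss_grad_var"}};

  // Re-deriving the name from the loss variable (the old approach):
  const std::string derived = GradVarName(loss_var_name);
  // Reading the name the op actually declares as its first output
  // (the approach the diff switches to):
  const std::string recorded = scale_loss_op.output_argument_names[0];

  std::cout << "derived:  " << derived << "\n"
            << "recorded: " << recorded << "\n";
  return 0;
}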

paddle/fluid/framework/executor.cc

Lines changed: 0 additions & 5 deletions
@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }

   for (auto& op : ctx->ops_) {
-    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
-    // NOTE! Please do not delete this line, it's usefull because the debug
-    // string before and after op.run are different, after run the output
-    // will have right shape which is usefull for debug.
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);

     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "

paddle/fluid/framework/operator.cc

Lines changed: 6 additions & 4 deletions
@@ -127,7 +127,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(10) << "- " << DebugStringEx(&scope);
+  VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -139,7 +139,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   platform::RecordEvent record_event(Type(), pool.Get(place));
   RunImpl(scope, place);
-  VLOG(10) << "+ " << DebugStringEx(&scope);
+  VLOG(3) << place << " " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -778,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
   auto& scope = ctx.scope();
   int data_type = -1;
+  std::string last_input_name;
   for (auto& input : this->inputs_) {
     for (auto& ipt_name : input.second) {
       auto* var = scope.FindVar(ipt_name);
@@ -794,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
         int tmp = static_cast<int>(ToDataType(t->type()));
         PADDLE_ENFORCE(
             tmp == data_type || data_type == -1,
-            "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
-            data_type, tmp);
+            "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
+            Type(), last_input_name, data_type, ipt_name, tmp);
         data_type = tmp;
+        last_input_name = ipt_name;
       }
     }
   }
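
Note: the IndicateDataType change remembers the previously inspected input so a dtype mismatch can name both tensors rather than printing two bare integers. A minimal standalone sketch of that pattern; the types, names, and message text below are illustrative, not Paddle's API:

#include <cstdio>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Toy check: every input of an op must share one dtype. On a mismatch,
// the message names the previously seen input and the offending one,
// mirroring the "%s(%d) != %s(%d)" format in the diff.
int IndicateDataType(const std::vector<std::pair<std::string, int>>& inputs) {
  int data_type = -1;
  std::string last_input_name;
  for (const auto& input : inputs) {
    const int tmp = input.second;
    if (data_type != -1 && tmp != data_type) {
      char msg[128];
      std::snprintf(msg, sizeof(msg),
                    "DataType must be the same. Get %s(%d) != %s(%d)",
                    last_input_name.c_str(), data_type, input.first.c_str(),
                    tmp);
      throw std::runtime_error(msg);
    }
    data_type = tmp;
    last_input_name = input.first;
  }
  return data_type;
}

int main() {
  try {
    IndicateDataType({{"X", 5 /* e.g. float32 */}, {"Label", 3 /* e.g. int64 */}});
  } catch (const std::exception& e) {
    std::printf("%s\n", e.what());
  }
  return 0;
}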

paddle/fluid/inference/api/api.cc

Lines changed: 22 additions & 4 deletions
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <glog/logging.h>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

 namespace paddle {
@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
 PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }

 PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  if (!other.memory_owned_) {
+    data_ = other.data_;
+    length_ = other.length_;
+    memory_owned_ = other.memory_owned_;
+  } else {
+    Resize(other.length());
+    memcpy(data_, other.data(), other.length());
+    length_ = other.length();
+    memory_owned_ = true;
+  }
+  return *this;
+}
+
+PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
   // only the buffer with external memory can be copied
-  assert(!other.memory_owned_);
   data_ = other.data_;
   length_ = other.length_;
   memory_owned_ = other.memory_owned_;
+  other.data_ = nullptr;
+  other.length_ = 0;
+  other.memory_owned_ = false;
   return *this;
 }

 void PaddleBuf::Resize(size_t length) {
   // Only the owned memory can be reset, the external memory can't be changed.
   if (length_ == length) return;
-  assert(memory_owned_);
-  Free();
+  if (memory_owned_) {
+    Free();
+  }
   data_ = new char[length];
   length_ = length;
   memory_owned_ = true;
@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
     assert(length_ > 0);
-    delete static_cast<char*>(data_);
+    delete[] static_cast<char*>(data_);
     data_ = nullptr;
     length_ = 0;
   }

paddle/fluid/inference/api/paddle_inference_api.h

Lines changed: 2 additions & 1 deletion
@@ -40,11 +40,12 @@ class PaddleBuf {
   // Copy only available when memory is managed externally.
   explicit PaddleBuf(const PaddleBuf&);
   PaddleBuf& operator=(const PaddleBuf&);
+  PaddleBuf& operator=(PaddleBuf&&);
   // Do not own the memory.
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  explicit PaddleBuf(size_t length)
+  PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
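
Note: taken together, the api.cc and header changes give PaddleBuf a copy assignment that deep-copies an owned buffer (and merely aliases an external one), a move assignment that leaves its source empty, and a size_t constructor that is no longer explicit. A small usage sketch, assuming a build where paddle_inference_api.h is available; the buffer names are illustrative:

#include <cassert>
#include <cstring>
#include <utility>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::PaddleBuf a(16);                 // owns 16 bytes
  std::memset(a.data(), 0, a.length());

  paddle::PaddleBuf b(8);
  b = a;                                   // deep copy: b gets its own 16 bytes
  assert(b.length() == a.length());
  assert(b.data() != a.data());

  paddle::PaddleBuf c(1);
  c = std::move(a);                        // move: c takes the buffer, a is emptied
  assert(a.data() == nullptr && a.length() == 0);

  char external[4] = {0, 1, 2, 3};
  paddle::PaddleBuf d(external, sizeof(external));  // external, not owned
  paddle::PaddleBuf e(1);
  e = d;                                   // copying an external buffer aliases it
  assert(e.data() == external);
  return 0;
}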

paddle/fluid/operators/elementwise_op_function.h

Lines changed: 6 additions & 6 deletions
@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& dout, int axis,
                          framework::Tensor* dx, framework::Tensor* dy,
                          DX_OP dx_op, DY_OP dy_op) {
-  const framework::DDim x_dim = x.dims();
-  const framework::DDim y_dim = y.dims();
+  const framework::DDim& x_dim = x.dims();
+  const framework::DDim& y_dim = y.dims();
   if (x.dims() == y.dims()) {
     ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
         ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
                                  framework::Tensor* dx, framework::Tensor* dy,
                                  DX_OP dx_op, DY_OP dy_op) {
   if (dy == nullptr) {
-    const framework::DDim dx_dims = dout.dims();
+    const framework::DDim& dx_dims = dout.dims();
     auto dy_dims = dx_dims;
     ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
         ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   } else {
     if (dout.dims() == dy->dims()) {
-      const framework::DDim dx_dims = dout.dims();
-      const framework::DDim dy_dims = dy->dims();
+      const framework::DDim& dx_dims = dout.dims();
+      const framework::DDim& dy_dims = dy->dims();
       ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
           ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     } else {  // Y is a scalar
       auto dx_dims = dout.dims();
-      const framework::DDim dy_dims = dy->dims();
+      const framework::DDim& dy_dims = dy->dims();
       ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
           ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     }
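
Note: the edits above only change how the dimension objects are bound: a const reference to what dims() returns instead of a by-value copy of it. A tiny standalone illustration of the difference, using a stand-in Dims type rather than framework::DDim:

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for framework::DDim, just to make the copy visible.
struct Dims {
  std::vector<int64_t> d;
};

struct ToyTensor {
  Dims dims_;
  // Mimics a dims() accessor that returns a reference to the stored dims.
  const Dims& dims() const { return dims_; }
};

int main() {
  ToyTensor x{{{2, 3, 4}}};

  const Dims by_value = x.dims();   // copies the underlying vector
  const Dims& by_ref = x.dims();    // no copy, just aliases x's dims

  std::cout << (by_value.d.data() == x.dims().d.data()) << "\n";  // 0: distinct storage
  std::cout << (by_ref.d.data() == x.dims().d.data()) << "\n";    // 1: same storage
  return 0;
}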
