
Commit 847cbdc

Merge pull request #14934 from NHZlX/fix_trt_thread_pool_bug_to_1.2
cherry-pick "fix pool2d and thread bug of paddle-trt" to release1.2
2 parents 15b6e2a + 2942cf2 commit 847cbdc

File tree

9 files changed: +93 -131 lines

cmake/inference_lib.cmake

Lines changed: 7 additions & 0 deletions
@@ -182,6 +182,13 @@ if (WITH_ANAKIN AND WITH_MKL)
   list(APPEND inference_deps anakin_inference_lib)
 endif ()
 
+if (TENSORRT_FOUND)
+  copy(tensorrt_lib DEPS ${inference_deps}
+    SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
+    DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
+endif ()
+
+
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*

paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc

Lines changed: 0 additions & 3 deletions
@@ -63,7 +63,6 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                             Graph *graph) const {
   auto *op_desc = node->Op();
-  static int counter{0};
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
 
@@ -191,8 +190,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
           block_desc.Proto()->SerializeAsString());
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "engine_uniq_key",
-          "trt-" + std::to_string(counter++));
   SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
 }

paddle/fluid/inference/tensorrt/convert/op_converter.h

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,7 @@ class OpConverter {
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
+    std::unique_lock<std::mutex> lk(mut_);
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
       ConvertOp(op, parameters, scope, engine);
 
@@ -125,6 +126,7 @@ class OpConverter {
   std::unordered_map<std::string, OpConverter*> converters_;
   // fluid inference scope
   framework::Scope* scope_{nullptr};
+  std::mutex mut_;
 };
 
 }  // namespace tensorrt
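A note on the thread part of this fix: OpConverter is reached through inference::Singleton<inference::tensorrt::OpConverter>::Global() (see tensorrt_engine_op.h below), so when several predictors build their TensorRT subgraphs at the same time, concurrent ConvertBlock calls could interleave on the shared converter state; the new mut_ member serializes whole-block conversion. A minimal standalone sketch of that locking pattern, using a hypothetical Converter class rather than Paddle's actual types:

#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// Hypothetical stand-in for a process-wide converter singleton.
class Converter {
 public:
  // Serializes whole-block conversion so concurrent callers cannot
  // interleave their updates to the shared state below.
  void ConvertBlock(const std::vector<std::string> &ops) {
    std::unique_lock<std::mutex> lk(mut_);
    for (const auto &op : ops) {
      converted_.push_back(op);  // shared mutable state
    }
  }

  size_t NumConverted() {
    std::unique_lock<std::mutex> lk(mut_);
    return converted_.size();
  }

 private:
  std::vector<std::string> converted_;
  std::mutex mut_;
};

int main() {
  Converter converter;  // imagine this is the global singleton
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back(
        [&converter] { converter.ConvertBlock({"conv2d", "relu", "pool2d"}); });
  }
  for (auto &t : workers) t.join();
  std::cout << converter.NumConverted() << "\n";  // always 12 with the lock
  return 0;
}

Without the lock the four workers race on the shared container; with it each block is converted atomically, which is the behavior the diff above restores for multi-threaded predictors.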

paddle/fluid/inference/tensorrt/convert/pool2d_op.cc

Lines changed: 6 additions & 2 deletions
@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter {
   }
 
   if (pool_type == "max") {
-    nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
-    nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
+    // Under ceil mode, the pre_pad and post_pad are used to
+    // record the padding size. In some ceil mode cases,
+    // we do not need padding, so we initialize the two vars to 0.
+
+    nvinfer1::DimsHW pre_pad(0, 0);
+    nvinfer1::DimsHW post_pad(0, 0);
     if (ceil_mode) {
       // If ceil mode is true, we will pad the appropriate size to the input.
       DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
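Background for the pool2d change: with ceil_mode, pooling output extents come from a ceiling division, while TensorRT's pooling layer floors, so the converter has to add bottom/right padding to make the two agree; when the division is already exact no padding is needed, which is why pre_pad and post_pad now start at (0, 0) and are only filled in by DealCeilMode when ceil mode actually requires it. A rough sketch of the arithmetic involved (illustrative only, not Paddle's actual DealCeilMode implementation):

#include <algorithm>
#include <iostream>

// Minimal extra bottom/right padding needed so that a floor-based pooling
// layer produces the output extent that ceil mode expects.
// Returns 0 when the stride already divides the padded extent exactly.
int ExtraPostPad(int in, int ksize, int pad, int stride) {
  int ceil_out = (in + 2 * pad - ksize + stride - 1) / stride + 1;  // ceil mode
  int needed_extent = (ceil_out - 1) * stride + ksize;  // extent the last window must reach
  return std::max(0, needed_extent - (in + 2 * pad));
}

int main() {
  // Height 6, kernel 3, no padding, stride 2: ceil mode expects output 3,
  // floor mode would give 2, so one extra padded row is required.
  std::cout << ExtraPostPad(6, 3, 0, 2) << "\n";  // prints 1
  // Height 7, kernel 3, padding 1, stride 2: the division is exact, no extra padding.
  std::cout << ExtraPostPad(7, 3, 1, 2) << "\n";  // prints 0
  return 0;
}
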
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
+file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
         DEPS tensorrt_engine_op
         analysis)

paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc

Lines changed: 1 addition & 4 deletions
@@ -21,8 +21,6 @@
 
 namespace paddle {
 
-DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-
 namespace operators {
 
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 
@@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
-    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
     AddAttr<int>("max_batch_size", "the maximum batch size.");
     AddAttr<int>("workspace_size", "the workspace size.");
     AddComment("TensorRT engine operator.");
 
@@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+                  ops::TensorRTEngineOpMaker);
 
 #endif  // PADDLE_WITH_CUDA

paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc

Lines changed: 0 additions & 24 deletions
This file was deleted.

paddle/fluid/operators/tensorrt/tensorrt_engine_op.h

Lines changed: 75 additions & 95 deletions
@@ -27,8 +27,6 @@
 
 namespace paddle {
 
-DECLARE_int32(tensorrt_engine_batch_size);
-
 namespace operators {
 
 using FluidDT = framework::proto::VarType_Type;
 
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
   return TRT_DT::kINT32;
 }
 
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
   PADDLE_ENFORCE_GT(shape.size(), 1UL,
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
 
@@ -63,171 +61,153 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
 }  // namespace  // NOLINT
 
 using inference::Singleton;
-using inference::tensorrt::TRT_EngineManager;
+using inference::tensorrt::TensorRTEngine;
+
+class TensorRTEngineOp : public framework::OperatorBase {
+ private:
+  std::vector<std::string> input_names_;
+  std::unordered_set<std::string> param_names_;
+  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  int max_batch_size_;
+  int workspace_size_;
 
-class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  TensorRTEngineOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {
+    input_names_ = Inputs("Xs");
+    max_batch_size_ = Attr<int>("max_batch_size");
+    workspace_size_ = Attr<int>("workspace_size");
+
+    auto params = Attr<std::vector<std::string>>("parameters");
+    for (const auto &param : params) {
+      param_names_.insert(param);
+    }
+  }
 
  protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input0 = ctx.Inputs("Xs").front();
-    framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(ctx.scope()
-                                  .FindVar(input0)
-                                  ->GetMutable<framework::LoDTensor>()
-                                  ->type()),
-        ctx.GetPlace());
-    return kt;
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    RunTrt(scope, dev_place);
   }
-};
 
-template <typename DeviceContext, typename T>
-class TensorRTEngineKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto engine_name = context.Attr<std::string>("engine_uniq_key");
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
-      Prepare(context);
+  void RunTrt(const framework::Scope &scope,
+              const platform::Place &dev_place) const {
+    int runtime_batch = 1;
+    if (trt_engine_.get() == nullptr) {
+      trt_engine_.reset(new TensorRTEngine(
+          max_batch_size_, workspace_size_, nullptr,
+          boost::get<platform::CUDAPlace>(dev_place).device));
+      Prepare(scope, dev_place, trt_engine_.get());
     }
-    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
-    auto input_names = context.op().Inputs("Xs");
-    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
+
+    auto *engine = trt_engine_.get();
+    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
+        Attr<std::vector<std::string>>("output_name_mapping");
 
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
     // Convert input tensor from fluid to engine.
-    for (const auto& x : context.Inputs("Xs")) {
-      if (parameters.count(x)) continue;
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
       // convert input and copy to TRT engine's buffer
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
-          context.scope(), x);
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize(t.dims());
+      runtime_batch = t_shape[0];
      if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
      } else {
-        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
      }
    }
+
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
-    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
-    engine->Execute(FLAGS_tensorrt_engine_batch_size);
+    engine->Execute(runtime_batch);
 
     // Convert output tensor from engine to fluid
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
-    for (const auto& y : context.Outputs("Ys")) {
+    for (const auto &y : Outputs("Ys")) {
       VLOG(4) << y;
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
+      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
-      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      ddim.push_back(runtime_batch);
       for (int i = 0; i < dims.nbDims; i++) {
         ddim.push_back(dims.d[i]);
       }
 
-      auto* fluid_v = context.scope().FindVar(y);
+      auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
 
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      // TODO(Superjomn) find some way to determine which device to output the
-      // tensor.
-      // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
-                  FLAGS_tensorrt_engine_batch_size;
+      auto size =
+          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
       engine->GetOutputInGPU(
           output_maps[output_index],
           fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
+              boost::get<platform::CUDAPlace>(dev_place).device)),
           size * sizeof(float));
-
       output_index += 1;
     }
 
     cudaStreamSynchronize(*engine->stream());
   }
 
- protected:
-  void Prepare(const framework::ExecutionContext& context) const {
+  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
+               TensorRTEngine *engine) const {
     VLOG(4) << "Prepare engine";
-    // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    int workspace_size = context.Attr<int>("workspace_size");
-
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
+    block_desc.ParseFromString(Attr<std::string>("subgraph"));
 
     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
-
-    // TODO(Superjomn) replace this with a different stream
-    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
-        context.Attr<std::string>("engine_uniq_key"),
-        boost::get<platform::CUDAPlace>(context.GetPlace()).device);
+        Attr<std::vector<std::string>>("output_name_mapping");
 
     engine->InitNetwork();
 
     framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
     VLOG(4) << "parsed var size " << block.AllVars().size();
     // Add inputs
     VLOG(4) << "declare inputs";
-    for (auto& input : context.Inputs("Xs")) {
-      if (parameters.count(input)) continue;
+    for (auto &input : Inputs("Xs")) {
+      if (param_names_.count(input)) continue;
       VLOG(4) << "declare input " << input;
-      auto* var = block.FindVar(input);
+
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+
+      auto *var = block.FindVar(input);
       // TensorRT engine need to create parameters. The parameter's description
       // should be set in
       PADDLE_ENFORCE(var, "no variable called %s", input);
       PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                         "TensorRT engine only takes LoDTensor as input");
-      auto shape = var->GetShape();
-      // For the special batch_size placeholder -1, drop it and pass the real
-      // shape of data.
-      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
-      // variational batch size.
-      if (shape[0] == -1) {
-        shape[0] = FLAGS_tensorrt_engine_batch_size;
-      }
+
       engine->DeclareInput(
           input, FluidDataType2TRT(
                      var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(shape));
+          Vec2TRT_Dims(t_shape));
     }
-
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, parameters, context.scope(), engine);
+        .ConvertBlock(block_desc, param_names_, scope, engine);
 
     // Add outputs
-    for (auto& output : output_maps) {
-      if (!engine->HasDeclared(output)) {
-        engine->DeclareOutput(output);
-      }
+    for (auto &output : output_maps) {
+      engine->DeclareOutput(output);
     }
-
     engine->FreezeNetwork();
   }
 };
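Taken together, this header rewrite replaces the old kernel, which looked up a shared engine by engine_uniq_key in the global TRT_EngineManager and sized everything from FLAGS_tensorrt_engine_batch_size, with an OperatorBase that owns one lazily built TensorRTEngine per op instance and derives the runtime batch size from the first dimension of its inputs, bounded by max_batch_size. A simplified sketch of that lazy-initialization pattern (EngineOp and Engine here are illustrative placeholders, not Paddle's API):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

// Placeholder for a built TensorRT engine.
struct Engine {
  int max_batch;
  void Execute(int batch) { std::cout << "run batch " << batch << "\n"; }
};

class EngineOp {
 public:
  explicit EngineOp(int max_batch_size) : max_batch_size_(max_batch_size) {}

  // Mirrors RunTrt: build the engine on first use, then derive the batch
  // size from the input shapes instead of a global flag.
  void Run(const std::vector<std::vector<int64_t>> &input_shapes) {
    if (engine_ == nullptr) {
      engine_.reset(new Engine{max_batch_size_});  // lazy, per-op instance
    }
    int runtime_batch = 1;
    for (const auto &shape : input_shapes) {
      runtime_batch = static_cast<int>(shape[0]);  // batch is dimension 0
    }
    assert(runtime_batch <= max_batch_size_);
    engine_->Execute(runtime_batch);
  }

 private:
  int max_batch_size_;
  std::unique_ptr<Engine> engine_;  // owned by this op, not a global manager
};

int main() {
  EngineOp op(/*max_batch_size=*/8);
  op.Run({{4, 3, 224, 224}});  // engine built here, runs with batch 4
  op.Run({{2, 3, 224, 224}});  // reuses the same engine, runs with batch 2
  return 0;
}

Because each op instance owns its engine, no cross-instance key is needed, which is why the engine_uniq_key attribute and the static counter in the subgraph pass could be removed.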

paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
-USE_CUDA_ONLY_OP(tensorrt_engine);
-
+USE_NO_KERNEL_OP(tensorrt_engine);
 namespace paddle {
 namespace operators {
