Commit 5fd142c: bugfix/trt engine op (#11487)
1 parent 34ac0eb

5 files changed: +158 additions, -37 deletions

paddle/fluid/inference/tensorrt/convert/op_converter.h

Lines changed: 2 additions & 1 deletion

@@ -64,7 +64,8 @@ class OpConverter {
     (*it)(op, scope, test_mode);
   }
 
-  // convert fluid block to tensorrt network
+  // Convert a fluid block to tensorrt network, NOTE it just convert operators,
+  // the INetwork's inputs and outputs should specified in some other modules.
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
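For orientation, a minimal sketch of a ConvertBlock call with an explicit parameter set, mirroring the call this commit adds in tensorrt_engine_op.cc below (the weight names are illustrative, and block_desc, scope, and engine are assumed to already exist):

    // The parameter set names the variables holding trained weights, so the
    // converter reads them from the scope instead of declaring them as inputs.
    std::unordered_set<std::string> parameters = {"y0", "y1"};  // illustrative
    inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
        block_desc, parameters, scope, engine);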

paddle/fluid/inference/tensorrt/engine.h

Lines changed: 23 additions & 9 deletions

@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
+  TensorRTEngine(int max_batch, int max_workspace,
+                 cudaStream_t* stream = nullptr,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream),
+        stream_(stream ? stream : &default_stream_),
         logger_(logger) {}
 
   virtual ~TensorRTEngine();

@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
   // the max memory size the engine uses
   int max_workspace_;
   cudaStream_t* stream_;
+  // If stream_ is not set from outside, hold its own stream.
+  cudaStream_t default_stream_;
   nvinfer1::ILogger& logger_;
 
   std::vector<Buffer> buffers_;

@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
  */
 class TRT_EngineManager {
  public:
-  TensorRTEngine* Create(int max_batch, int max_workspace,
-                         cudaStream_t* stream) {
-    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
-    return engines_.back().get();
+  bool HasEngine(const std::string& name) const {
+    return engines_.count(name) != 0;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create or get an engine called `name`
+  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
+                         const std::string& name) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+    engines_[name].reset(p);
+    return p;
   }
 
   void DeleteALl() {
-    for (auto& ptr : engines_) {
-      ptr.reset(nullptr);
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
     }
   }
 
  private:
-  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };
 
 }  // namespace tensorrt
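With the manager keyed by name, a typical caller creates an engine once and looks it up everywhere else. A minimal sketch of the create-or-reuse pattern (the key "my_engine" and the sizes are illustrative, not values from this commit):

    using paddle::inference::Singleton;
    using paddle::inference::tensorrt::TRT_EngineManager;

    auto& manager = Singleton<TRT_EngineManager>::Global();
    if (!manager.HasEngine("my_engine")) {
      // A nullptr stream makes the engine hold its own default stream
      // (see the constructor change above).
      manager.Create(1 /*max_batch*/, 1 << 20 /*max_workspace*/,
                     nullptr /*stream*/, "my_engine");
    }
    auto* engine = manager.Get("my_engine");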

paddle/fluid/operators/tensorrt_engine_op.cc

Lines changed: 18 additions & 10 deletions

@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 }  // namespace
 
 template <typename DeviceContext, typename T>
-void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+void TensorRTEngineKernel<DeviceContext, T>::Prepare(
     const framework::ExecutionContext &context) const {
   VLOG(4) << "Prepare engine";
   // Get the ProgramDesc and pass to convert.
   framework::proto::BlockDesc block_desc;
   block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-  max_batch_ = context.Attr<int>("max_batch");
+  int max_batch = context.Attr<int>("max_batch");
   auto max_workspace = context.Attr<int>("max_workspace");
-  engine_ = Singleton<TRT_EngineManager>::Global().Create(
-      max_batch_, max_workspace, &stream_);
-  engine_->InitNetwork();
+  auto params = context.Attr<std::vector<std::string>>("parameters");
+  std::unordered_set<std::string> parameters;
+  for (const auto &param : params) {
+    parameters.insert(param);
+  }
+
+  // TODO(Superjomn) replace this with a different stream
+  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+      context.Attr<std::string>("engine_uniq_key"));
+  engine->InitNetwork();
 
   framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
   // Add inputs

@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
     PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                       "TensorRT engine only takes LoDTensor as input");
     auto shape = var->GetShape();
-    engine_->DeclareInput(
+    engine->DeclareInput(
         input, FluidDataType2TRT(
                    var->Proto()->type().lod_tensor().tensor().data_type()),
         Vec2TRT_Dims(var->GetShape()));
   }
 
-  // TODO(Superjomn) parameters should be passed after analysised from outside.
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block_desc, {}, context.scope(), engine_);
+      block_desc, parameters, context.scope(), engine);
 
   // Add outputs
   VLOG(4) << "declare outputs";
   for (auto &output : context.Outputs("Ys")) {
     VLOG(4) << "declare output " << output;
-    engine_->DeclareOutput(output);
+    engine->DeclareOutput(output);
   }
 
-  engine_->FreezeNetwork();
+  engine->FreezeNetwork();
 }
 
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {

@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
    AddAttr<int>("max_batch", "the maximum batch size.");
     AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
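With this maker, a tensorrt_engine op carries five attributes. A hedged sketch of wiring them onto an OpDesc, following the pattern the test file below uses (SetAttr is that test's local helper; the key and sizes here are illustrative):

    framework::OpDesc desc(nullptr);
    desc.SetType("tensorrt_engine");
    desc.SetInput("Xs", std::vector<std::string>({"x0"}));
    desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
    SetAttr<std::string>(desc.Proto(), "subgraph", block->SerializeAsString());
    SetAttr<std::string>(desc.Proto(), "engine_uniq_key", "my_engine");
    SetAttr<int>(desc.Proto(), "max_batch", 32);
    SetAttr<int>(desc.Proto(), "max_workspace", 1 << 10);
    SetAttr<std::vector<std::string>>(desc.Proto(), "parameters",
                                      std::vector<std::string>({}));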

paddle/fluid/operators/tensorrt_engine_op.h

Lines changed: 17 additions & 16 deletions

@@ -19,10 +19,14 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
 namespace operators {
 
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
 class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

@@ -47,37 +51,39 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    if (!engine_) {
+    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
     }
+    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     // Try to determine a batch_size
     auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
         context.scope(), input_names.front());
     int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
 
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
       // convert input and copy to TRT engine's buffer
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
           context.scope(), x);
       if (platform::is_cpu_place(t.place())) {
-        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
       } else {
-        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
       }
     }
     // Execute the engine.
     PADDLE_ENFORCE_GT(batch_size, 0);
-    engine_->Execute(batch_size);
+    engine->Execute(batch_size);
     // Convert output tensor from engine to fluid
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       std::vector<int> ddim(dims.d, dims.d + dims.nbDims);

@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
       if (platform::is_cpu_place(fluid_t->place())) {
         // TODO(Superjomn) change this float to dtype size.
-        engine_->GetOutputInCPU(
+        engine->GetOutputInCPU(
             y, fluid_t->mutable_data<float>(platform::CPUPlace()),
             size * sizeof(float));
       } else {
-        engine_->GetOutputInGPU(
+        engine->GetOutputInGPU(
             y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
             size * sizeof(float));
       }
     }
 
-    cudaStreamSynchronize(stream_);
+    cudaStreamSynchronize(*engine->stream());
   }
 
  protected:
   // Build the engine.
   void Prepare(const framework::ExecutionContext& context) const;
-
- private:
-  mutable cudaStream_t stream_;
-  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
-  mutable int max_batch_{0};
 };
 
 }  // namespace operators
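The kernel is now stateless: the engine, its stream, and the batch limit all come from attributes and the global manager instead of mutable members, so repeated runs that share one engine_uniq_key reuse a single engine. Condensed, the new Compute() control flow is roughly:

    auto name = context.Attr<std::string>("engine_uniq_key");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(name)) {
      Prepare(context);  // builds the engine and registers it under `name`
    }
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(name);
    // ... copy Xs into the engine's buffers ...
    engine->Execute(batch_size);
    // ... copy Ys back to fluid tensors ...
    cudaStreamSynchronize(*engine->stream());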

paddle/fluid/operators/tensorrt_engine_op_test.cc

Lines changed: 98 additions & 1 deletion

@@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
   attr->set_type(paddle::framework::proto::AttrType::LONG);
   attr->set_l(data);
 }
+template <>
+void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
+                                       const std::string& name,
+                                       const std::vector<std::string>& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
+  for (const auto& s : data) {
+    attr->add_strings(s.c_str());
+  }
+}
 
 }  // namespace
 

@@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
   SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
+                                    std::vector<std::string>({}));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+  LOG(INFO) << "engine_op " << engine_op.get();
 
   framework::Scope scope;
   platform::CPUPlace place;

@@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
   engine_op->Run(scope, place);
 }
 
+void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  using shape_t = std::vector<int64_t>;
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+
+  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
+                        const std::string& z_name, bool x_created,
+                        const shape_t& x_shape, const shape_t& y_shape,
+                        const shape_t& z_shape) {
+    LOG(INFO) << "create fc op";
+    auto* fc = block_desc.AppendOp();
+    fc->SetType("mul");
+    fc->SetInput("X", std::vector<std::string>({x_name}));
+    fc->SetInput("Y", std::vector<std::string>({y_name}));
+    fc->SetOutput("Out", std::vector<std::string>({z_name}));
+
+    // Set inputs' variable shape in BlockDesc
+    if (!x_created) {
+      AddTensorToBlockDesc(block_, x_name,
+                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
+    }
+    AddTensorToBlockDesc(block_, y_name,
+                         std::vector<int64_t>({input_dim, output_dim}));
+    AddTensorToBlockDesc(block_, z_name,
+                         std::vector<int64_t>({batch_size, output_dim}));
+
+    // Prepare variables.
+    if (!x_created) {
+      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
+    }
+    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
+    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
+
+    // It is wired, need to copy manually.
+    *block_->add_ops() = *fc->Proto();
+  };
+
+  // Test with 4 layer FC
+  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
+             {input_dim, output_dim}, {batch_size, output_dim});
+  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
+
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<std::vector<std::string>>(
+      engine_op_desc.Proto(), "parameters",
+      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
+
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  // Execute them.
+  engine_op->Run(scope, place);
+}
+
+// Test with a larger FC layer.
+TEST(TensorRTEngineOp, fc) { Execute(40, 256, 256); }
+
 }  // namespace operators
 }  // namespace paddle
