Skip to content

Commit bf5ce62

Browse files
author
Yibing Liu
committed
Merge branch 'develop' of upstream into fix_docs
2 parents 316eb3e + 566a940 commit bf5ce62

20 files changed

+494
-179
lines changed

paddle/fluid/inference/tensorrt/convert/op_converter.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ class OpConverter {
6464
(*it)(op, scope, test_mode);
6565
}
6666

67-
// convert fluid block to tensorrt network
67+
// Convert a fluid block to a tensorrt network. NOTE: it only converts operators,
68+
// the INetwork's inputs and outputs should be specified in some other modules.
6869
void ConvertBlock(const framework::proto::BlockDesc& block,
6970
const std::unordered_set<std::string>& parameters,
7071
const framework::Scope& scope, TensorRTEngine* engine) {

paddle/fluid/inference/tensorrt/engine.h

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
5151
nvinfer1::Weights w_;
5252
};
5353

54-
TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
54+
TensorRTEngine(int max_batch, int max_workspace,
55+
cudaStream_t* stream = nullptr,
5556
nvinfer1::ILogger& logger = NaiveLogger::Global())
5657
: max_batch_(max_batch),
5758
max_workspace_(max_workspace),
58-
stream_(stream),
59+
stream_(stream ? stream : &default_stream_),
5960
logger_(logger) {}
6061

6162
virtual ~TensorRTEngine();
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
121122
// the max memory size the engine uses
122123
int max_workspace_;
123124
cudaStream_t* stream_;
125+
// If stream_ is not set from outside, hold its own stream.
126+
cudaStream_t default_stream_;
124127
nvinfer1::ILogger& logger_;
125128

126129
std::vector<Buffer> buffers_;
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
165168
*/
166169
class TRT_EngineManager {
167170
public:
168-
TensorRTEngine* Create(int max_batch, int max_workspace,
169-
cudaStream_t* stream) {
170-
engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
171-
return engines_.back().get();
171+
bool HasEngine(const std::string& name) const {
172+
return engines_.count(name) != 0;
173+
}
174+
175+
// Get an engine called `name`.
176+
TensorRTEngine* Get(const std::string& name) const {
177+
return engines_.at(name).get();
178+
}
179+
180+
// Create an engine called `name`, replacing any existing engine with that name.
181+
TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
182+
const std::string& name) {
183+
auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
184+
engines_[name].reset(p);
185+
return p;
172186
}
173187

174188
void DeleteALl() {
175-
for (auto& ptr : engines_) {
176-
ptr.reset(nullptr);
189+
for (auto& item : engines_) {
190+
item.second.reset(nullptr);
177191
}
178192
}
179193

180194
private:
181-
std::vector<std::unique_ptr<TensorRTEngine>> engines_;
195+
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
182196
};
183197

184198
} // namespace tensorrt

paddle/fluid/operators/activation_op.cc

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ Sigmoid Activation Operator
112112
__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
113113
Logsigmoid Activation Operator
114114
115-
$$out = \log \frac{1}{1 + e^{-x}}$$
115+
$$out = \\log \\frac{1}{1 + e^{-x}}$$
116116
117117
)DOC";
118118

@@ -252,15 +252,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
252252
AddOutput("Out", "Output of Softshrink operator");
253253
AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
254254
AddComment(R"DOC(
255-
Softshrink Activation Operator.
256-
257-
$$
258-
out = \begin{cases}
259-
x - \lambda, \text{if } x > \lambda \\
260-
x + \lambda, \text{if } x < -\lambda \\
261-
0, \text{otherwise}
262-
\end{cases}
263-
$$
255+
:strong:`Softshrink Activation Operator`
256+
257+
.. math::
258+
out = \begin{cases}
259+
x - \lambda, \text{if } x > \lambda \\
260+
x + \lambda, \text{if } x < -\lambda \\
261+
0, \text{otherwise}
262+
\end{cases}
264263
265264
)DOC");
266265
}

paddle/fluid/operators/detection/box_coder_op.cc

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
106106
"and M represents the number of deocded boxes.");
107107

108108
AddComment(R"DOC(
109-
Bounding Box Coder Operator.
109+
110+
Bounding Box Coder.
111+
110112
Encode/Decode the target bounding box with the priorbox information.
113+
111114
The Encoding schema described below:
112-
ox = (tx - px) / pw / pxv
113-
oy = (ty - py) / ph / pyv
114-
ow = log(abs(tw / pw)) / pwv
115-
oh = log(abs(th / ph)) / phv
115+
116+
ox = (tx - px) / pw / pxv
117+
118+
oy = (ty - py) / ph / pyv
119+
120+
ow = log(abs(tw / pw)) / pwv
121+
122+
oh = log(abs(th / ph)) / phv
123+
116124
The Decoding schema described below:
117-
ox = (pw * pxv * tx * + px) - tw / 2
118-
oy = (ph * pyv * ty * + py) - th / 2
119-
ow = exp(pwv * tw) * pw + tw / 2
120-
oh = exp(phv * th) * ph + th / 2
121-
where tx, ty, tw, th denote the target box's center coordinates, width and
122-
height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
123-
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
124-
of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
125-
width and height.
125+
126+
ox = (pw * pxv * tx * + px) - tw / 2
127+
128+
oy = (ph * pyv * ty * + py) - th / 2
129+
130+
ow = exp(pwv * tw) * pw + tw / 2
131+
132+
oh = exp(phv * th) * ph + th / 2
133+
134+
where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
135+
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
136+
priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
137+
`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
138+
encoded/decoded coordinates, width and height.
126139
)DOC");
127140
}
128141
};

paddle/fluid/operators/gaussian_random_batch_size_like_op.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
3636
void Apply() override {
3737
AddAttr<float>("mean",
3838
"(float, default 0.0) "
39-
"mean of random tensor.")
39+
"The mean (or center) of the gaussian distribution.")
4040
.SetDefault(.0f);
4141
AddAttr<float>("std",
4242
"(float, default 1.0) "
43-
"std of random tensor.")
43+
"The standard deviation (std, or spread) of the "
44+
"gaussian distribution.")
4445
.SetDefault(1.0f);
4546
AddAttr<int>("seed",
4647
"(int, default 0) "
@@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
5556
.SetDefault(framework::proto::VarType::FP32);
5657

5758
AddComment(R"DOC(
58-
GaussianRandom Operator.
5959
6060
Used to initialize tensors with gaussian random generator.
61+
The defalut mean of the distribution is 0. and defalut standard
62+
deviation (std) of the distribution is 1.. Uers can set mean and std
63+
by input arguments.
6164
)DOC");
6265
}
6366
};

paddle/fluid/operators/listen_and_serv_op.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
348348
};
349349

350350
void SignalHandler::StopAndExit(int signal_num) {
351-
VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
351+
// Do not use VLOG here, because the device used for printing may already be released.
352+
// exit will release internally allocated resources.
352353
exit(0);
353354
}
354355

paddle/fluid/operators/mean_op.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
3333
class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
3434
public:
3535
void Make() override {
36-
AddInput("X", "The input of mean op");
37-
AddOutput("Out", "The output of mean op").Reuse("X");
36+
AddInput("X", "(Tensor) The input of mean op");
37+
AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
3838
AddComment(R"DOC(
39-
Mean Operator.
40-
41-
Out is a scalar which is the mean of all elements in X.
39+
Mean Operator calculates the mean of all elements in X.
4240
4341
)DOC");
4442
}

paddle/fluid/operators/tensorrt_engine_op.cc

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
6666
} // namespace
6767

6868
template <typename DeviceContext, typename T>
69-
void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
69+
void TensorRTEngineKernel<DeviceContext, T>::Prepare(
7070
const framework::ExecutionContext &context) const {
7171
VLOG(4) << "Prepare engine";
7272
// Get the ProgramDesc and pass to convert.
7373
framework::proto::BlockDesc block_desc;
7474
block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
75-
max_batch_ = context.Attr<int>("max_batch");
75+
int max_batch = context.Attr<int>("max_batch");
7676
auto max_workspace = context.Attr<int>("max_workspace");
77-
engine_ = Singleton<TRT_EngineManager>::Global().Create(
78-
max_batch_, max_workspace, &stream_);
79-
engine_->InitNetwork();
77+
auto params = context.Attr<std::vector<std::string>>("parameters");
78+
std::unordered_set<std::string> parameters;
79+
for (const auto &param : params) {
80+
parameters.insert(param);
81+
}
82+
83+
// TODO(Superjomn) replace this with a different stream
84+
auto *engine = Singleton<TRT_EngineManager>::Global().Create(
85+
max_batch, max_workspace, nullptr /*engine hold its own stream*/,
86+
context.Attr<std::string>("engine_uniq_key"));
87+
engine->InitNetwork();
8088

8189
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
8290
// Add inputs
@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
8795
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
8896
"TensorRT engine only takes LoDTensor as input");
8997
auto shape = var->GetShape();
90-
engine_->DeclareInput(
98+
engine->DeclareInput(
9199
input, FluidDataType2TRT(
92100
var->Proto()->type().lod_tensor().tensor().data_type()),
93101
Vec2TRT_Dims(var->GetShape()));
94102
}
95103

96-
// TODO(Superjomn) parameters should be passed after analysised from outside.
97104
inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
98-
block_desc, {}, context.scope(), engine_);
105+
block_desc, parameters, context.scope(), engine);
99106

100107
// Add outputs
101108
VLOG(4) << "declare outputs";
102109
for (auto &output : context.Outputs("Ys")) {
103110
VLOG(4) << "declare output " << output;
104-
engine_->DeclareOutput(output);
111+
engine->DeclareOutput(output);
105112
}
106113

107-
engine_->FreezeNetwork();
114+
engine->FreezeNetwork();
108115
}
109116

110117
class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
113120
AddInput("Xs", "A list of inputs.").AsDuplicable();
114121
AddOutput("Ys", "A list of outputs").AsDuplicable();
115122
AddAttr<std::string>("subgraph", "the subgraph.");
123+
AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
116124
AddAttr<int>("max_batch", "the maximum batch size.");
117125
AddAttr<int>("max_workspace", "the maximum batch size.");
118126
AddComment("TensorRT engine operator.");

paddle/fluid/operators/tensorrt_engine_op.h

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@
1919
#include "paddle/fluid/framework/operator.h"
2020
#include "paddle/fluid/inference/analysis/helper.h"
2121
#include "paddle/fluid/inference/tensorrt/engine.h"
22+
#include "paddle/fluid/inference/tensorrt/engine.h"
2223

2324
namespace paddle {
2425
namespace operators {
2526

27+
using inference::Singleton;
28+
using inference::tensorrt::TRT_EngineManager;
29+
2630
class TensorRTEngineOp : public framework::OperatorWithKernel {
2731
public:
2832
using framework::OperatorWithKernel::OperatorWithKernel;
@@ -47,37 +51,39 @@ template <typename DeviceContext, typename T>
4751
class TensorRTEngineKernel : public framework::OpKernel<T> {
4852
public:
4953
void Compute(const framework::ExecutionContext& context) const override {
50-
if (!engine_) {
54+
auto engine_name = context.Attr<std::string>("engine_uniq_key");
55+
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
5156
Prepare(context);
5257
}
58+
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
5359
auto input_names = context.op().Inputs("Xs");
5460
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
5561
// Try to determine a batch_size
5662
auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
5763
context.scope(), input_names.front());
5864
int batch_size = tensor0.dims()[0];
59-
PADDLE_ENFORCE_LE(batch_size, max_batch_);
65+
PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
6066

6167
// Convert input tensor from fluid to engine.
6268
for (const auto& x : context.Inputs("Xs")) {
6369
// convert input and copy to TRT engine's buffer
6470
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
6571
context.scope(), x);
6672
if (platform::is_cpu_place(t.place())) {
67-
engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
68-
t.memory_size());
73+
engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
74+
t.memory_size());
6975
} else {
70-
engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
71-
t.memory_size());
76+
engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
77+
t.memory_size());
7278
}
7379
}
7480
// Execute the engine.
7581
PADDLE_ENFORCE_GT(batch_size, 0);
76-
engine_->Execute(batch_size);
82+
engine->Execute(batch_size);
7783
// Convert output tensor from engine to fluid
7884
for (const auto& y : context.Outputs("Ys")) {
7985
// convert output and copy to fluid.
80-
nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
86+
nvinfer1::ITensor* trt_t = engine->GetITensor(y);
8187
auto dims = trt_t->getDimensions();
8288
// Use the output ITensor's dims to reshape the Fluid Tensor.
8389
std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
8995
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
9096
if (platform::is_cpu_place(fluid_t->place())) {
9197
// TODO(Superjomn) change this float to dtype size.
92-
engine_->GetOutputInCPU(
98+
engine->GetOutputInCPU(
9399
y, fluid_t->mutable_data<float>(platform::CPUPlace()),
94100
size * sizeof(float));
95101
} else {
96-
engine_->GetOutputInGPU(
102+
engine->GetOutputInGPU(
97103
y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
98104
size * sizeof(float));
99105
}
100106
}
101107

102-
cudaStreamSynchronize(stream_);
108+
cudaStreamSynchronize(*engine->stream());
103109
}
104110

105111
protected:
106112
// Build the engine.
107113
void Prepare(const framework::ExecutionContext& context) const;
108-
109-
private:
110-
mutable cudaStream_t stream_;
111-
mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
112-
mutable int max_batch_{0};
113114
};
114115

115116
} // namespace operators

0 commit comments

Comments
 (0)