27
27
28
28
namespace paddle {
29
29
30
- DECLARE_int32 (tensorrt_engine_batch_size);
31
-
32
30
namespace operators {
33
31
34
32
using FluidDT = framework::proto::VarType_Type;
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
49
47
return TRT_DT::kINT32 ;
50
48
}
51
49
52
- nvinfer1::Dims Vec2TRT_Dims (const std::vector<int64_t >& shape) {
50
+ nvinfer1::Dims Vec2TRT_Dims (const std::vector<int64_t > & shape) {
53
51
PADDLE_ENFORCE_GT (shape.size (), 1UL ,
54
52
" TensorRT' tensor input requires at least 2 dimensions" );
55
53
PADDLE_ENFORCE_LE (shape.size (), 4UL ,
@@ -63,171 +61,153 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
63
61
} // namespace // NOLINT
64
62
65
63
using inference::Singleton;
66
- using inference::tensorrt::TRT_EngineManager;
64
+ using inference::tensorrt::TensorRTEngine;
65
+
66
+ class TensorRTEngineOp : public framework ::OperatorBase {
67
+ private:
68
+ std::vector<std::string> input_names_;
69
+ std::unordered_set<std::string> param_names_;
70
+ mutable std::unique_ptr<TensorRTEngine> trt_engine_;
71
+ int max_batch_size_;
72
+ int workspace_size_;
67
73
68
- class TensorRTEngineOp : public framework ::OperatorWithKernel {
69
74
public:
70
- using framework::OperatorWithKernel::OperatorWithKernel;
75
+ TensorRTEngineOp (const std::string &type,
76
+ const framework::VariableNameMap &inputs,
77
+ const framework::VariableNameMap &outputs,
78
+ const framework::AttributeMap &attrs)
79
+ : framework::OperatorBase(type, inputs, outputs, attrs) {
80
+ input_names_ = Inputs (" Xs" );
81
+ max_batch_size_ = Attr<int >(" max_batch_size" );
82
+ workspace_size_ = Attr<int >(" workspace_size" );
83
+
84
+ auto params = Attr<std::vector<std::string>>(" parameters" );
85
+ for (const auto &param : params) {
86
+ param_names_.insert (param);
87
+ }
88
+ }
71
89
72
90
protected:
73
- void InferShape (framework::InferShapeContext* ctx) const override {}
74
-
75
- framework::OpKernelType GetExpectedKernelType (
76
- const framework::ExecutionContext& ctx) const override {
77
- auto input0 = ctx.Inputs (" Xs" ).front ();
78
- framework::OpKernelType kt = framework::OpKernelType (
79
- framework::ToDataType (ctx.scope ()
80
- .FindVar (input0)
81
- ->GetMutable <framework::LoDTensor>()
82
- ->type ()),
83
- ctx.GetPlace ());
84
- return kt;
91
+ void RunImpl (const framework::Scope &scope,
92
+ const platform::Place &dev_place) const override {
93
+ RunTrt (scope, dev_place);
85
94
}
86
- };
87
95
88
- template < typename DeviceContext, typename T>
89
- class TensorRTEngineKernel : public framework ::OpKernel<T> {
90
- public:
91
- void Compute ( const framework::ExecutionContext& context) const override {
92
- auto engine_name = context. Attr <std::string>( " engine_uniq_key " );
93
- int max_batch_size = context. Attr < int >( " max_batch_size " );
94
- if (!Singleton<TRT_EngineManager>:: Global (). HasEngine (engine_name)) {
95
- Prepare (context );
96
+ void RunTrt ( const framework::Scope &scope,
97
+ const platform::Place &dev_place) const {
98
+ int runtime_batch = 1 ;
99
+ if (trt_engine_. get () == nullptr ) {
100
+ trt_engine_. reset ( new TensorRTEngine (
101
+ max_batch_size_, workspace_size_, nullptr ,
102
+ boost::get<platform::CUDAPlace>(dev_place). device ));
103
+ Prepare (scope, dev_place, trt_engine_. get () );
96
104
}
97
- auto * engine = Singleton<TRT_EngineManager>::Global ().Get (engine_name);
98
- auto input_names = context.op ().Inputs (" Xs" );
99
- PADDLE_ENFORCE (!input_names.empty (), " should pass more than one inputs" );
100
- PADDLE_ENFORCE_LE (FLAGS_tensorrt_engine_batch_size, max_batch_size);
105
+
106
+ auto *engine = trt_engine_.get ();
107
+ PADDLE_ENFORCE (!input_names_.empty (), " should pass more than one inputs" );
101
108
102
109
std::vector<std::string> output_maps =
103
- context. Attr <std::vector<std::string>>(" output_name_mapping" );
110
+ Attr<std::vector<std::string>>(" output_name_mapping" );
104
111
105
- auto params = context.Attr <std::vector<std::string>>(" parameters" );
106
- std::unordered_set<std::string> parameters;
107
- for (const auto & param : params) {
108
- parameters.insert (param);
109
- }
110
112
// Convert input tensor from fluid to engine.
111
- for (const auto & x : context. Inputs (" Xs" )) {
112
- if (parameters .count (x)) continue ;
113
+ for (const auto & x : Inputs (" Xs" )) {
114
+ if (param_names_ .count (x)) continue ;
113
115
// convert input and copy to TRT engine's buffer
114
- auto & t = inference::analysis::GetFromScope<framework::LoDTensor>(
115
- context.scope (), x);
116
+ auto &t =
117
+ inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
118
+ auto t_shape = framework::vectorize (t.dims ());
119
+ runtime_batch = t_shape[0 ];
116
120
if (platform::is_cpu_place (t.place ())) {
117
- engine->SetInputFromCPU (x, static_cast <const void *>(t.data <void >()),
121
+ engine->SetInputFromCPU (x, static_cast <const void *>(t.data <void >()),
118
122
t.memory_size ());
119
123
} else {
120
- engine->SetInputFromGPU (x, static_cast <const void *>(t.data <void >()),
124
+ engine->SetInputFromGPU (x, static_cast <const void *>(t.data <void >()),
121
125
t.memory_size ());
122
126
}
123
127
}
128
+
129
+ PADDLE_ENFORCE_LE (runtime_batch, max_batch_size_);
124
130
// Execute the engine.
125
- PADDLE_ENFORCE_GT (FLAGS_tensorrt_engine_batch_size, 0 );
126
- engine->Execute (FLAGS_tensorrt_engine_batch_size);
131
+ engine->Execute (runtime_batch);
127
132
128
133
// Convert output tensor from engine to fluid
129
134
int output_index = 0 ;
130
135
VLOG (4 ) << " TensorRT Engine Op Outputs:" ;
131
- for (const auto & y : context. Outputs (" Ys" )) {
136
+ for (const auto & y : Outputs (" Ys" )) {
132
137
VLOG (4 ) << y;
133
138
// convert output and copy to fluid.
134
- nvinfer1::ITensor* trt_t = engine->GetITensor (output_maps[output_index]);
139
+ nvinfer1::ITensor * trt_t = engine->GetITensor (output_maps[output_index]);
135
140
auto dims = trt_t ->getDimensions ();
136
141
// Use the output ITensor's dims to reshape the Fluid Tensor.
137
142
// The ITensor doesn't contain the batch size dim.
138
143
std::vector<int > ddim;
139
- ddim.push_back (FLAGS_tensorrt_engine_batch_size );
144
+ ddim.push_back (runtime_batch );
140
145
for (int i = 0 ; i < dims.nbDims ; i++) {
141
146
ddim.push_back (dims.d [i]);
142
147
}
143
148
144
- auto * fluid_v = context. scope () .FindVar (y);
149
+ auto * fluid_v = scope.FindVar (y);
145
150
PADDLE_ENFORCE_NOT_NULL (fluid_v, " no output variable called %s" , y);
146
- auto * fluid_t = fluid_v->GetMutable <framework::LoDTensor>();
151
+ auto * fluid_t = fluid_v->GetMutable <framework::LoDTensor>();
147
152
148
153
fluid_t ->Resize (framework::make_ddim (ddim));
149
154
150
- // TODO(Superjomn) find some way to determine which device to output the
151
- // tensor.
152
- // if (platform::is_cpu_place(fluid_t->place())) {
153
155
// TODO(Superjomn) change this float to dtype size.
154
- auto size = inference::analysis::AccuDims (dims. d , dims. nbDims ) *
155
- FLAGS_tensorrt_engine_batch_size ;
156
+ auto size =
157
+ inference::analysis::AccuDims (dims. d , dims. nbDims ) * runtime_batch ;
156
158
engine->GetOutputInGPU (
157
159
output_maps[output_index],
158
160
fluid_t ->mutable_data <float >(platform::CUDAPlace (
159
- boost::get<platform::CUDAPlace>(context. GetPlace () ).device )),
161
+ boost::get<platform::CUDAPlace>(dev_place ).device )),
160
162
size * sizeof (float ));
161
-
162
163
output_index += 1 ;
163
164
}
164
165
165
166
cudaStreamSynchronize (*engine->stream ());
166
167
}
167
168
168
- protected:
169
- void Prepare ( const framework::ExecutionContext& context ) const {
169
+ void Prepare ( const framework::Scope &scope, const platform::Place &dev_place,
170
+ TensorRTEngine *engine ) const {
170
171
VLOG (4 ) << " Prepare engine" ;
171
- // Get the ProgramDesc and pass to convert.
172
172
framework::proto::BlockDesc block_desc;
173
- block_desc.ParseFromString (context.Attr <std::string>(" subgraph" ));
174
- int max_batch_size = context.Attr <int >(" max_batch_size" );
175
- int workspace_size = context.Attr <int >(" workspace_size" );
176
-
177
- auto params = context.Attr <std::vector<std::string>>(" parameters" );
178
- std::unordered_set<std::string> parameters;
179
- for (const auto & param : params) {
180
- parameters.insert (param);
181
- }
173
+ block_desc.ParseFromString (Attr<std::string>(" subgraph" ));
182
174
183
175
std::vector<std::string> output_maps =
184
- context.Attr <std::vector<std::string>>(" output_name_mapping" );
185
-
186
- // TODO(Superjomn) replace this with a different stream
187
- auto * engine = Singleton<TRT_EngineManager>::Global ().Create (
188
- max_batch_size, workspace_size, nullptr /* engine hold its own stream*/ ,
189
- context.Attr <std::string>(" engine_uniq_key" ),
190
- boost::get<platform::CUDAPlace>(context.GetPlace ()).device );
176
+ Attr<std::vector<std::string>>(" output_name_mapping" );
191
177
192
178
engine->InitNetwork ();
193
179
194
180
framework::BlockDesc block (nullptr /* programdesc*/ , &block_desc);
195
181
VLOG (4 ) << " parsed var size " << block.AllVars ().size ();
196
182
// Add inputs
197
183
VLOG (4 ) << " declare inputs" ;
198
- for (auto & input : context. Inputs (" Xs" )) {
199
- if (parameters .count (input)) continue ;
184
+ for (auto & input : Inputs (" Xs" )) {
185
+ if (param_names_ .count (input)) continue ;
200
186
VLOG (4 ) << " declare input " << input;
201
- auto * var = block.FindVar (input);
187
+
188
+ auto &t =
189
+ inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
190
+ auto t_shape = framework::vectorize (t.dims ());
191
+
192
+ auto *var = block.FindVar (input);
202
193
// TensorRT engine need to create parameters. The parameter's description
203
194
// should be set in
204
195
PADDLE_ENFORCE (var, " no variable called %s" , input);
205
196
PADDLE_ENFORCE_EQ (var->GetType (), FluidDT::VarType_Type_LOD_TENSOR,
206
197
" TensorRT engine only takes LoDTensor as input" );
207
- auto shape = var->GetShape ();
208
- // For the special batch_size placeholder -1, drop it and pass the real
209
- // shape of data.
210
- // TODO(Superjomn) fix this with batch broadcast, or it can't handle
211
- // variational batch size.
212
- if (shape[0 ] == -1 ) {
213
- shape[0 ] = FLAGS_tensorrt_engine_batch_size;
214
- }
198
+
215
199
engine->DeclareInput (
216
200
input, FluidDataType2TRT (
217
201
var->Proto ()->type ().lod_tensor ().tensor ().data_type ()),
218
- Vec2TRT_Dims (shape ));
202
+ Vec2TRT_Dims (t_shape ));
219
203
}
220
-
221
204
inference::Singleton<inference::tensorrt::OpConverter>::Global ()
222
- .ConvertBlock (block_desc, parameters, context. scope () , engine);
205
+ .ConvertBlock (block_desc, param_names_, scope, engine);
223
206
224
207
// Add outputs
225
- for (auto & output : output_maps) {
226
- if (!engine->HasDeclared (output)) {
227
- engine->DeclareOutput (output);
228
- }
208
+ for (auto &output : output_maps) {
209
+ engine->DeclareOutput (output);
229
210
}
230
-
231
211
engine->FreezeNetwork ();
232
212
}
233
213
};
0 commit comments