@@ -26,6 +26,8 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+int TensorRTEngine::runtime_batch_ = 1;
+
 void TensorRTEngine::Build(const DescType& paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) {
   }
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
+  SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
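The SetRuntimeBatch(batch_size) call recorded after enqueue is what lets the output copies later in this change shrink to the batch actually run, while buffers stay sized for the maximum batch. A rough standalone sketch of that arithmetic follows; Volume, MaxBufferBytes, and RuntimeCopyBytes are hypothetical helpers, not engine code.

#include <cstddef>
#include <numeric>
#include <vector>

// Stand-in for analysis::AccuDims: product of the dimension extents.
inline size_t Volume(const std::vector<int>& dims) {
  return std::accumulate(dims.begin(), dims.end(), static_cast<size_t>(1),
                         [](size_t acc, int d) { return acc * static_cast<size_t>(d); });
}

// Bytes reserved once at FreezeNetwork() time (worst case, max batch).
inline size_t MaxBufferBytes(const std::vector<int>& dims, size_t elem_size,
                             int max_batch) {
  return Volume(dims) * elem_size * max_batch;
}

// Bytes actually moved after Execute(batch_size) ran with a smaller batch.
inline size_t RuntimeCopyBytes(const std::vector<int>& dims, size_t elem_size,
                               int runtime_batch) {
  return Volume(dims) * elem_size * runtime_batch;
}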
@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() {
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims);
+                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
     }
     auto& buf = buffer(item.first);
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
     VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
             << buf.buffer;
-    buf.size = buf.max_size = item.second;
+    buf.size = item.second;
+    buf.max_size = item.second * max_batch_;
     buf.device = DeviceType::GPU;
   }
 }
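For reference, a hedged, self-contained sketch of the allocate-once-for-the-maximum-batch pattern that FreezeNetwork() follows above; AllocBindingBuffer is an illustrative name, not a Paddle API.

#include <cuda_runtime.h>
#include <cassert>
#include <cstddef>

// Reserve a binding buffer once for the largest batch the engine may see,
// then reuse it for every Execute() call with any smaller runtime batch.
void* AllocBindingBuffer(size_t per_batch_bytes, int max_batch) {
  void* buffer = nullptr;
  // Plays the role of PADDLE_ENFORCE_EQ(0, cudaMalloc(...)) in the diff.
  cudaError_t err = cudaMalloc(&buffer, per_batch_bytes * max_batch);
  assert(err == cudaSuccess);
  return buffer;  // caller releases it with cudaFree when the engine is destroyed
}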
@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
   auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
-                        analysis::AccuDims(dims.d, dims.nbDims);
+                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
   PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
   return buffer(name).buffer;
 }
 
-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
-                                    size_t max_size) {
+void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
   // determine data size
+  auto* output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
+
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
   auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 
-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
-                                    size_t max_size) {
+void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
   // determine data size
+
+  auto* output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
   auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
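A hypothetical caller-side sketch of the new GetOutputInCPU signature, assuming an engine that has already been frozen and executed; FetchOutput, per_sample_elems, and the output name "prob" are illustrative only, and the header path in the comment is assumed.

// #include "paddle/fluid/inference/tensorrt/engine.h"  // assumed header path
#include <cstddef>
#include <vector>

std::vector<float> FetchOutput(paddle::inference::tensorrt::TensorRTEngine* engine,
                               int batch_size, size_t per_sample_elems) {
  // With the new signature the caller no longer passes max_size; it simply
  // sizes the host buffer from the declared output shape and the batch it ran.
  std::vector<float> result(per_sample_elems * batch_size);
  // The engine derives the copy size from runtime_batch_ internally.
  engine->GetOutputInCPU("prob", result.data());
  return result;
}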
@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
   return itensor_map_[name];
 }
 
+void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
+  runtime_batch_ = batch_size;
+}
+
+int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
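A minimal sketch of the accessor pattern introduced here, using an illustrative EngineLike class rather than the real TensorRTEngine: the most recent batch lives in a class-level counter (shared across instances, as the static runtime_batch_ definition above implies) that defaults to 1 until an Execute() call records a real value.

#include <cstddef>

class EngineLike {
 public:
  void SetRuntimeBatch(size_t batch_size) {
    runtime_batch_ = static_cast<int>(batch_size);  // explicit cast added for the sketch
  }
  int GetRuntimeBatch() const { return runtime_batch_; }

 private:
  static int runtime_batch_;  // shared across engine instances
};

int EngineLike::runtime_batch_ = 1;  // mirrors TensorRTEngine::runtime_batch_ = 1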