@@ -30,16 +30,24 @@ void TensorRTEngine::Build(const DescType& paddle_model) {
 }
 
 void TensorRTEngine::Execute(int batch_size) {
-  infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
+  std::vector<void*> buffers;
+  for (auto& buf : buffers_) {
+    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
+    PADDLE_ENFORCE_GT(buf.max_size, 0);
+    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+    buffers.push_back(buf.buffer);
+  }
+  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
 }
 
 TensorRTEngine::~TensorRTEngine() {
   // clean buffer
-  for (auto& buffer : buffers_) {
-    if (buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
-      buffer = nullptr;
+  for (auto& buf : buffers_) {
+    if (buf.buffer != nullptr) {
+      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
+      buf.buffer = nullptr;
+      buf.max_size = 0;
     }
   }
 }
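The new Execute(), destructor, and FreezeNetwork() code all track allocations through a Buffer record instead of a bare void*. The struct definition is not part of this diff; the following is a minimal sketch inferred only from the member accesses visible here (buffer, size, max_size, device and DeviceType::GPU), so the actual declaration in the engine header may differ.

```cpp
// Hypothetical sketch of the Buffer record implied by this diff; only the
// members used in the hunks are shown, with defaults chosen for illustration.
#include <cstddef>

enum class DeviceType { UNKNOWN, CPU, GPU };  // only GPU is exercised in this diff

struct Buffer {
  void* buffer{nullptr};                   // device pointer from cudaMalloc
  size_t size{0};                          // bytes currently in use
  size_t max_size{0};                      // bytes allocated (capacity)
  DeviceType device{DeviceType::UNKNOWN};  // where the memory lives
};
```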
@@ -59,15 +67,19 @@ void TensorRTEngine::FreezeNetwork() {
   infer_context_.reset(infer_engine_->createExecutionContext());
 
   // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size(), nullptr);
+  buffers_.resize(buffer_sizes_.size());
   for (auto& item : buffer_sizes_) {
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
                     AccumDims(infer_engine_->getBindingDimensions(slot_offset));
     }
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
+    auto& buf = buffer(item.first);
+    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    buf.size = buf.max_size = item.second;
+    buf.device = DeviceType::GPU;
   }
 }
 
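FreezeNetwork() derives each binding's byte size as element size times element count. Neither kDataTypeSize nor AccumDims is defined in this diff; the sketch below shows one plausible shape for them, assuming TensorRT's nvinfer1::DataType enumeration order, purely for illustration.

```cpp
// Hypothetical helpers matching the size expression above; the real
// definitions live elsewhere in the engine sources and may differ.
#include <NvInfer.h>

// Bytes per element, indexed by static_cast<int>(nvinfer1::DataType):
// kFLOAT, kHALF, kINT8, kINT32.
static const int kDataTypeSize[] = {4, 2, 1, 4};

// Product of a binding's dimensions, i.e. its element count per batch.
static int AccumDims(nvinfer1::Dims dims) {
  int num = 1;
  for (int i = 0; i < dims.nbDims; ++i) num *= dims.d[i];
  return num;
}
```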
@@ -113,7 +125,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
 }
 
 void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
-  return buffer(name);
+  return buffer(name).buffer;
 }
 
 void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
@@ -123,11 +135,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
-void*& TensorRTEngine::buffer(const std::string& name) {
+Buffer& TensorRTEngine::buffer(const std::string& name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -137,10 +151,12 @@ void*& TensorRTEngine::buffer(const std::string& name) {
 
 void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                      size_t size) {
-  void* buf = buffer(name);
-  cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
-  PADDLE_ENFORCE_EQ(
-      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyHostToDevice, *stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string& name,
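For context, a hedged caller-side sketch of the API touched by this diff. Only SetInputFromCPU, Execute and GetOutputInCPU appear above; the engine setup, the binding names "x" and "y", and the element counts are assumptions made for illustration, and the engine header include is elided.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical end-to-end call sequence against an already built and frozen
// TensorRTEngine; everything not shown in the diff is assumed.
void RunOnce(TensorRTEngine* engine, int batch_size, size_t in_elems,
             size_t out_elems) {
  std::vector<float> input(batch_size * in_elems, 1.f);
  std::vector<float> output(batch_size * out_elems);

  // Host -> device copy into the pre-allocated Buffer, now size-checked
  // against Buffer::max_size.
  engine->SetInputFromCPU("x", input.data(), input.size() * sizeof(float));

  // Execute() gathers the raw device pointers out of the Buffer records
  // before handing them to enqueue().
  engine->Execute(batch_size);

  // Device -> host copy of the result; the last argument is the destination
  // capacity in bytes.
  engine->GetOutputInCPU("y", output.data(), output.size() * sizeof(float));
}
```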