
Commit 8190381

Superjomn authored and luotao1 committed
Feature/engine refactor (#10497)
* init refactor
* init
* update some comment
* fix build
* fix error
* fix bug
* fix comment
* update
1 parent c7c62e0 commit 8190381

5 files changed: 54 additions (+), 20 deletions (−)


paddle/fluid/inference/engine.h

Lines changed: 15 additions & 0 deletions
@@ -19,6 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 
+struct Buffer;
+enum class DeviceType { UNK = -1, CPU, GPU };
+
 /*
  * EngineBase is the base class of all inference engines. An inference engine
  * takes a paddle program as input, and outputs the result in fluid Tensor
@@ -45,8 +48,20 @@ class EngineBase {
   // Execute the engine, that will run the inference network.
   virtual void Execute(int batch_size) = 0;
 
+  // Return the IO buffer that allocated in engine. One can read/write directly
+  // on the buffer. If the buffer's buffer is nullptr, one can also allocate
+  // memory and maintain it outside the engine.
+  virtual Buffer& buffer(const std::string& name) = 0;
+
   virtual ~EngineBase() {}
 };  // class EngineBase
 
+struct Buffer {
+  void* buffer{nullptr};  // buffer should be allocated only once.
+  int max_size;           // buffer allocated space.
+  int size;               // data size.
+  DeviceType device{DeviceType::UNK};  // tells which device this buffer is on.
+};
+
 }  // namespace inference
 }  // namespace paddle
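
The new Buffer struct and the pure-virtual buffer() accessor define how callers interact with engine-owned IO memory. As a rough illustration only (the engine instance, the "output" tensor name, and the stream below are hypothetical, not part of this commit), reading an output directly from the engine-owned device buffer could look like:

#include <cassert>
#include <vector>
#include <cuda_runtime.h>
#include "paddle/fluid/inference/engine.h"

// Sketch: copy the valid bytes of an engine-owned GPU buffer back to the host.
// The tensor name "output" and `stream` are placeholders for illustration.
void ReadOutput(paddle::inference::EngineBase& engine, cudaStream_t stream,
                std::vector<char>* host_out) {
  auto& buf = engine.buffer("output");            // engine-owned IO buffer
  assert(buf.device == paddle::inference::DeviceType::GPU);
  assert(buf.buffer != nullptr && buf.size > 0);  // allocated by the engine
  host_out->resize(buf.size);
  // Copy only the valid data (buf.size), not the whole allocation (buf.max_size).
  cudaMemcpyAsync(host_out->data(), buf.buffer, buf.size,
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}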
paddle/fluid/inference/tensorrt/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -1,4 +1,5 @@
+nv_library(tensorrt_engine SRCS engine.cc)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
-nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
-set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
+nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
+
 add_subdirectory(convert)
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
-nv_test(test_trt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc
-    DEPS ${FLUID_CORE_MODULES} activation_op)
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+    DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
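
Net effect of the two CMakeLists changes: engine.cc is now compiled once into a tensorrt_engine library target, and the TensorRT tests link against that target instead of recompiling the source through the old ENGINE_FILE variable.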

paddle/fluid/inference/tensorrt/engine.cc

Lines changed: 30 additions & 14 deletions
@@ -30,16 +30,24 @@ void TensorRTEngine::Build(const DescType& paddle_model) {
 }
 
 void TensorRTEngine::Execute(int batch_size) {
-  infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
+  std::vector<void*> buffers;
+  for (auto& buf : buffers_) {
+    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
+    PADDLE_ENFORCE_GT(buf.max_size, 0);
+    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+    buffers.push_back(buf.buffer);
+  }
+  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
 }
 
 TensorRTEngine::~TensorRTEngine() {
   // clean buffer
-  for (auto& buffer : buffers_) {
-    if (buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
-      buffer = nullptr;
+  for (auto& buf : buffers_) {
+    if (buf.buffer != nullptr) {
+      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
+      buf.buffer = nullptr;
+      buf.max_size = 0;
     }
   }
 }
@@ -59,15 +67,19 @@ void TensorRTEngine::FreezeNetwork() {
   infer_context_.reset(infer_engine_->createExecutionContext());
 
   // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size(), nullptr);
+  buffers_.resize(buffer_sizes_.size());
   for (auto& item : buffer_sizes_) {
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
                     AccumDims(infer_engine_->getBindingDimensions(slot_offset));
     }
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
+    auto& buf = buffer(item.first);
+    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    buf.size = buf.max_size = item.second;
+    buf.device = DeviceType::GPU;
   }
 }
 
@@ -113,7 +125,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
 }
 
 void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
-  return buffer(name);
+  return buffer(name).buffer;
 }
 
 void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
@@ -123,11 +135,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
-void*& TensorRTEngine::buffer(const std::string& name) {
+Buffer& TensorRTEngine::buffer(const std::string& name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -137,10 +151,12 @@ void*& TensorRTEngine::buffer(const std::string& name) {
 
 void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                      size_t size) {
-  void* buf = buffer(name);
-  cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
-  PADDLE_ENFORCE_EQ(
-      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyHostToDevice, *stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string& name,
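
Putting the touched helpers together, a hedged end-to-end sketch (assuming an engine whose network has already been declared and frozen; the tensor names and the pre-sized output vector are illustrative only, not part of this commit):

#include <vector>
#include <cuda_runtime.h>
#include "paddle/fluid/inference/tensorrt/engine.h"

// Sketch: feed one input, run inference, fetch one output with the new API.
// Assumes FreezeNetwork() has run, so the engine-owned GPU buffers exist,
// and that `output` is already sized to the declared output tensor.
void RunOnce(paddle::inference::tensorrt::TensorRTEngine* engine,
             const std::vector<float>& input, std::vector<float>* output) {
  // Host -> engine-owned GPU buffer; size is validated against Buffer::max_size.
  engine->SetInputFromCPU("input", const_cast<float*>(input.data()),
                          input.size() * sizeof(float));
  // Enqueue the TensorRT execution; Execute() synchronizes the stream itself.
  engine->Execute(1 /*batch_size*/);
  // Engine-owned GPU buffer -> host; this copy is asynchronous.
  engine->GetOutputInCPU("output", output->data(),
                         output->size() * sizeof(float));
  // The copy above runs on the engine stream, so sync before touching `output`.
  cudaStreamSynchronize(*engine->stream());
}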

paddle/fluid/inference/tensorrt/engine.h

Lines changed: 4 additions & 2 deletions
@@ -87,7 +87,9 @@ class TensorRTEngine : public EngineBase {
   // these memory directly for acceleration, for example, output the converted
   // data directly to the buffer to save data copy overhead.
   // NOTE this should be used after calling `FreezeNetwork`.
-  void*& buffer(const std::string& name);
+  Buffer& buffer(const std::string& name) override;
+
+  cudaStream_t* stream() { return stream_; }
 
   // Fill an input from CPU memory with name and size.
   void SetInputFromCPU(const std::string& name, void* data, size_t size);
@@ -116,7 +118,7 @@ class TensorRTEngine : public EngineBase {
   cudaStream_t* stream_;
   nvinfer1::ILogger& logger_;
 
-  std::vector<void*> buffers_;
+  std::vector<Buffer> buffers_;
   // max data size for the buffers.
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
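
The header change above is what enables the checks in engine.cc: switching buffers_ from std::vector<void*> to std::vector<Buffer> lets the engine track each binding's allocated capacity (max_size), valid data size, and device, and the new stream() accessor exposes the CUDA stream so callers can synchronize with the asynchronous copies issued by GetOutputInCPU and SetInputFromCPU.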
