Commit 2d57158

fea/init tensorrt engine (#10003)
1 parent 64babc9 commit 2d57158

7 files changed: +515 -10 lines changed


paddle/fluid/inference/engine.h

Lines changed: 53 additions & 0 deletions

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/framework/framework.pb.h"

namespace paddle {
namespace inference {

/*
 * EngineBase is the base class of all inference engines. An inference engine
 * takes a paddle program as input and outputs the result in fluid Tensor
 * format. It can be used to optimize the performance of computation
 * sub-blocks, for example, by breaking the original block into sub-blocks and
 * executing each sub-block on a different engine.
 *
 * For example:
 * During inference, most of the ResNet50 model can be placed into a subgraph
 * and run on a TensorRT engine.
 *
 * There are several engines such as TensorRT and other frameworks, so
 * EngineBase is introduced to give a unified interface for all the different
 * engine implementations.
 */
class EngineBase {
 public:
  using DescType = ::paddle::framework::proto::BlockDesc;

  // Build the model and do some preparation, for example, in TensorRT, run
  // createInferBuilder, buildCudaEngine.
  virtual void Build(const DescType& paddle_model) = 0;

  // Execute the engine; this runs the inference network.
  virtual void Execute(int batch_size) = 0;

  virtual ~EngineBase() {}
};  // class EngineBase

}  // namespace inference
}  // namespace paddle
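
To make the contract concrete, here is a minimal sketch (not part of this commit) of how an engine plugs into this interface; DummyEngine and its members are invented purely to illustrate the two virtual hooks a concrete engine must provide.

#include "paddle/fluid/inference/engine.h"

namespace paddle {
namespace inference {

// Hypothetical engine used only for illustration: it records that Build was
// called and counts how many samples Execute has processed.
class DummyEngine : public EngineBase {
 public:
  // A real engine would parse the BlockDesc and construct its backend graph.
  void Build(const DescType& paddle_model) override { built_ = true; }

  // A real engine would run one forward pass for `batch_size` samples.
  void Execute(int batch_size) override { num_samples_ += batch_size; }

  bool built() const { return built_; }
  int num_samples() const { return num_samples_; }

 private:
  bool built_{false};
  int num_samples_{0};
};

}  // namespace inference
}  // namespace paddle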

paddle/fluid/inference/tensorrt/CMakeLists.txt

Lines changed: 4 additions & 1 deletion

- nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
+ if(WITH_TESTING)
+   nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
+   nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
+ endif()
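
With WITH_TESTING enabled, nv_test presumably registers these binaries with CTest, so after building the new engine test can be run with something like `ctest -R test_tensorrt_engine`.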

paddle/fluid/inference/tensorrt/engine.cc

Lines changed: 134 additions & 0 deletions

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/engine.h"

#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {

void TensorRTEngine::Build(const DescType& paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
}

void TensorRTEngine::Execute(int batch_size) {
  infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
}

TensorRTEngine::~TensorRTEngine() {
  // Free the GPU buffers.
  for (auto& buffer : buffers_) {
    if (buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
      buffer = nullptr;
    }
  }
}

void TensorRTEngine::FreezeNetwork() {
  PADDLE_ENFORCE(infer_builder_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  PADDLE_ENFORCE(infer_network_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  // Build the engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_->setMaxWorkspaceSize(max_workspace_);

  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

  infer_context_.reset(infer_engine_->createExecutionContext());

  // Allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size(), nullptr);
  for (auto& item : buffer_sizes_) {
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    AccumDims(infer_engine_->getBindingDimensions(slot_offset));
    }
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
  }
}

nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
                                                nvinfer1::DataType dtype,
                                                const nvinfer1::Dims& dim) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);

  PADDLE_ENFORCE(infer_network_ != nullptr, "should call InitNetwork first");
  auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);

  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
  return input;
}

void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                                   const std::string& name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

  auto* output = layer->getOutput(offset);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  infer_network_->markOutput(*output);
  // An output buffer's size can only be determined later, so set it to zero
  // here to mark it; it will be reset later.
  buffer_sizes_[name] = 0;
}

void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
  return buffer(name);
}

void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
                                    size_t max_size) {
  // Determine the data size.
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_GE(max_size, it->second);

  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
                                       cudaMemcpyDeviceToHost, *stream_));
}

void*& TensorRTEngine::buffer(const std::string& name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  return buffers_[slot_offset];
}

void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                     size_t size) {
  void* buf = buffer(name);
  PADDLE_ENFORCE_EQ(
      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
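
engine.cc relies on a few utilities from tensorrt/helper.h that are not part of this diff (kDataTypeSize, AccumDims, createInferBuilder, NaiveLogger). The following is a rough sketch of what the first two are assumed to provide, inferred only from the call sites above; the real header may differ.

// Assumed shape of two helper.h utilities; bodies are illustrative guesses,
// not the actual file.
#include <NvInfer.h>

namespace paddle {
namespace inference {
namespace tensorrt {

// Byte size per nvinfer1::DataType, indexed by static_cast<int>(dtype)
// (kFLOAT, kHALF, kINT8, kINT32 in TensorRT's enumeration order).
const int kDataTypeSize[] = {4 /*kFLOAT*/, 2 /*kHALF*/, 1 /*kINT8*/,
                             4 /*kINT32*/};

// Number of elements described by a Dims, i.e. the product of all dimensions.
inline int AccumDims(nvinfer1::Dims dims) {
  int num = 1;
  for (int i = 0; i < dims.nbDims; ++i) {
    num *= dims.d[i];
  }
  return num;
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle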

paddle/fluid/inference/tensorrt/engine.h

Lines changed: 144 additions & 0 deletions

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <NvInfer.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * TensorRT Engine.
 *
 * There are two alternative ways to use it: one is to build from a paddle
 * protobuf model, the other is to manually construct the network.
 */
class TensorRTEngine : public EngineBase {
 public:
  // Weight holds a model parameter.
  class Weight {
   public:
    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
      w_.type = dtype;
      w_.values = value;
      w_.count = num_elem;
    }
    const nvinfer1::Weights& get() { return w_; }

   private:
    nvinfer1::Weights w_;
  };

  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
        stream_(stream),
        logger_(logger) {}

  virtual ~TensorRTEngine();

  // TODO(Superjomn) implement it later when graph segmentation is supported.
  virtual void Build(const DescType& paddle_model) override;

  virtual void Execute(int batch_size) override;

  // Initialize the inference network, so that TensorRT layers can be added to
  // this network.
  void InitNetwork() {
    infer_builder_.reset(createInferBuilder(logger_));
    infer_network_.reset(infer_builder_->createNetwork());
  }
  // After finishing adding ops, freeze this network and create the execution
  // environment.
  void FreezeNetwork();

  // Add an input and set its name, data type and dimension.
  nvinfer1::ITensor* DeclareInput(const std::string& name,
                                  nvinfer1::DataType dtype,
                                  const nvinfer1::Dims& dim);
  // Set the offset-th output from a layer as the network's output, and set its
  // name.
  void DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                     const std::string& name);

  // GPU memory address for an ITensor with a specific name. One can operate on
  // this memory directly for acceleration, for example, output the converted
  // data directly to the buffer to save the data-copy overhead.
  // NOTE this should be used after calling `FreezeNetwork`.
  void*& buffer(const std::string& name);

  // Fill an input from CPU memory with name and size.
  void SetInputFromCPU(const std::string& name, void* data, size_t size);
  // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
  // accessed directly? Fill an input from GPU memory with name and size.
  void SetInputFromGPU(const std::string& name, void* data, size_t size);
  // Get an output called name; the output of TensorRT is in GPU memory, so
  // this method just returns the output's GPU memory address.
  void* GetOutputInGPU(const std::string& name);
  // LOW EFFICIENCY! Get output to CPU; this triggers a memory copy from GPU
  // to CPU.
  void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);

  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }

 private:
  // the max batch size
  int max_batch_;
  // the max memory size the engine uses
  int max_workspace_;
  cudaStream_t* stream_;
  nvinfer1::ILogger& logger_;

  std::vector<void*> buffers_;
  // max data size for the buffers.
  std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;

  // TensorRT related internal members
  template <typename T>
  struct Destroyer {
    void operator()(T* x) { x->destroy(); }
  };
  template <typename T>
  using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
  infer_ptr<nvinfer1::IBuilder> infer_builder_;
  infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
  infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
  infer_ptr<nvinfer1::IExecutionContext> infer_context_;
};  // class TensorRTEngine

// Add a layer__ into engine__ with args ARGS.
// For example:
//   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
//
// Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
//
// will add a fully connected layer into the engine.
// TensorRT has too many layers, so it is not wise to add a member function for
// each of them; a macro like this is more extensible when the underlying
// TensorRT library adds support for new layers.
#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
  engine__->network()->add##layer__(ARGS);

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
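
Putting the pieces together, here is a hedged usage sketch of the manual-construction path (not from this commit): a single FullyConnected layer is declared, frozen, and executed. The function name, the tensor names "x" and "y", and the toy sizes are hypothetical, and the helper.h symbols are assumed to behave as engine.cc implies.

// Usage sketch only; not part of the commit.
#include <cuda_runtime_api.h>
#include "paddle/fluid/inference/tensorrt/engine.h"

namespace trt = paddle::inference::tensorrt;

void BuildAndRunToyNetwork() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // max_batch = 1, max_workspace = 1 KB; the engine keeps a pointer to stream.
  trt::TensorRTEngine engine_obj(1, 1 << 10, &stream);
  trt::TensorRTEngine* engine = &engine_obj;
  engine->InitNetwork();

  // Declare a 1x1x1 float input named "x".
  auto* x = engine->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                 nvinfer1::Dims3(1, 1, 1));

  // A 1-output fully connected layer computing y = 2 * x + 3.
  float raw_weight[1] = {2.f};
  float raw_bias[1] = {3.f};
  trt::TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 1);
  trt::TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
  auto* fc = TRT_ENGINE_ADD_LAYER(engine, FullyConnected, *x, 1, weight.get(),
                                  bias.get());

  // Mark the layer's 0-th output as network output "y", then build the
  // TensorRT engine and allocate the GPU buffers.
  engine->DeclareOutput(fc, 0, "y");
  engine->FreezeNetwork();

  // Feed the input, run a batch of one, and copy the result back to the host.
  float x_data = 10.f;
  float y_data = 0.f;
  engine->SetInputFromCPU("x", &x_data, sizeof(x_data));
  engine->Execute(1);
  engine->GetOutputInCPU("y", &y_data, sizeof(y_data));
  cudaStreamSynchronize(stream);
  // y_data should now hold 2 * 10 + 3 = 23.
}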
