Skip to content

Commit 2372daf

Browse files
committed
There is no batch-size concept in TensorRT's tensors
1 parent 4a07617 commit 2372daf

File tree

3 files changed

+74
-17
lines changed

3 files changed

+74
-17
lines changed

paddle/fluid/inference/tensorrt/engine.cc

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ namespace paddle {
2626
namespace inference {
2727
namespace tensorrt {
2828

29+
int TensorRTEngine::runtime_batch_ = 1;
30+
2931
void TensorRTEngine::Build(const DescType& paddle_model) {
3032
PADDLE_ENFORCE(false, "not implemented");
3133
}
@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) {
4042
}
4143
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
4244
cudaStreamSynchronize(*stream_);
45+
SetRuntimeBatch(batch_size);
4346
}
4447

4548
TensorRTEngine::~TensorRTEngine() {
@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() {
7679
auto dims = infer_engine_->getBindingDimensions(slot_offset);
7780
item.second = kDataTypeSize[static_cast<int>(
7881
infer_engine_->getBindingDataType(slot_offset))] *
79-
analysis::AccuDims(dims.d, dims.nbDims);
82+
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
8083
}
8184
auto& buf = buffer(item.first);
8285
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
83-
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
86+
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
8487
VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
8588
<< buf.buffer;
86-
buf.size = buf.max_size = item.second;
89+
buf.size = item.second;
90+
buf.max_size = item.second * max_batch_;
8791
buf.device = DeviceType::GPU;
8892
}
8993
}
@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
98102
auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
99103
PADDLE_ENFORCE(input, "infer network add input %s failed", name);
100104
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
101-
analysis::AccuDims(dims.d, dims.nbDims);
105+
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
102106
PADDLE_ENFORCE(input->isNetworkInput());
103107
TensorRTEngine::SetITensor(name, input);
104108
return input;
@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
139143
return buffer(name).buffer;
140144
}
141145

142-
void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
143-
size_t max_size) {
146+
void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
144147
// determine data size
148+
auto* output = TensorRTEngine::GetITensor(name);
149+
nvinfer1::Dims dims = output->getDimensions();
150+
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
151+
size_t dst_size = dim_size * runtime_batch_ *
152+
kDataTypeSize[static_cast<int>(output->getType())];
153+
145154
auto it = buffer_sizes_.find(name);
146155
PADDLE_ENFORCE(it != buffer_sizes_.end());
147156
PADDLE_ENFORCE_GT(it->second, 0);
148-
PADDLE_ENFORCE_GE(max_size, it->second);
157+
PADDLE_ENFORCE_LE(dst_size, it->second);
149158
auto& buf = buffer(name);
150159
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
151-
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
160+
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
152161
cudaMemcpyDeviceToDevice, *stream_),
153162
0);
154163
}
155164

156-
void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
157-
size_t max_size) {
165+
void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
158166
// determine data size
167+
168+
auto* output = TensorRTEngine::GetITensor(name);
169+
nvinfer1::Dims dims = output->getDimensions();
170+
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
171+
size_t dst_size = dim_size * runtime_batch_ *
172+
kDataTypeSize[static_cast<int>(output->getType())];
159173
auto it = buffer_sizes_.find(name);
160174
PADDLE_ENFORCE(it != buffer_sizes_.end());
161175
PADDLE_ENFORCE_GT(it->second, 0);
162-
PADDLE_ENFORCE_GE(max_size, it->second);
176+
PADDLE_ENFORCE_LE(dst_size, it->second);
163177
auto& buf = buffer(name);
164178
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
165-
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
179+
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
166180
cudaMemcpyDeviceToHost, *stream_));
167181
}
168182

@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
207221
return itensor_map_[name];
208222
}
209223

224+
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
225+
runtime_batch_ = batch_size;
226+
}
227+
228+
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
229+
210230
} // namespace tensorrt
211231
} // namespace inference
212232
} // namespace paddle

paddle/fluid/inference/tensorrt/engine.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,21 +104,25 @@ class TensorRTEngine : public EngineBase {
104104
// Return the output's GPU memory address without copy.
105105
void* GetOutputInGPU(const std::string& name);
106106
// Copy data into dst inside the GPU device.
107-
void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
107+
void GetOutputInGPU(const std::string& name, void* dst);
108108
// LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
109109
// to CPU.
110-
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
110+
void GetOutputInCPU(const std::string& name, void* dst);
111111
// Fill an ITensor into map itensor_map_.
112112
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
113113
// Get an ITensor called name.
114114
nvinfer1::ITensor* GetITensor(const std::string& name);
115115

116116
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
117117
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
118+
void SetRuntimeBatch(size_t batch_size);
119+
int GetRuntimeBatch();
118120

119121
private:
120122
// the max batch size
121123
int max_batch_;
124+
// the runtime batch size
125+
static int runtime_batch_;
122126
// the max memory size the engine uses
123127
int max_workspace_;
124128
cudaStream_t* stream_;

paddle/fluid/inference/tensorrt/test_engine.cc

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
2828
protected:
2929
void SetUp() override {
3030
ASSERT_EQ(0, cudaStreamCreate(&stream_));
31-
engine_ = new TensorRTEngine(1, 1 << 10, &stream_);
31+
engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
3232
engine_->InitNetwork();
3333
}
3434

@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
7171

7272
LOG(INFO) << "to get output";
7373
float y_cpu;
74-
engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
74+
engine_->GetOutputInCPU("y", &y_cpu);
7575

7676
LOG(INFO) << "to checkout output";
7777
ASSERT_EQ(y_cpu, x_v * 2 + 3);
@@ -103,11 +103,44 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
103103

104104
LOG(INFO) << "to get output";
105105
float y_cpu[2] = {-1., -1.};
106-
engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
106+
engine_->GetOutputInCPU("y", &y_cpu[0]);
107107
ASSERT_EQ(y_cpu[0], 4.5);
108108
ASSERT_EQ(y_cpu[1], 14.5);
109109
}
110110

111+
TEST_F(TensorRTEngineTest, test_conv2d_temp) {
112+
// Weight in CPU memory.
113+
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
114+
float raw_bias[1] = {0};
115+
116+
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
117+
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
118+
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
119+
nvinfer1::Dims3{1, 3, 3});
120+
auto* conv_layer =
121+
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
122+
weight.get(), bias.get());
123+
PADDLE_ENFORCE(conv_layer != nullptr);
124+
conv_layer->setStride(nvinfer1::DimsHW{1, 1});
125+
conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
126+
127+
engine_->DeclareOutput(conv_layer, 0, "y");
128+
engine_->FreezeNetwork();
129+
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
130+
131+
float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
132+
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
133+
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
134+
18 * sizeof(float));
135+
engine_->Execute(2);
136+
137+
LOG(INFO) << "to get output";
138+
float* y_cpu = new float[18];
139+
engine_->GetOutputInCPU("y", &y_cpu[0]);
140+
ASSERT_EQ(y_cpu[0], 4.0);
141+
ASSERT_EQ(y_cpu[1], 6.0);
142+
}
143+
111144
} // namespace tensorrt
112145
} // namespace inference
113146
} // namespace paddle

0 commit comments

Comments
 (0)