Skip to content

Commit 6cbe597

Browse files
authored
Merge pull request #10495 from luotao1/refine_relu_test
refine EngineIOConverter, and use io_convert in test_trt_activation_op
2 parents dfdcb7e + 1992f70 commit 6cbe597

File tree

8 files changed

+137
-71
lines changed

8 files changed

+137
-71
lines changed

paddle/fluid/inference/analysis/dot.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include <glog/logging.h>
2323
#include <sstream>
24+
#include <string>
2425
#include <unordered_map>
2526
#include <vector>
2627

paddle/fluid/inference/engine.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ limitations under the License. */
1414

1515
#pragma once
1616

17+
#include <string>
1718
#include "paddle/fluid/framework/framework.pb.h"
1819

1920
namespace paddle {
@@ -58,8 +59,8 @@ class EngineBase {
5859

5960
struct Buffer {
6061
void* buffer{nullptr}; // buffer should be allocated only once.
61-
int max_size; // buffer allocated space.
62-
int size; // data size.
62+
size_t max_size; // buffer allocated space.
63+
size_t size; // data size.
6364
DeviceType device{DeviceType::UNK}; // tells which device this buffer is on.
6465
};
6566

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
22
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
33
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
4-
54
add_subdirectory(convert)
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc op_converter.h DEPS ${FLUID_CORE_MODULES})
2-
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
1+
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
2+
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
33
DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
44
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)

paddle/fluid/inference/tensorrt/convert/io_converter.cc

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,34 +23,51 @@ namespace tensorrt {
2323
using platform::is_gpu_place;
2424
using platform::is_cpu_place;
2525

26-
class DefaultInputConverter : public EngineInputConverter {
26+
class DefaultIOConverter : public EngineIOConverter {
2727
public:
28-
DefaultInputConverter() {}
28+
DefaultIOConverter() {}
2929
// NOTE out is GPU memory.
3030
virtual void operator()(const LoDTensor& in, void* out,
3131
size_t max_size) override {
3232
PADDLE_ENFORCE(out != nullptr);
33-
PADDLE_ENFORCE_LE(in.memory_size(), max_size);
33+
PADDLE_ENFORCE(stream_ != nullptr);
3434
const auto& place = in.place();
35+
size_t size = in.memory_size();
36+
PADDLE_ENFORCE_LE(size, max_size);
3537
if (is_cpu_place(place)) {
36-
PADDLE_ENFORCE(stream_ != nullptr);
37-
PADDLE_ENFORCE_EQ(0,
38-
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
39-
cudaMemcpyHostToDevice, *stream_));
40-
38+
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
39+
cudaMemcpyHostToDevice, *stream_));
4140
} else if (is_gpu_place(place)) {
42-
PADDLE_ENFORCE_EQ(0,
43-
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
44-
cudaMemcpyHostToHost, *stream_));
45-
41+
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
42+
cudaMemcpyDeviceToDevice, *stream_));
43+
} else {
44+
PADDLE_THROW("Unknown device for converter");
45+
}
46+
cudaStreamSynchronize(*stream_);
47+
}
48+
// NOTE in is GPU memory.
49+
virtual void operator()(const void* in, LoDTensor* out,
50+
size_t max_size) override {
51+
PADDLE_ENFORCE(in != nullptr);
52+
PADDLE_ENFORCE(stream_ != nullptr);
53+
const auto& place = out->place();
54+
size_t size = out->memory_size();
55+
PADDLE_ENFORCE_LE(size, max_size);
56+
if (is_cpu_place(place)) {
57+
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
58+
cudaMemcpyDeviceToHost, *stream_));
59+
} else if (is_gpu_place(place)) {
60+
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
61+
cudaMemcpyDeviceToDevice, *stream_));
4662
} else {
4763
PADDLE_THROW("Unknown device for converter");
4864
}
4965
cudaStreamSynchronize(*stream_);
5066
}
5167
};
5268

53-
REGISTER_TENSORRT_INPUT_CONVERTER(default, DefaultInputConverter);
69+
// fluid LodTensor <-> tensorrt ITensor
70+
REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
5471

5572
} // namespace tensorrt
5673
} // namespace inference

paddle/fluid/inference/tensorrt/convert/io_converter.h

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ limitations under the License. */
1414

1515
#pragma once
1616

17+
#include <string>
1718
#include <unordered_map>
1819
#include "paddle/fluid/framework/lod_tensor.h"
1920
#include "paddle/fluid/inference/utils/singleton.h"
@@ -25,43 +26,57 @@ namespace tensorrt {
2526
using framework::LoDTensor;
2627

2728
/*
28-
* Convert Input from Fluid to an Engine.
29-
* TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
30-
* most cases just need to copy the data.
29+
* Convert Input from Fluid to TensorRT Engine.
30+
* Convert Output from TensorRT Engine to Fluid.
31+
*
32+
* Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
33+
* major,
34+
* so in the default case just need to copy the data.
3135
*/
32-
class EngineInputConverter {
36+
class EngineIOConverter {
3337
public:
34-
EngineInputConverter() {}
38+
EngineIOConverter() {}
3539

3640
virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
41+
virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
3742

3843
void SetStream(cudaStream_t* stream) { stream_ = stream; }
3944

40-
static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
41-
size_t max_size, cudaStream_t* stream) {
45+
static void ConvertInput(const std::string& op_type, const LoDTensor& in,
46+
void* out, size_t max_size, cudaStream_t* stream) {
4247
PADDLE_ENFORCE(stream != nullptr);
43-
auto* converter = Registry<EngineInputConverter>::Lookup(
44-
in_op_type, "default" /* default_type */);
48+
auto* converter = Registry<EngineIOConverter>::Lookup(
49+
op_type, "default" /* default_type */);
4550
PADDLE_ENFORCE_NOT_NULL(converter);
4651
converter->SetStream(stream);
4752
(*converter)(in, out, max_size);
4853
}
4954

50-
virtual ~EngineInputConverter() {}
55+
static void ConvertOutput(const std::string& op_type, const void* in,
56+
LoDTensor* out, size_t max_size,
57+
cudaStream_t* stream) {
58+
PADDLE_ENFORCE(stream != nullptr);
59+
auto* converter = Registry<EngineIOConverter>::Lookup(
60+
op_type, "default" /* default_type */);
61+
PADDLE_ENFORCE_NOT_NULL(converter);
62+
converter->SetStream(stream);
63+
(*converter)(in, out, max_size);
64+
}
65+
66+
virtual ~EngineIOConverter() {}
5167

5268
protected:
5369
cudaStream_t* stream_{nullptr};
5470
};
5571

72+
#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \
73+
struct trt_io_##op_type__##_converter { \
74+
trt_io_##op_type__##_converter() { \
75+
Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
76+
} \
77+
}; \
78+
trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
79+
5680
} // namespace tensorrt
5781
} // namespace inference
5882
} // namespace paddle
59-
60-
#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
61-
struct trt_input_##in_op_type__##_converter { \
62-
trt_input_##in_op_type__##_converter() { \
63-
::paddle::inference::Registry<EngineInputConverter>::Register< \
64-
Converter__>(#in_op_type__); \
65-
} \
66-
}; \
67-
trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;

paddle/fluid/inference/tensorrt/convert/test_activation_op.cc

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License. */
1616
#include "paddle/fluid/framework/lod_tensor.h"
1717
#include "paddle/fluid/framework/op_registry.h"
1818
#include "paddle/fluid/framework/program_desc.h"
19+
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
1920
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
2021
#include "paddle/fluid/platform/device_context.h"
2122
#include "paddle/fluid/platform/place.h"
@@ -26,7 +27,7 @@ namespace paddle {
2627
namespace inference {
2728
namespace tensorrt {
2829

29-
void Compare(float input, float expect) {
30+
void Compare(const std::string op_type, float input, float expect) {
3031
framework::Scope scope;
3132
platform::CUDAPlace place;
3233
platform::CUDADeviceContext ctx(place);
@@ -35,6 +36,7 @@ void Compare(float input, float expect) {
3536
auto x_var = scope.Var("X");
3637
auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
3738
x_tensor->Resize({1, 1});
39+
x_tensor->mutable_data<float>(place);
3840
std::vector<float> init;
3941
init.push_back(input);
4042
framework::TensorFromVector(init, ctx, x_tensor);
@@ -45,14 +47,15 @@ void Compare(float input, float expect) {
4547
out_tensor->mutable_data<float>(place);
4648

4749
framework::OpDesc op_desc;
48-
op_desc.SetType("relu");
50+
op_desc.SetType(op_type);
4951
op_desc.SetInput("X", {"X"});
5052
op_desc.SetOutput("Out", {"Out"});
5153

52-
auto relu_op = framework::OpRegistry::CreateOp(*op_desc.Proto());
54+
auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
5355

5456
// run fluid op
55-
relu_op->Run(scope, place);
57+
op->Run(scope, place);
58+
// get fluid output
5659
std::vector<float> out1;
5760
framework::TensorToVector(*out_tensor, ctx, &out1);
5861

@@ -63,30 +66,37 @@ void Compare(float input, float expect) {
6366
engine->InitNetwork();
6467
engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
6568
nvinfer1::DimsCHW{1, 1, 1});
66-
69+
// convert op
6770
OpConverter op_converter;
6871
op_converter.ConvertOp(*op_desc.Proto(), engine);
6972

7073
engine->DeclareOutput("Out");
7174
engine->FreezeNetwork();
72-
engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
7375

74-
// run tensorrt op
76+
// convert LoDTensor to ITensor
77+
size_t size = x_tensor->memory_size();
78+
EngineIOConverter::ConvertInput(op_type, *x_tensor,
79+
engine->buffer("X").buffer, size, &stream);
80+
// run tensorrt op
7581
engine->Execute(1);
76-
77-
float out2;
78-
engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float));
79-
80-
ASSERT_EQ(out1[0], out2);
82+
// convert ITensor to LoDTensor
83+
EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
84+
out_tensor, size, &stream);
85+
// get tensorrt output
86+
std::vector<float> out2;
87+
framework::TensorToVector(*out_tensor, ctx, &out2);
88+
89+
// compare
90+
ASSERT_EQ(out1[0], out2[0]);
8191
ASSERT_EQ(out1[0], expect);
8292

8393
delete engine;
8494
cudaStreamDestroy(stream);
8595
}
8696

8797
TEST(OpConverter, ConvertRelu) {
88-
Compare(1, 1); // relu(1) = 1
89-
Compare(-5, 0); // relu(-5) = 0
98+
Compare("relu", 1, 1); // relu(1) = 1
99+
Compare("relu", -5, 0); // relu(-5) = 0
90100
}
91101

92102
} // namespace tensorrt

paddle/fluid/inference/tensorrt/convert/test_io_converter.cc

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,40 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

15+
#include <gtest/gtest.h>
1516
#include "paddle/fluid/framework/lod_tensor.h"
1617
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
1718

18-
#include <gtest/gtest.h>
19-
2019
namespace paddle {
2120
namespace inference {
2221
namespace tensorrt {
2322

24-
class EngineInputConverterTester : public ::testing::Test {
25-
public:
26-
void SetUp() override { tensor.Resize({10, 10}); }
23+
void IOConverterTester(const platform::DeviceContext& ctx) {
24+
cudaStream_t stream;
25+
ASSERT_EQ(0, cudaStreamCreate(&stream));
2726

28-
framework::LoDTensor tensor;
29-
};
27+
// init fluid in_tensor
28+
framework::LoDTensor in_tensor;
29+
in_tensor.Resize({10, 10});
30+
auto place = ctx.GetPlace();
31+
in_tensor.mutable_data<float>(place);
32+
std::vector<float> init;
33+
for (int64_t i = 0; i < 10 * 10; ++i) {
34+
init.push_back(i);
35+
}
36+
framework::TensorFromVector(init, ctx, &in_tensor);
3037

31-
TEST_F(EngineInputConverterTester, DefaultCPU) {
38+
// init tensorrt buffer
3239
void* buffer;
33-
tensor.mutable_data<float>(platform::CPUPlace());
34-
ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
40+
size_t size = in_tensor.memory_size();
41+
ASSERT_EQ(cudaMalloc(&buffer, size), 0);
3542

36-
cudaStream_t stream;
37-
EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
38-
&stream);
43+
// convert fluid in_tensor to tensorrt buffer
44+
EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
45+
46+
// convert tensorrt buffer to fluid out_tensor
47+
framework::LoDTensor out_tensor;
48+
out_tensor.Resize({10, 10});
49+
out_tensor.mutable_data<float>(place);
50+
EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
51+
52+
// compare in_tensor and out_tensor
53+
std::vector<float> result;
54+
framework::TensorToVector(out_tensor, ctx, &result);
55+
EXPECT_EQ(init.size(), result.size());
56+
for (size_t i = 0; i < init.size(); i++) {
57+
EXPECT_EQ(init[i], result[i]);
58+
}
59+
cudaStreamDestroy(stream);
3960
}
4061

41-
TEST_F(EngineInputConverterTester, DefaultGPU) {
42-
void* buffer;
43-
tensor.mutable_data<float>(platform::CUDAPlace());
44-
ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
62+
TEST(EngineIOConverterTester, DefaultCPU) {
63+
platform::CPUPlace place;
64+
platform::CPUDeviceContext ctx(place);
65+
IOConverterTester(ctx);
66+
}
4567

46-
cudaStream_t stream;
47-
EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
48-
&stream);
68+
TEST(EngineIOConverterTester, DefaultGPU) {
69+
platform::CUDAPlace place;
70+
platform::CUDADeviceContext ctx(place);
71+
IOConverterTester(ctx);
4972
}
5073

5174
} // namespace tensorrt

0 commit comments

Comments
 (0)