Commit a032f56

Add profiling information for inference example (#8748)

* Add profiling information for inference example, recognize digits.
* Refine the profiling method.
* Correct the use of RecordEvent and simplify recognize_digits.

1 parent 1e4d95c commit a032f56

File tree

5 files changed: +104 -91 lines changed


paddle/fluid/inference/io.cc

Lines changed: 8 additions & 7 deletions
@@ -22,14 +22,14 @@ namespace paddle {
 namespace inference {
 
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
-  VLOG(3) << "loading model from " << filename;
-  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
-  inputfs.seekg(0, std::ios::end);
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
   contents.clear();
-  contents.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  inputfs.read(&contents[0], contents.size());
-  inputfs.close();
+  contents.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&contents[0], contents.size());
+  fin.close();
 }
 
 bool IsPersistable(const framework::VarDesc* var) {
@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
+  VLOG(3) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
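
The rewritten ReadBinaryFile now fails fast when the file cannot be opened, instead of silently reading from a bad stream and returning empty contents. A minimal standalone sketch of the same open-check-and-slurp pattern, with std::runtime_error standing in for PADDLE_ENFORCE (the function name here is ours, for illustration only):

#include <fstream>
#include <stdexcept>
#include <string>

void ReadBinaryFileSketch(const std::string& filename, std::string& contents) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  if (!fin) {
    throw std::runtime_error("Cannot open file " + filename);
  }
  fin.seekg(0, std::ios::end);   // seek to the end to learn the file size
  contents.clear();
  contents.resize(fin.tellg());  // size the buffer to hold the whole file
  fin.seekg(0, std::ios::beg);   // rewind and read everything in one call
  fin.read(&contents[0], contents.size());
  fin.close();
}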

paddle/fluid/inference/tests/book/test_inference_image_classification.cc

Lines changed: 12 additions & 7 deletions
@@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size of input data");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 
 TEST(inference, image_classification) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
+                  "--batch_size=1 --repeat=1";
   }
 
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,13 +32,11 @@ TEST(inference, image_classification) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  int64_t batch_size = 1;
-
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [0.0, 1.0].
   SetupTensor<float>(input,
-                     {batch_size, 3, 32, 32},
+                     {FLAGS_batch_size, 3, 32, 32},
                      static_cast<float>(0),
                      static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
@@ -46,7 +47,9 @@ TEST(inference, image_classification) {
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << "--- CPU Runs: ---";
+  TestInference<paddle::platform::CPUPlace>(
+      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
   LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
@@ -55,7 +58,9 @@ TEST(inference, image_classification) {
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << "--- GPU Runs: ---";
+  TestInference<paddle::platform::CUDAPlace>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();
 
   CheckError<float>(output1, output2);
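
The two new DEFINE_int32 flags follow the usual gflags pattern: each DEFINE_* macro creates a FLAGS_<name> variable that is filled in when the command line is parsed. In these tests the parsing is assumed to happen in the shared paddle/testing/paddle_gtest_main.cc, per the comment in the diff; a minimal standalone sketch of the same wiring, using only stock gflags and glog:

#include <gflags/gflags.h>
#include <glog/logging.h>

DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");

int main(int argc, char** argv) {
  // Fills FLAGS_dirname, FLAGS_batch_size and FLAGS_repeat from argv.
  google::ParseCommandLineFlags(&argc, &argv, true);
  LOG(INFO) << "dirname=" << FLAGS_dirname
            << " batch_size=" << FLAGS_batch_size
            << " repeat=" << FLAGS_repeat;
  return 0;
}

With this in place, the test binary can be driven as, e.g., --dirname=path/to/model --batch_size=8 --repeat=100 to average out per-run overhead when profiling.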

paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc

Lines changed: 24 additions & 59 deletions
@@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size of input data");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 
 TEST(inference, recognize_digits) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
+                  "--batch_size=1 --repeat=1";
   }
 
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,77 +32,39 @@ TEST(inference, recognize_digits) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  int64_t batch_size = 1;
-
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [-1.0, 1.0].
   SetupTensor<float>(input,
-                     {batch_size, 1, 28, 28},
+                     {FLAGS_batch_size, 1, 28, 28},
                      static_cast<float>(-1),
                      static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
+  for (auto is_combined : {false, true}) {
+    paddle::framework::LoDTensor output1;
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    cpu_fetchs1.push_back(&output1);
 
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
+    TestInference<paddle::platform::CPUPlace>(
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+    LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
+    paddle::framework::LoDTensor output2;
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+    cpu_fetchs2.push_back(&output2);
 
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
+    // Run inference on CUDA GPU
+    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
+    TestInference<paddle::platform::CUDAPlace>(
+        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
+    LOG(INFO) << output2.dims();
 
-  CheckError<float>(output1, output2);
+    CheckError<float>(output1, output2);
 #endif
-}
-
-TEST(inference, recognize_digits_combine) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
   }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor input;
-  // Use normilized image pixels as input data,
-  // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(
-      input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, true>(
-      dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, true>(
-      dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
 }

paddle/fluid/inference/tests/test_helper.h

Lines changed: 59 additions & 17 deletions
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <time.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/profiler.h"
 
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor& input,
@@ -87,31 +88,58 @@ void CheckError(paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
 
-template <typename Place, bool IsCombined = false>
+template <typename Place>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const int repeat = 1,
+                   const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();
   auto executor = paddle::framework::Executor(place);
   auto* scope = new paddle::framework::Scope();
 
+  // Profile the performance
+  paddle::platform::ProfilerState state;
+  if (paddle::platform::is_cpu_place(place)) {
+    state = paddle::platform::ProfilerState::kCPU;
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    state = paddle::platform::ProfilerState::kCUDA;
+    // The default device_id of paddle::platform::CUDAPlace is 0.
+    // Users can get the device_id using:
+    // int device_id = place.GetDeviceId();
+    paddle::platform::SetDeviceId(0);
+#endif
+  }
+
+  // Enable the profiler
+  paddle::platform::EnableProfiler(state);
+
   // 2. Initialize the inference_program and load parameters
   std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  if (IsCombined) {
-    // All parameters are saved in a single file.
-    // Hard-coding the file names of program and parameters in unittest.
-    // The file names should be consistent with that used in Python API
-    // `fluid.io.save_inference_model`.
-    std::string prog_filename = "__model_combined__";
-    std::string param_filename = "__params_combined__";
-    inference_program = paddle::inference::Load(executor,
-                                                *scope,
-                                                dirname + "/" + prog_filename,
-                                                dirname + "/" + param_filename);
-  } else {
-    // Parameters are saved in separate files sited in the specified `dirname`.
-    inference_program = paddle::inference::Load(executor, *scope, dirname);
+  {
+    paddle::platform::RecordEvent record_event(
+        "init_program",
+        paddle::platform::DeviceContextPool::Instance().Get(place));
+
+    if (is_combined) {
+      // All parameters are saved in a single file.
+      // Hard-coding the file names of program and parameters in unittest.
+      // The file names should be consistent with that used in Python API
+      // `fluid.io.save_inference_model`.
+      std::string prog_filename = "__model_combined__";
+      std::string param_filename = "__params_combined__";
+      inference_program =
+          paddle::inference::Load(executor,
                                  *scope,
                                  dirname + "/" + prog_filename,
                                  dirname + "/" + param_filename);
+    } else {
+      // Parameters are saved in separate files sited in the specified
+      // `dirname`.
+      inference_program = paddle::inference::Load(executor, *scope, dirname);
+    }
   }
 
   // 3. Get the feed_target_names and fetch_target_names
@@ -134,7 +162,21 @@ void TestInference(const std::string& dirname,
   }
 
   // 6. Run the inference program
-  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+  {
+    // Run repeat times to profile the performance
+    for (int i = 0; i < repeat; ++i) {
+      paddle::platform::RecordEvent record_event(
+          "run_inference",
+          paddle::platform::DeviceContextPool::Instance().Get(place));
+
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    }
+  }
+
+  // Disable the profiler and print the timing information
+  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
+                                    "profiler.txt");
+  paddle::platform::ResetProfiler();
 
   delete scope;
 }
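
Taken together, the changes to TestInference establish a simple profiling protocol: enable the profiler once, wrap each phase of interest in a scoped RecordEvent, then disable the profiler to flush the aggregated timings. A condensed sketch of that pattern for the CPU path, using only the calls that appear in this diff:

#include "paddle/fluid/platform/profiler.h"

void ProfiledSection() {
  namespace plat = paddle::platform;
  plat::EnableProfiler(plat::ProfilerState::kCPU);
  {
    // The RecordEvent's lifetime delimits one timed entry in the report.
    plat::RecordEvent record_event(
        "my_section",
        plat::DeviceContextPool::Instance().Get(plat::CPUPlace()));
    // ... the work to be timed goes here ...
  }
  // Sort the report by the default key and write it to profiler.txt,
  // then clear the collected events for the next run.
  plat::DisableProfiler(plat::EventSortingKey::kDefault, "profiler.txt");
  plat::ResetProfiler();
}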

paddle/fluid/platform/profiler.cc

Lines changed: 1 addition & 1 deletion
@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
   }
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
-    // Generate some dummy evenets first to reduce the startup overhead.
+    // Generate some dummy events first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
         DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
