Commit 0fbe0a7

add multi-thread ut for ditu-rnn

1 parent d0c65bf

File tree

6 files changed: +119 -86 lines

paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc

Lines changed: 1 addition & 1 deletion

@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
 
 namespace paddle {
 namespace framework {

paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc

Lines changed: 4 additions & 4 deletions

@@ -35,7 +35,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-
     auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
     marked_nodes.insert(id);
   };
@@ -89,7 +88,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
     LINK_TO(op, hidden_n);
 #undef LINK_TO
     return op;
-
   };
 
   lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
@@ -105,14 +103,16 @@
     for (auto it = node->inputs.begin(); it != node->inputs.end();) {
       if (marked_nodes.count(*it)) {
         it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
+      } else {
         it++;
+      }
     }
     for (auto it = node->outputs.begin(); it != node->outputs.end();) {
       if (marked_nodes.count(*it)) {
         it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
+      } else {
         it++;
+      }
     }
   }
 
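
Aside: the braces added above sit inside the standard erase-while-iterating idiom. A minimal standalone sketch of that idiom (RemoveMarked, the container, and the marker set are illustrative, not from this patch):

#include <set>
#include <vector>

// Erase every element listed in `marked` in a single pass. vector::erase()
// invalidates the erased position and returns the next valid iterator, so
// the loop advances through erase()'s return value instead of ++it.
void RemoveMarked(std::vector<int> *values, const std::set<int> &marked) {
  for (auto it = values->begin(); it != values->end();) {
    if (marked.count(*it)) {
      it = values->erase(it);
    } else {
      ++it;
    }
  }
}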

paddle/fluid/framework/ir/graph_pattern_detector.cc

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ void GraphPatternDetector::operator()(Graph* graph,
   LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
   int id = 0;
   for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
     handler(g, graph);
   }
 }
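
Note: unlike LOG(INFO), a VLOG(3) message is off by default and only appears when glog's verbosity is raised to 3 or higher (e.g. by running with GLOG_v=3), so the per-subgraph line no longer floods the default test output.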

paddle/fluid/inference/analysis/analyzer_tester.cc

Lines changed: 69 additions & 76 deletions

@@ -16,26 +16,25 @@
 
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
 DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-using namespace framework;
-
 TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_IA_enable_tensorrt_subgraph_engine = false;
   Argument argument;
@@ -219,39 +218,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
-
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
-
 }  // namespace
@@ -266,58 +232,92 @@ const float ditu_rnn_target_data[] = {
     93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
     169.426, 0, 0, 0, 0, 0, 0, 0};
 // Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
-                           const std::string &data_path, int batch_size,
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
+void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
+                           int num_threads = FLAGS_num_threads) {
   NativeConfig config;
   config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
   config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
 
   auto base_predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
   auto predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
   std::vector<PaddleTensor> input_slots;
-  DataRecord data(data_path, batch_size);
+  DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
   // Prepare inputs.
   PrepareInputs(&input_slots, &data, batch_size);
   std::vector<PaddleTensor> outputs, base_outputs;
 
   base_predictor->Run(input_slots, &base_outputs);
 
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
   LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
-            << ", latency: " << timer.toc() / num_times << "ms";
+  if (num_threads == 1) {
+    std::vector<PaddleTensor> input_slots;
+    // Prepare inputs.
+    DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+    PrepareInputs(&input_slots, &data, batch_size);
+
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      predictor->Run(input_slots, &outputs);
+    }
+    print_time(batch_size, num_times, 1, 0, timer.toc() / num_times);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<PaddleTensor> input_slots;
+    // Prepare inputs.
+    PrepareInputs(&input_slots, &data, batch_size);
+    std::vector<PaddleTensor> outputs;
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        auto predictor_tid =
+            CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
+                config);
+        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictor_tid->Run(input_slots, &outputs);
+        }
+        print_time(batch_size, num_times, num_threads, tid,
+                   timer.toc() / num_times);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
+    }
+  }
   LOG(INFO) << "=====================================";
 
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+  if (num_threads == 1) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+    for (size_t i = 0; i < outputs.size(); i++) {
+      auto &out = outputs[i];
+      auto &base_out = base_outputs[i];
+      size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                    [](int a, int b) { return a * b; });
+      size_t size1 =
+          std::accumulate(base_out.shape.begin(), base_out.shape.end(), 1,
+                          [](int a, int b) { return a * b; });
+      PADDLE_ENFORCE_EQ(size, size1);
+      PADDLE_ENFORCE_GT(size, 0);
+      float *data = static_cast<float *>(out.data.data());
+      float *base_data = static_cast<float *>(base_out.data.data());
+      for (size_t i = 0; i < size; i++) {
+        EXPECT_NEAR(data[i], base_data[i], 1e-3);
+      }
     }
   }
 
-  if (use_analysis && activate_ir) {
+  if (use_analysis_and_activate_ir) {
     AnalysisPredictor *analysis_predictor =
         dynamic_cast<AnalysisPredictor *>(predictor.get());
     auto &fuse_statis = analysis_predictor->analysis_argument()
@@ -334,23 +334,16 @@ void TestDituRNNPrediction(const std::string &model_path,
 
 // Directly infer with the original model.
 TEST(Analyzer, DituRNN_without_analysis) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
-}
-
-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
-  LOG(INFO) << "ditu rnn with analysis";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
+  LOG(INFO) << "ditu rnn without analysis";
+  TestDituRNNPrediction(false, 1);
+  TestDituRNNPrediction(false, 4);  // multi-threads
 }
 
 // Inference with analysis and IR. The IR module will fuse some large kernels.
 TEST(Analyzer, DituRNN_with_analysis_with_IR) {
   LOG(INFO) << "ditu rnn with analysis and IR fuse";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+  TestDituRNNPrediction(true, 1);
+  TestDituRNNPrediction(true, 4);  // multi-threads
 }
 
 }  // namespace analysis
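
The multi-thread branch above boils down to a per-thread timing loop. A self-contained sketch of that pattern follows; RunOnce and BenchmarkThreads are stand-ins for predictor_tid->Run and the test body, assumptions rather than parts of this patch. Note one difference: the patch captures outputs by reference in every worker, so all threads write the same vector; the sketch keeps a thread-local vector instead, which avoids that shared write.

#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for predictor_tid->Run(...); does trivial work so the sketch links.
void RunOnce(std::vector<float> *outputs) { outputs->assign(8, 0.f); }

void BenchmarkThreads(int num_threads, int num_times) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([tid, num_times]() {
      std::vector<float> outputs;  // thread-local: no cross-thread writes
      auto start = std::chrono::steady_clock::now();
      for (int i = 0; i < num_times; i++) {
        RunOnce(&outputs);
      }
      std::chrono::duration<double, std::milli> elapsed =
          std::chrono::steady_clock::now() - start;
      std::printf("thread %d: %.3f ms per run\n", tid,
                  elapsed.count() / num_times);
    });
  }
  for (auto &t : threads) t.join();  // wait for every worker before returning
}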

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 3 additions & 4 deletions

@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -30,7 +32,6 @@ bool AnalysisPredictor::Init(
   } else {
     place_ = paddle::platform::CPUPlace();
   }
-  PADDLE_ENFORCE(!parent_scope);
   if (parent_scope) {
     scope_ = parent_scope;
     sub_scope_ = &(parent_scope->NewScope());
@@ -92,8 +93,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   Analyzer().Run(&argument_);
   CHECK(argument_.transformed_program_desc);
   VLOG(5) << "to prepare executor";
-  // LOG(INFO) << "transformed_parogram_desc " <<
-  // argument.transformed_program_desc->DebugString();
   inference_program_.reset(
       new framework::ProgramDesc(*argument_.transformed_program_desc));
   PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
@@ -106,7 +105,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
-  VLOG(3) << "create NativePredictor";
+  VLOG(3) << "create AnalysisPredictor";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
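
Note: the deleted PADDLE_ENFORCE(!parent_scope) fired whenever a caller supplied a parent scope, which made the if (parent_scope) branch right after it unreachable. With it gone, Init() can reuse an existing scope and only allocate a child sub-scope, presumably so several predictors can share one set of parameters.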

paddle/fluid/inference/api/helper.h

Lines changed: 41 additions & 0 deletions

@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <sstream>
@@ -106,5 +107,45 @@ static void TensorAssignData(PaddleTensor *tensor,
   }
 }
 
+std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  for (int i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+
+void print_time(int batch_size, int repeat, int num_threads, int tid,
+                double latency) {
+  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms";
+}
+
 }  // namespace inference
 }  // namespace paddle
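
A hedged usage sketch for the two helpers now exported from helper.h (LogProfile, the tensor argument, and the timing values are illustrative, not from this patch):

#include "paddle/fluid/inference/api/helper.h"

void LogProfile(const paddle::PaddleTensor &tensor) {
  // Pretty-print one tensor: name, dtype, shape, lod and raw float data.
  LOG(INFO) << paddle::inference::DescribeTensor(tensor);
  // batch_size=10, repeat=100, 1 thread, thread id 0, latency in ms.
  paddle::inference::print_time(10, 100, 1, 0, 5.2);
}

One caveat: DescribeTensor and print_time are non-inline, non-template definitions in a header, so including helper.h from more than one translation unit would trigger duplicate-symbol link errors; marking them static or inline would avoid that.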
