 
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 
 namespace paddle {
 namespace inference {
@@ -91,274 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
   }
 }
 
-namespace {
-
-struct DataRecord {
-  std::vector<std::vector<std::vector<float>>> link_step_data_all;
-  std::vector<std::vector<float>> week_data_all, minute_data_all;
-  std::vector<size_t> lod1, lod2, lod3;
-  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
-      rnn_minute_datas;
-  size_t batch_iter{0};
-  size_t batch_size{1};
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= link_step_data_all.size()) {
-      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
-                                     link_step_data_all.begin() + batch_end);
-      data.week_data_all.assign(week_data_all.begin() + batch_iter,
-                                week_data_all.begin() + batch_end);
-      data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
-                                  minute_data_all.begin() + batch_end);
-      // Prepare LoDs
-      data.lod1.push_back(0);
-      data.lod2.push_back(0);
-      data.lod3.push_back(0);
-      CHECK(!data.link_step_data_all.empty()) << "empty";
-      CHECK(!data.week_data_all.empty());
-      CHECK(!data.minute_data_all.empty());
-      CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
-      CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
-      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
-        for (const auto &d : data.link_step_data_all[j]) {
-          data.rnn_link_data.push_back(d);
-        }
-        data.rnn_week_datas.push_back(data.week_data_all[j]);
-        data.rnn_minute_datas.push_back(data.minute_data_all[j]);
-        // calculate lod
-        data.lod1.push_back(data.lod1.back() +
-                            data.link_step_data_all[j].size());
-        data.lod3.push_back(data.lod3.back() + 1);
-        for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
-          data.lod2.push_back(data.lod2.back() +
-                              data.link_step_data_all[j].size());
-        }
-      }
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ':', &data);
-      std::vector<std::vector<float>> link_step_data;
-      std::vector<std::string> link_datas;
-      split(data[0], '|', &link_datas);
-      for (auto &step_data : link_datas) {
-        std::vector<float> tmp;
-        split_to_float(step_data, ',', &tmp);
-        link_step_data.push_back(tmp);
-      }
-      // load week data
-      std::vector<float> week_data;
-      split_to_float(data[2], ',', &week_data);
-      // load minute data
-      std::vector<float> minute_data;
-      split_to_float(data[1], ',', &minute_data);
-      link_step_data_all.push_back(std::move(link_step_data));
-      week_data_all.push_back(std::move(week_data));
-      minute_data_all.push_back(std::move(minute_data));
-    }
-  }
-};
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
-      week_tensor, minute_tensor;
-  lod_attention_tensor.name = "data_lod_attention";
-  init_zero_tensor.name = "cell_init";
-  lod_tensor_tensor.name = "data";
-  week_tensor.name = "week";
-  minute_tensor.name = "minute";
-  auto one_batch = data->NextBatch();
-  std::vector<int> rnn_link_data_shape(
-      {static_cast<int>(one_batch.rnn_link_data.size()),
-       static_cast<int>(one_batch.rnn_link_data.front().size())});
-  lod_attention_tensor.shape.assign({1, 2});
-  lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
-  init_zero_tensor.shape.assign({batch_size, 15});
-  init_zero_tensor.lod.assign({one_batch.lod3});
-  lod_tensor_tensor.shape = rnn_link_data_shape;
-  lod_tensor_tensor.lod.assign({one_batch.lod1});
-  // clang-format off
-  week_tensor.shape.assign(
-      {static_cast<int>(one_batch.rnn_week_datas.size()),
-       static_cast<int>(one_batch.rnn_week_datas.front().size())});
-  week_tensor.lod.assign({one_batch.lod3});
-  minute_tensor.shape.assign(
-      {static_cast<int>(one_batch.rnn_minute_datas.size()),
-       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
-  minute_tensor.lod.assign({one_batch.lod3});
-  // clang-format on
-  // assign data
-  TensorAssignData<float>(&lod_attention_tensor,
-                          std::vector<std::vector<float>>({{0, 0}}));
-  std::vector<float> tmp_zeros(batch_size * 15, 0.);
-  TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
-  TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
-  TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
-  TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
-  // Set inputs.
-  auto init_zero_tensor1 = init_zero_tensor;
-  init_zero_tensor1.name = "hidden_init";
-  input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
-                       init_zero_tensor1, lod_attention_tensor,
-                       lod_tensor_tensor});
-  for (auto &tensor : *input_slots) {
-    tensor.dtype = PaddleDType::FLOAT32;
-  }
-}
-
-}  // namespace
-
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
-    }
-  }
-}
-// Test with a really complicate model.
-void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = activate_ir;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default
-  config.ir_passes.clear();  // Do not exclude any pass.
-
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-
-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  // Prepare inputs.
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
-
-  base_predictor->Run(input_slots, &base_outputs);
-
-  if (num_threads == 1) {
-    // Prepare inputs.
-    Timer timer;
-    timer.tic();
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(input_slots, &outputs);
-    }
-    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
-    CompareResult(outputs, base_outputs);
-  } else {
-    std::vector<std::thread> threads;
-    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    // because AttentionLSTM's hard code nodeid will be damanged.
-    for (int tid = 0; tid < num_threads; ++tid) {
-      predictors.emplace_back(
-          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-              config));
-    }
-    for (int tid = 0; tid < num_threads; ++tid) {
-      threads.emplace_back([&, tid]() {
-        // Each thread should have local input_slots and outputs.
-        std::vector<PaddleTensor> input_slots;
-        DataRecord data(FLAGS_infer_data, batch_size);
-        PrepareInputs(&input_slots, &data, batch_size);
-        std::vector<PaddleTensor> outputs;
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          predictors[tid]->Run(input_slots, &outputs);
-        }
-        PrintTime(batch_size, num_times, num_threads, tid,
-                  timer.toc() / num_times);
-        CompareResult(outputs, base_outputs);
-      });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
-  }
-
-  if (use_analysis && activate_ir) {
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
-
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-    EXPECT_EQ(num_ops,
-              13);  // After graph optimization, only 13 operators exists.
-  }
-}
-
-// Inference with analysis and IR, easy for profiling independently.
-TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
-
-// Other unit-tests of RNN1, test different options of use_analysis,
-// activate_ir and multi-threads.
-TEST(Analyzer, RNN_tests) {
-  int num_threads[2] = {1, 4};
-  for (auto i : num_threads) {
-    // Directly infer with the original model.
-    TestRNN1Prediction(false, false, i);
-    // Inference with the original model with the analysis turned on, the
-    // analysis
-    // module will transform the program to a data flow graph.
-    TestRNN1Prediction(true, false, i);
-    // Inference with analysis and IR. The IR module will fuse some large
-    // kernels.
-    TestRNN1Prediction(true, true, i);
-  }
+TEST(Analyzer, word2vec_without_analysis) {
+  TestWord2vecPrediction(FLAGS_inference_model_dir);
 }
 
 }  // namespace analysis