
Commit 480a33d

[Faster Transformer] Refine transformer cpp inference demo (#575)

FrostML and ZeyuChen authored

* refine faster transformer transformer demo

Co-authored-by: Zeyu Chen <[email protected]>

1 parent 57ce415 · commit 480a33d

File tree: 4 files changed, +45 −35 lines

examples/machine_translation/transformer/faster_transformer/README.md

Lines changed: 3 additions & 8 deletions
````diff
@@ -237,19 +237,14 @@ cd ../
 
 After compilation, an executable named `transformer_e2e` appears under `build/bin/`. Execution is driven by setting the corresponding arguments.
 
-``` sh
-cd bin/
-./transformer_e2e <batch_size> <gpu_id> <model_directory> <dict_directory> <input_data>
-```
-
 ### Export model files usable by the inference library based on the Faster Transformer custom op
 
 We provide a ready-to-use checkpoint of a base model already trained with the dynamic graph; the current checkpoint was trained on the WMT English-German translation task. It can be downloaded from [tranformer-base-wmt_ende_bpe](https://paddlenlp.bj.bcebos.com/models/transformers/transformer/tranformer-base-wmt_ende_bpe.tar.gz).
 
 To use the C++ inference library, we first need to export the dynamic-graph checkpoint into the model and parameter files the inference library can consume. Running `export_model.py` accomplishes this.
 
 ``` sh
-python export_model.py --config ../configs/transformer.base.yaml --decoding_lib ../../../../paddlenlp/ops/src/build/lib/libdecoding_op.so --decoding_strategy beam_search --beam_size 5
+python export_model.py --config ../configs/transformer.base.yaml --decoding_lib ../../../../paddlenlp/ops/build/lib/libdecoding_op.so --decoding_strategy beam_search --beam_size 5
 ```
 
 Note: the `libdecoding_op.so` shared library here is the lib built by following the earlier section **`Using the custom op from the Python dynamic graph`**; the current **`Using the custom op from the C++ inference library`** step does not include this compiled shared library. Therefore, if you additionally need to export the model before using the inference library, you have to build twice:
@@ -269,7 +264,7 @@ python export_model.py --config ../configs/transformer.base.yaml --decoding_lib
 
 ``` sh
 cd bin/
-./transformer_e2e <batch_size> <gpu_id> <model_directory> <dict_directory> <input_data>
+./transformer_e2e -batch_size <batch_size> -beam_size <beam_size> -gpu_id <gpu_id> -model_dir <model_directory> -vocab_dir <dict_directory> -data_dir <input_data>
 ```
 
 Here `<model_directory>` is the exported Paddle Inference model mentioned above.
@@ -279,7 +274,7 @@ cd bin/
 ``` sh
 cd bin/
 ../third-party/build/bin/decoding_gemm 8 5 8 64 38512 256 512 0
-./transformer_e2e 8 0 ./infer_model/ DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en
+./transformer_e2e -batch_size 8 -beam_size 5 -gpu_id 0 -model_dir ./infer_model/ -vocab_dir DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 -data_dir DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en
 ```
 
 Where:
````
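The named flags in the updated commands are handled by gflags in the refactored demo (see the `transformer_e2e.cc` diff below). As a minimal, self-contained sketch of that mechanism — the flag names and defaults are taken from the diff, while the `main` body here is illustrative only and simply echoes the parsed values:

``` cpp
// Minimal gflags sketch mirroring the demo's new flags (illustrative only).
// Build by linking against gflags, e.g.: g++ flags_sketch.cc -lgflags
#include <iostream>
#include <string>

#include <gflags/gflags.h>

// Same form as in transformer_e2e.cc: flag name, default value, help text.
DEFINE_int32(batch_size, 1, "Batch size to do inference. ");
DEFINE_int32(beam_size, 5, "Beam size to do inference. ");
DEFINE_int32(gpu_id, 0, "The gpu id to do inference. ");
DEFINE_string(model_dir, "./infer_model/", "The directory to the inference model. ");

int main(int argc, char** argv) {
  // gflags accepts both -flag and --flag spellings, which is why the README
  // examples can write `-batch_size 8`.
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);

  // Parsed values are exposed as FLAGS_<name> globals.
  std::cout << "batch_size=" << FLAGS_batch_size
            << " beam_size=" << FLAGS_beam_size
            << " gpu_id=" << FLAGS_gpu_id
            << " model_dir=" << FLAGS_model_dir << std::endl;
  return 0;
}
```

Because every flag carries a declared default, any subset may be omitted, so `./transformer_e2e -batch_size 8` alone is a valid invocation; the old positional interface required all five arguments in a fixed order.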

examples/machine_translation/transformer/faster_transformer/export_model.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ def parse_args():
         help="Path of the config file. ")
     parser.add_argument(
         "--decoding_lib",
-        default="../../../../paddlenlp/ops/src/build/lib/libdecoding_op.so",
+        default="../../../../paddlenlp/ops/build/lib/libdecoding_op.so",
         type=str,
         help="Path of libdecoding_op.so. ")
     parser.add_argument(
```

paddlenlp/ops/README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -236,15 +236,15 @@ cd ../
 
 ``` sh
 cd bin/
-./transformer_e2e <batch_size> <gpu_id> <model_directory> <dict_directory> <input_data>
+./transformer_e2e -batch_size <batch_size> -beam_size <beam_size> -gpu_id <gpu_id> -model_dir <model_directory> -vocab_dir <dict_directory> -data_dir <input_data>
 ```
 
 For example:
 
 ``` sh
 cd bin/
 ../third-party/build/bin/decoding_gemm 8 5 8 64 38512 256 512 0
-./transformer_e2e 8 0 ./infer_model/ DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en
+./transformer_e2e -batch_size 8 -beam_size 5 -gpu_id 0 -model_dir ./infer_model/ -vocab_dir DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 -data_dir DATA_HOME/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en
 ```
 
 Where:
````

paddlenlp/ops/faster_transformer/src/demo/transformer_e2e.cc

Lines changed: 39 additions & 24 deletions
```diff
@@ -33,19 +33,31 @@ limitations under the License. */
 
 using namespace paddle_infer;
 
+DEFINE_int32(batch_size, 1, "Batch size to do inference. ");
+DEFINE_int32(beam_size, 5, "Beam size to do inference. ");
+DEFINE_int32(gpu_id, 0, "The gpu id to do inference. ");
+DEFINE_string(model_dir,
+              "./infer_model/",
+              "The directory to the inference model. ");
+DEFINE_string(vocab_dir,
+              "./vocab_all.bpe.33708",
+              "The directory to the vocabulary file. ");
+DEFINE_string(data_dir,
+              "./newstest2014.tok.bpe.33708.en",
+              "The directory to the input data. ");
 
 std::string model_dir = "";
-std::string dict_dir = "";
-std::string datapath = "";
+std::string vocab_dir = "";
+std::string data_dir = "";
 
-const int eos_idx = 1;
-const int pad_idx = 0;
-const int beam_size = 5;
-const int max_length = 256;
-const int n_best = 1;
+const int EOS_IDX = 1;
+const int PAD_IDX = 0;
+const int MAX_LENGTH = 256;
+const int N_BEST = 1;
 
 int batch_size = 1;
 int gpu_id = 0;
+int beam_size = 5;
 
 namespace paddle {
 namespace inference {
@@ -69,18 +81,18 @@ bool get_result_tensor(const std::unique_ptr<paddle_infer::Tensor>& seq_ids,
   seq_ids_out.resize(out_num);
   seq_ids->CopyToCpu(seq_ids_out.data());
 
-  dataresultvec.resize(batch_size * n_best);
+  dataresultvec.resize(batch_size * N_BEST);
   auto max_output_length = output_shape[0];
 
   for (int bsz = 0; bsz < output_shape[1]; ++bsz) {
-    for (int k = 0; k < n_best; ++k) {
-      dataresultvec[bsz * n_best + k].result_q = "";
+    for (int k = 0; k < N_BEST; ++k) {
+      dataresultvec[bsz * N_BEST + k].result_q = "";
       for (int len = 0; len < max_output_length; ++len) {
         if (seq_ids_out[len * batch_size * beam_size + bsz * beam_size + k] ==
-            eos_idx)
+            EOS_IDX)
           break;
-        dataresultvec[bsz * n_best + k].result_q =
-            dataresultvec[bsz * n_best + k].result_q +
+        dataresultvec[bsz * N_BEST + k].result_q =
+            dataresultvec[bsz * N_BEST + k].result_q +
             num2word_dict[seq_ids_out[len * batch_size * beam_size +
                                       bsz * beam_size + k]] +
             " ";
@@ -110,7 +122,7 @@ class DataReader {
       split(line, ' ', &word_data);
       std::string query_str = "";
       for (int j = 0; j < word_data.size(); ++j) {
-        if (j >= max_length) {
+        if (j >= MAX_LENGTH) {
           break;
         }
         query_str += word_data[j];
@@ -121,9 +133,9 @@ class DataReader {
         }
       }
       source_query_vec.push_back(query_str);
-      data_input.src_data.push_back(eos_idx);
+      data_input.src_data.push_back(EOS_IDX);
       max_len = std::max(max_len, static_cast<int>(data_input.src_data.size()));
-      max_len = std::min(max_len, max_length);
+      max_len = std::min(max_len, MAX_LENGTH);
       data_input_vec.push_back(data_input);
     }
     if (data_input_vec.empty()) {
@@ -134,7 +146,7 @@ class DataReader {
   }
 
   bool GetWordDict() {
-    std::ifstream fin(dict_dir);
+    std::ifstream fin(vocab_dir);
     std::string line;
     int k = 0;
     while (std::getline(fin, line)) {
@@ -165,7 +177,7 @@ class DataReader {
       if (k < data_input_vec[i].src_data.size()) {
         src_word_vec[i * max_len + k] = data_input_vec[i].src_data[k];
       } else {
-        src_word_vec[i * max_len + k] = pad_idx;
+        src_word_vec[i * max_len + k] = PAD_IDX;
       }
     }
   }
@@ -204,7 +216,7 @@ void Main(int batch_size, int gpu_id) {
   config.SwitchUseFeedFetchOps(false);
   config.SwitchSpecifyInputNames(true);
   auto predictor = CreatePredictor(config);
-  DataReader reader(datapath);
+  DataReader reader(data_dir);
   reader.GetWordDict();
 
   double whole_time = 0;
@@ -242,12 +254,15 @@ void Main(int batch_size, int gpu_id) {
 }  // namespace paddle
 
 int main(int argc, char** argv) {
-  batch_size = std::stoi(std::string(argv[1]));
-  gpu_id = std::stoi(std::string(argv[2]));
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  model_dir = std::string(argv[3]);
-  dict_dir = std::string(argv[4]);
-  datapath = std::string(argv[5]);
+  batch_size = FLAGS_batch_size;
+  gpu_id = FLAGS_gpu_id;
+  beam_size = FLAGS_beam_size;
+
+  model_dir = FLAGS_model_dir;
+  vocab_dir = FLAGS_vocab_dir;
+  data_dir = FLAGS_data_dir;
 
   paddle::inference::Main(batch_size, gpu_id);
```