update ernie text cls ci (#823)

Steffy-zxf · web-flow · commit ebc2b1b7992e · 2021-07-30T22:24:02.000+08:00
* add ernie text cls ci

* mv ernie_text_cls_params.txt to ernie_text_cls directory
diff --git a/tests/ernie_text_cls/ernie_text_cls_params.txt b/tests/ernie_text_cls/ernie_text_cls_params.txt
@@ -2,18 +2,18 @@
 model_name:ernie_text_cls
 python:python
 gpu_list:0|0,1
-Global.use_gpu:null
-Global.auto_cast:null
+null:null
+null:null
 --epoch:1
 --save_dir:./output/
 --batch_size:32
-Global.pretrained_model:null
-train_model_name:null
-train_infer_img_dir:null
+null:null
+null:null
+null:null
 null:null
 ##
 trainer:norm
-norm_train:../examples/text_classification/pretrained_models/train.py
+norm_train:./ernie_text_cls/train.py
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -27,14 +27,17 @@ null:null
 ===========================infer_params===========================
 --output_path:./output
 --params_path: ./output/model_100/model_state.pdparams
-norm_export:../examples/text_classification/pretrained_models/export_model.py
+norm_export:./ernie_text_cls/export_model.py
 quant_export:null
 fpgm_export:null
 distill_export:null
 null:null
 null:null
 ##
-inference:../examples/text_classification/pretrained_models/deploy/python/predict.py
+infer_model:null
+infer_export:null
+infer_quant:null
+inference:./ernie_text_cls/predict.py
 --device:cpu|gpu
 --enable_mkldnn:True|False
 --cpu_threads:1|6
@@ -45,4 +48,4 @@ inference:../examples/text_classification/pretrained_models/deploy/python/predic
 --image_dir:null
 --save_log_path:null
 --benchmark:True
-null:nul
+null:nul
diff --git a/tests/ernie_text_cls/export_model.py b/tests/ernie_text_cls/export_model.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+from functools import partial
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+import paddlenlp as ppnlp
+from paddlenlp.data import Stack, Tuple, Pad
+
+# yapf: disable
+parser = argparse.ArgumentParser()
+parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.")
+parser.add_argument("--output_path", type=str, default='./output', help="The path of model parameter in static graph to be saved.")
+args = parser.parse_args()
+# yapf: enable

+if __name__ == "__main__":
+    # The number of labels should be in accordance with the training dataset.
+    label_map = {0: 'negative', 1: 'positive'}
+    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
+        "ernie-tiny", num_classes=len(label_map))

+    # Load the trained weights. Warn when the checkpoint is missing, because
+    # the export would otherwise silently skip loading --params_path.
+    if args.params_path and os.path.isfile(args.params_path):
+        model.set_dict(paddle.load(args.params_path))
+        print("Loaded parameters from %s" % args.params_path)
+    else:
+        print("Warning: parameters file %s not found, nothing was loaded." %
+              args.params_path)
+    model.eval()

+    # Convert to static graph with specific input description
+    input_spec = [
+        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
+        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # segment_ids
+    ]
+    model = paddle.jit.to_static(model, input_spec=input_spec)
+    # Save in static graph model.
+    paddle.jit.save(model, os.path.join(args.output_path, "inference"))
diff --git a/tests/ernie_text_cls/predict.py b/tests/ernie_text_cls/predict.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+import numpy as np
+import paddle
+import paddlenlp as ppnlp
+from scipy.special import softmax
+from paddle import inference
+from paddlenlp.data import Stack, Tuple, Pad
+from paddlenlp.datasets import load_dataset
+from paddlenlp.utils.log import logger
+
+# yapf: disable
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_dir", type=str, required=True,
+    help="The directory to static model.")

+parser.add_argument("--max_seq_length", default=128, type=int,
+    help="The maximum total input sequence length after tokenization. Sequences "
+    "longer than this will be truncated, sequences shorter will be padded.")
+parser.add_argument("--batch_size", default=2, type=int,
+    help="Batch size per GPU/CPU for prediction.")
+parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu",
+    help="Select which device to run prediction on, defaults to gpu.")
+# NOTE(review): type=eval evaluates the raw command line string; only pass True/False.
+parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False],
+    help='Enable to use tensorrt to speed up.')
+parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"],
+    help='The tensorrt precision.')

+parser.add_argument('--cpu_threads', default=10, type=int,
+    help='Number of threads to predict when using cpu.')
+parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False],
+    help='Enable to use mkldnn to speed up when using cpu.')

+parser.add_argument("--benchmark", type=eval, default=False,
+    help="To log some information about environment and running.")
+parser.add_argument("--save_log_path", type=str, default="./log_output/",
+    help="The file path to save log.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def convert_example(example,
+                    tokenizer,
+                    label_list,
+                    max_seq_length=512,
+                    is_test=False):
+    """
+    Tokenizes one example into the model inputs for sequence classification.

+    The tokenizer must return a mapping with ``input_ids`` and ``token_type_ids``.

+    A BERT sequence has the following format:

+    - single sequence: ``[CLS] X [SEP]``
+    - pair of sequences: ``[CLS] A [SEP] B [SEP]``

+    A BERT sequence pair mask has the following format:
+    ::
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |

+    If only one sequence, only returns the first portion of the mask (0's).

+    Args:
+        example(obj:`str` or `list`): The raw text when ``is_test`` is True,
+            otherwise a ``(text, label)`` pair whose label is in ``label_list``.
+        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
+            which contains most of the methods. Users should refer to the superclass for more information regarding methods.
+        label_list(obj:`list[str]`): All the labels that the data has.
+        max_seq_length(obj:`int`): The maximum total input sequence length after tokenization,
+            forwarded to the tokenizer as ``max_seq_len``.
+        is_test(obj:`bool`, defaults to `False`): Whether the example lacks a label.

+    Returns:
+        input_ids(obj:`list[int]`): The list of token ids.
+        segment_ids(obj: `list[int]`): List of sequence pair mask.
+        label(obj:`numpy.array`, data type of int64, optional): The input label if not is_test.
+    """
+    # A labeled example carries its label next to the text. Unpack it here:
+    # the label used to be read below without ever being assigned.
+    if is_test:
+        text = example
+    else:
+        text, label = example
+    encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
+    input_ids = encoded_inputs["input_ids"]
+    segment_ids = encoded_inputs["token_type_ids"]
+    if is_test:
+        return input_ids, segment_ids

+    # Map the label to its index in label_list.
+    label_map = {name: index for index, name in enumerate(label_list)}
+    label = np.array([label_map[label]], dtype="int64")
+    return input_ids, segment_ids, label
+
+
+class Predictor(object):
+    """Wraps a Paddle Inference predictor for ERNIE text classification."""

+    def __init__(self,
+                 model_dir,
+                 device="gpu",
+                 max_seq_length=128,
+                 batch_size=32,
+                 use_tensorrt=False,
+                 precision="fp32",
+                 cpu_threads=10,
+                 enable_mkldnn=False):
+        """
+        Creates the predictor from the static model files in ``model_dir``.
+        Raises ValueError when the model or params file does not exist.
+        """
+        self.max_seq_length = max_seq_length
+        self.batch_size = batch_size

+        model_file = model_dir + "/inference.pdmodel"
+        params_file = model_dir + "/inference.pdiparams"
+        if not os.path.exists(model_file):
+            raise ValueError("not find model file path {}".format(model_file))
+        if not os.path.exists(params_file):
+            raise ValueError("not find params file path {}".format(params_file))
+        config = paddle.inference.Config(model_file, params_file)

+        if device == "gpu":
+            # set GPU configs accordingly, such as gpu memory and tensorrt
+            config.enable_use_gpu(100, 0)
+            precision_map = {
+                "fp16": inference.PrecisionType.Half,
+                "fp32": inference.PrecisionType.Float32,
+                "int8": inference.PrecisionType.Int8
+            }
+            precision_mode = precision_map[precision]

+            # Use the constructor arguments rather than the global CLI args.
+            if use_tensorrt:
+                config.enable_tensorrt_engine(
+                    max_batch_size=batch_size,
+                    min_subgraph_size=30,
+                    precision_mode=precision_mode)
+        elif device == "cpu":
+            # set CPU configs accordingly, such as mkldnn and math threads
+            config.disable_gpu()
+            if enable_mkldnn:
+                # cache 10 different shapes for mkldnn to avoid memory leak
+                config.set_mkldnn_cache_capacity(10)
+                config.enable_mkldnn()
+            config.set_cpu_math_library_num_threads(cpu_threads)
+        elif device == "xpu":
+            # set XPU configs accordingly
+            config.enable_xpu(100)

+        config.switch_use_feed_fetch_ops(False)
+        self.predictor = paddle.inference.create_predictor(config)
+        self.input_handles = [
+            self.predictor.get_input_handle(name)
+            for name in self.predictor.get_input_names()
+        ]
+        self.output_handle = self.predictor.get_output_handle(
+            self.predictor.get_output_names()[0])

+        if args.benchmark:
+            import auto_log
+            self.autolog = auto_log.AutoLogger(
+                model_name="ernie-tiny",
+                model_precision=precision,
+                batch_size=self.batch_size,
+                data_shape="dynamic",
+                save_path=args.save_log_path,
+                inference_config=config,
+                pids=os.getpid(),
+                process_name=None,
+                gpu_ids=0,
+                time_keys=[
+                    'preprocess_time', 'inference_time', 'postprocess_time'
+                ],
+                warmup=0,
+                logger=logger)

+    def predict(self, data, tokenizer, label_map):
+        """
+        Predicts the data labels.

+        Args:
+            data (obj:`List(str)`): The batch data whose each element is a raw text.
+            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
+                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
+            label_map(obj:`dict`): The label id (key) to label str (value) map.

+        Returns:
+            results(obj:`list`): The predicted label str for each text in ``data``.
+        """
+        if args.benchmark:
+            self.autolog.times.start()

+        examples = [
+            convert_example(text, tokenizer, label_map.values(),
+                            self.max_seq_length, is_test=True)
+            for text in data
+        ]

+        # Segment ids use the pad token type id, not the pad token id.
+        batchify_fn = Tuple(
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
+            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
+        )

+        if args.benchmark:
+            self.autolog.times.stamp()

+        input_ids, segment_ids = batchify_fn(examples)
+        self.input_handles[0].copy_from_cpu(input_ids)
+        self.input_handles[1].copy_from_cpu(segment_ids)
+        self.predictor.run()
+        logits = self.output_handle.copy_to_cpu()
+        if args.benchmark:
+            self.autolog.times.stamp()

+        probs = softmax(logits, axis=1)
+        idx = np.argmax(probs, axis=1).tolist()
+        labels = [label_map[i] for i in idx]

+        if args.benchmark:
+            self.autolog.times.end(stamp=True)

+        return labels
+
+
+if __name__ == "__main__":
+    # Build the predictor from the command line options.
+    predictor = Predictor(
+        args.model_dir, args.device, args.max_seq_length, args.batch_size,
+        args.use_tensorrt, args.precision, args.cpu_threads,
+        args.enable_mkldnn)

+    # ErnieTinyTokenizer is special for ernie-tiny pretrained model.
+    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
+        'ernie-tiny')
+    label_map = {0: 'negative', 1: 'positive'}
+    test_ds = load_dataset("chnsenticorp", splits=["test"])
+    data = [sample["text"] for sample in test_ds]

+    # All batches are predicted first; the labels are printed afterwards.
+    results = []
+    for start in range(0, len(data), args.batch_size):
+        batch_data = data[start:start + args.batch_size]
+        results.extend(predictor.predict(batch_data, tokenizer, label_map))
+    for text, label in zip(data, results):
+        print('Data: {} \t Label: {}'.format(text, label))

+    if args.benchmark:
+        predictor.autolog.report()
diff --git a/tests/ernie_text_cls/train.py b/tests/ernie_text_cls/train.py