
Commit 4c36ef9

FasterUnifiedTransformer/PLATO support dy2sta (#1717)
* Support UnifiedTransformer (UT) dygraph-to-static (dy2sta) export
* Use JIT load for the FasterTransformer custom op
1 parent 4a91065 commit 4c36ef9
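
"dy2sta" is PaddlePaddle's dygraph-to-static conversion. As a rough orientation for the two new files below, a minimal sketch of the general pattern they rely on (a generic illustration, not the commit's own code; `net` is any assumed paddle.nn.Layer) looks like this:

import paddle

# Assume `net` is a paddle.nn.Layer built in eager (dygraph) mode.
net.eval()
static_net = paddle.jit.to_static(
    net,
    input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int32")])
# paddle.jit.save writes <prefix>.pdmodel (program) and <prefix>.pdiparams (weights).
paddle.jit.save(static_net, "./infer_model/model")

The saved program and parameters can then be loaded with paddle.inference, which is what the second new file in this commit does.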

File tree

6 files changed: +372 −63 lines changed

New file: 159 additions & 0 deletions — PLATO/FasterUnifiedTransformer dygraph-to-static export script
@@ -0,0 +1,159 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import argparse
from pprint import pprint

import paddle

from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
from paddlenlp.ops import FasterUnifiedTransformer
from paddlenlp.utils.log import logger


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        default="plato-xl",
        type=str,
        help="The name of the UnifiedTransformer/PLATO model to export. ")
    parser.add_argument(
        "--inference_model_dir",
        default="./infer_model/",
        type=str,
        help="Path to save the exported inference model. ")
    parser.add_argument(
        "--topk",
        default=4,
        type=int,
        help="The number of candidates for top-k sampling. ")
    parser.add_argument(
        "--topp",
        default=1.0,
        type=float,
        help="The probability threshold for top-p sampling. ")
    parser.add_argument(
        "--max_out_len", default=64, type=int, help="Maximum output length. ")
    parser.add_argument(
        "--min_out_len", default=1, type=int, help="Minimum output length. ")
    parser.add_argument(
        "--temperature",
        default=1.0,
        type=float,
        help="The temperature to set. ")
    parser.add_argument(
        "--num_return_sequences",
        default=1,
        type=int,
        help="The number of returned sequences. ")
    parser.add_argument(
        "--use_fp16_decoding",
        action="store_true",
        help="Whether to use fp16 decoding to predict. ")
    parser.add_argument(
        "--decoding_strategy",
        default="sampling",
        choices=["sampling", "beam_search"],
        type=str,
        help="The main strategy to decode. ")
    parser.add_argument(
        "--num_beams",
        default=4,
        type=int,
        help="The number of candidates for beam search. ")
    parser.add_argument(
        "--diversity_rate",
        default=0.0,
        type=float,
        help="The diversity rate for beam search. ")

    args = parser.parse_args()
    return args


def do_predict(args):
    place = "gpu"
    place = paddle.set_device(place)

    model_name = 'plato-xl'
    model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    plato = FasterUnifiedTransformer(
        model=model, use_fp16_decoding=args.use_fp16_decoding)
    # Set evaluate mode
    plato.eval()

    # Convert dygraph model to static graph model. Tensor inputs get
    # InputSpec placeholders; generation hyperparameters are passed as
    # Python scalars and baked into the traced program.
    plato = paddle.jit.to_static(
        plato,
        input_spec=[
            # input_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int32"),
            # token_type_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int32"),
            # attention_mask
            paddle.static.InputSpec(
                shape=[None, 1, None, None], dtype="float32"),
            # seq_len
            paddle.static.InputSpec(
                shape=[None], dtype="int32"),
            # role_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int32"),
            # position_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int32"),
            args.max_out_len,
            args.min_out_len,
            args.topk,
            args.topp,
            args.decoding_strategy,
            tokenizer.cls_token_id,  # cls/bos
            tokenizer.eos_token_id,  # eos
            tokenizer.pad_token_id,  # pad
            args.num_beams,  # num_beams. Used for beam_search.
            args.diversity_rate,  # diversity rate. Used for beam search.
            args.temperature,
            args.num_return_sequences,
        ])

    # Save converted static graph model
    paddle.jit.save(plato, os.path.join(args.inference_model_dir, "plato"))
    logger.info("PLATO has been saved to {}".format(args.inference_model_dir))


if __name__ == "__main__":
    args = parse_args()
    pprint(args)

    do_predict(args)
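
Once the export script above has run, paddle.jit.save should leave the static graph artifacts under --inference_model_dir; the inference script below expects plato.pdmodel and plato.pdiparams there. A minimal sketch for verifying the output (the ./infer_model/ path is just the script's default; nothing here is part of the commit):

import os

infer_dir = "./infer_model/"
for fname in ("plato.pdmodel", "plato.pdiparams"):
    path = os.path.join(infer_dir, fname)
    # Both files are written by paddle.jit.save(plato, .../"plato").
    print(path, "exists" if os.path.exists(path) else "missing")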
New file: 119 additions & 0 deletions — PLATO static graph inference script
@@ -0,0 +1,119 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import numpy as np
from pprint import pprint

import paddle
import paddle.inference as paddle_infer

from paddlenlp.transformers import UnifiedTransformerTokenizer
from paddlenlp.ops.ext_utils import load


def setup_args():
    """Setup arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--inference_model_dir",
        default="./infer_model/",
        type=str,
        help="Path to the exported PLATO inference model. ")
    parser.add_argument(
        "--use_role",
        action="store_true",
        help="Whether to use role embeddings. ")
    parser.add_argument(
        "--position_style",
        default="relative",
        choices=["continuous", "relative"],
        type=str,
        help="The type of positional embedding. Default is relative. ")

    args = parser.parse_args()

    return args


def postprocess_response(token_ids, tokenizer):
    """Post-process the decoded sequence. Truncate from the first <eos>."""
    eos_pos = len(token_ids)
    for i, tok_id in enumerate(token_ids):
        if tok_id == tokenizer.sep_token_id:
            eos_pos = i
            break
    token_ids = token_ids[:eos_pos]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    tokens = tokenizer.merge_subword(tokens)
    return tokens


def infer(args):
    model_name = 'plato-xl'
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    context = [
        "Hi , Becky , what's up ?",
        "Not much , except that my mother-in-law is driving me up the wall .",
        "What's the problem ?"
    ]

    data = tokenizer.dialogue_encode(
        history=context,
        add_start_token_as_response=True,
        return_length=True,
        return_role_ids=args.use_role,
        position_style=args.position_style)

    # JIT compile and load the FasterTransformer custom op library.
    load("FasterTransformer", verbose=True)

    config = paddle_infer.Config(args.inference_model_dir + "plato.pdmodel",
                                 args.inference_model_dir + "plato.pdiparams")
    config.enable_use_gpu(100, 0)
    config.disable_glog_info()
    predictor = paddle_infer.create_predictor(config)

    input_handles = {}
    for name in predictor.get_input_names():
        input_handles[name] = predictor.get_input_handle(name)
        if name == "attention_mask":
            # The square mask shape is hard-coded to match the encoded
            # length of the example context above (41 tokens).
            input_handles[name].copy_from_cpu(
                np.asarray(
                    data[name], dtype="float32").reshape([1, 1, 41, 41]))
        else:
            input_handles[name].copy_from_cpu(
                np.asarray(
                    data[name], dtype="int32").reshape([1, -1]))

    output_handles = [
        predictor.get_output_handle(name)
        for name in predictor.get_output_names()
    ]

    predictor.run()

    output = [output_handle.copy_to_cpu() for output_handle in output_handles]

    for sample in output[0].transpose([1, 0]).tolist():
        print(" ".join(postprocess_response(sample, tokenizer)))


if __name__ == "__main__":
    args = setup_args()
    pprint(args)

    infer(args)
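
The attention_mask reshape in the script above is hard-coded to the encoded length of the example context. To reuse the script with a different context, a sketch of deriving the shape from the tokenizer output could look like the following (it assumes dialogue_encode returns a flat square [seq_len, seq_len] mask, which is not stated in the commit itself):

import numpy as np

mask = np.asarray(data["attention_mask"], dtype="float32")
seq_len = int(np.sqrt(mask.size))  # assumed square mask over the encoded sequence
mask = mask.reshape([1, 1, seq_len, seq_len])
input_handles["attention_mask"].copy_from_cpu(mask)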
