PaddlePaddle
diff --git a/‎examples/benchmark/clue/README.md‎
Lines changed: 44 additions & 0 deletions b/‎examples/benchmark/clue/README.md‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎examples/benchmark/clue/classification/run_clue_classifier_trainer.py‎
Lines changed: 325 additions & 0 deletions b/‎examples/benchmark/clue/classification/run_clue_classifier_trainer.py‎
Lines changed: 325 additions & 0 deletions
diff --git a/‎examples/language_model/ernie-1.0/finetune/question_answering.py‎
Lines changed: 0 additions & 6 deletions b/‎examples/language_model/ernie-1.0/finetune/question_answering.py‎
Lines changed: 0 additions & 6 deletions
@@ -100,6 +100,50 @@ eval loss: 2.572999, acc: 0.112, eval done total : 25.67190170288086 s
 global step 400/20010, epoch: 0, batch: 399, rank_id: 0, loss: 2.631579, lr: 0.0000059970, speed: 2.6238 step/s
 eval loss: 2.476962, acc: 0.1697, eval done total : 25.794789791107178 s
 ```
+#### 使用Trainer启动 CLUE 分类任务
+PaddleNLP提供了Trainer API，本示例新增了`run_clue_classifier_trainer.py`脚本供用户使用。需要从源码安装paddlenlp使用。
+```
+export CUDA_VISIBLE_DEVICES=0
+export TASK_NAME=TNEWS
+export LR=3e-5
+export BS=32
+export EPOCH=6
+export MAX_SEQ_LEN=128
+export MODEL_PATH=roberta-wwm-ext-large
+
+cd classification
+mkdir roberta-wwm-ext-large
+
+python -u ./run_clue_classifier_trainer.py \
+    --model_name_or_path ${MODEL_PATH} \
+    --dataset "clue ${TASK_NAME}" \
+    --max_seq_length ${MAX_SEQ_LEN} \
+    --per_device_train_batch_size ${BS}   \
+    --per_device_eval_batch_size ${BS}   \
+    --learning_rate ${LR} \
+    --num_train_epochs ${EPOCH} \
+    --logging_steps 100 \
+    --seed 42  \
+    --save_steps 100 \
+    --warmup_ratio 0.1 \
+    --weight_decay 0.01 \
+    --adam_epsilon 1e-8 \
+    --output_dir ${MODEL_PATH}/models/${TASK_NAME}/${LR}_${BS}/ \
+    --device gpu  \
+    --do_train \
+    --do_eval \
+    --metric_for_best_model "eval_accuracy" \
+    --load_best_model_at_end \
+    --save_total_limit 3
+```
+大部分参数含义如上文所述，这里简要介绍一些新参数:
+- `dataset`, 同上文`task_name`，格式为“数据集名 任务名”（以空格分隔，如 `"clue tnews"`），其中任务名为小写字母。表示 Fine-tuning 的分类任务，当前支持 afqmc、tnews、iflytek、ocnli、cmnli、csl、cluewsc2020。
+- `per_device_train_batch_size` 同上文`batch_size`。训练时，每次迭代**每张卡**上的样本数目。
+- `per_device_eval_batch_size` 同上文`batch_size`。评估时，每次迭代**每张卡**上的样本数目。
+- `warmup_ratio` 同上文`warmup_proportion`，warmup步数占总步数的比例。
+- `metric_for_best_model` 评估时，用于挑选最优模型的评估指标。
+- `load_best_model_at_end` 训练结束时，是否加载评估结果最好的 ckpt。
+- `save_total_limit` 保存的 ckpt 数量的最大限制。
 
 ### 启动 CLUE 阅读理解任务
 以 CLUE 的 C<sup>3</sup> 任务为例，多卡启动 CLUE 任务进行 Fine-tuning 的方式如下：
 
@@ -0,0 +1,325 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from functools import partial
+from typing import Optional
+from dataclasses import dataclass, field
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.metric import Accuracy
+from paddlenlp.data import DataCollatorWithPadding
+from paddlenlp.datasets import load_dataset
+from paddlenlp.trainer import (
+    PdArgumentParser,
+    TrainingArguments,
+    Trainer, )
+from paddlenlp.trainer.trainer_utils import get_last_checkpoint
+from paddlenlp.transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification, )
+from paddlenlp.utils.log import logger
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    Using `PdArgumentParser` we can turn this class into argparse arguments to be able to 
+    specify them on the command line.
+    """
+
+    dataset: str = field(
+        default=None,
+        metadata={
+            "help": "The name of the dataset to use (via the datasets library)."
+        })
+
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help":
+            "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        }, )
+    do_lower_case: bool = field(
+        default=False,
+        metadata={
+            "help":
+            "Whether to lower case the input text. Should be True for uncased models and False for cased models."
+        }, )
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(metadata={
+        "help":
+        "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html"
+    })
+    config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help":
+            "Pretrained config name or path if not the same as model_name"
+        })
+    tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help":
+            "Pretrained tokenizer name or path if not the same as model_name"
+        })
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={
+            "help":
+            "Path to directory to store the pretrained models downloaded from huggingface.co"
+        }, )
+    export_model_dir: Optional[str] = field(
+        default=None,
+        metadata={
+            "help":
+            "Path to directory to store the pretrained models downloaded from huggingface.co"
+        }, )
+
+
+# Data pre-process function for clue benchmark datatset
+def convert_clue(example,
+                 label_list,
+                 tokenizer=None,
+                 max_seq_length=512,
+                 **kwargs):
+    """convert a glue example into necessary features"""
+    is_test = False
+    if 'label' not in example.keys():
+        is_test = True
+
+    if not is_test:
+        # `label_list == None` is for regression task
+        label_dtype = "int64" if label_list else "float32"
+        # print("label_list", label_list)
+        # Get the label
+        # example['label'] = np.array(example["label"], dtype="int64")
+        example['label'] = int(example[
+            "label"]) if label_dtype != "float32" else float(example["label"])
+        label = example['label']
+    # Convert raw text to feature
+    if 'keyword' in example:  # CSL
+        sentence1 = " ".join(example['keyword'])
+        example = {
+            'sentence1': sentence1,
+            'sentence2': example['abst'],
+            'label': example['label']
+        }
+    elif 'target' in example:  # wsc
+        text, query, pronoun, query_idx, pronoun_idx = example['text'], example[
+            'target']['span1_text'], example['target']['span2_text'], example[
+                'target']['span1_index'], example['target']['span2_index']
+        text_list = list(text)
+        assert text[pronoun_idx:(pronoun_idx + len(pronoun)
+                                 )] == pronoun, "pronoun: {}".format(pronoun)
+        assert text[query_idx:(query_idx + len(query)
+                               )] == query, "query: {}".format(query)
+        if pronoun_idx > query_idx:
+            text_list.insert(query_idx, "_")
+            text_list.insert(query_idx + len(query) + 1, "_")
+            text_list.insert(pronoun_idx + 2, "[")
+            text_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]")
+        else:
+            text_list.insert(pronoun_idx, "[")
+            text_list.insert(pronoun_idx + len(pronoun) + 1, "]")
+            text_list.insert(query_idx + 2, "_")
+            text_list.insert(query_idx + len(query) + 2 + 1, "_")
+        text = "".join(text_list)
+        example['sentence'] = text
+
+    if tokenizer is None:
+        return example
+    if 'sentence' in example:
+        example = tokenizer(example['sentence'], max_seq_len=max_seq_length)
+    elif 'sentence1' in example:
+        example = tokenizer(
+            example['sentence1'],
+            text_pair=example['sentence2'],
+            max_seq_len=max_seq_length)
+
+    if not is_test:
+        return {
+            "input_ids": example['input_ids'],
+            "token_type_ids": example['token_type_ids'],
+            "labels": label
+        }
+    else:
+        return {
+            "input_ids": example['input_ids'],
+            "token_type_ids": example['token_type_ids']
+        }
+
+
+def clue_trans_fn(example, tokenizer, args):
+    return convert_clue(
+        example,
+        tokenizer=tokenizer,
+        label_list=args.label_list,
+        max_seq_length=args.max_seq_length)
+
+
+def main():
+    parser = PdArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Log model and data config
+    training_args.print_config(model_args, "Model")
+    training_args.print_config(data_args, "Data")
+
+    paddle.set_device(training_args.device)
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
+        +
+        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(
+            training_args.output_dir
+    ) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(
+                os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome.")
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    data_args.dataset = data_args.dataset.strip()
+
+    dataset_config = data_args.dataset.split(" ")
+    print(dataset_config)
+    raw_datasets = load_dataset(
+        dataset_config[0],
+        name=None if len(dataset_config) <= 1 else dataset_config[1],
+        splits=('train', 'dev'))
+
+    data_args.label_list = getattr(raw_datasets['train'], "label_list", None)
+    num_classes = 1 if raw_datasets["train"].label_list == None else len(
+        raw_datasets['train'].label_list)
+
+    # Define tokenizer, model, loss function. 
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_args.model_name_or_path, num_classes=num_classes)
+    criterion = nn.loss.CrossEntropyLoss(
+    ) if data_args.label_list else nn.loss.MSELoss()
+
+    # Define dataset pre-process function
+    trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
+
+    # Define data collector
+    data_collator = DataCollatorWithPadding(tokenizer)
+
+    # Dataset pre-process
+    if training_args.do_train:
+        train_dataset = raw_datasets["train"].map(trans_fn)
+    if training_args.do_eval:
+        eval_dataset = raw_datasets["dev"].map(trans_fn)
+    if training_args.do_predict:
+        test_dataset = raw_datasets["test"].map(trans_fn)
+
+    # Define the metrics of tasks.
+    def compute_metrics(p):
+        preds = p.predictions[0] if isinstance(p.predictions,
+                                               tuple) else p.predictions
+
+        preds = paddle.to_tensor(preds)
+        label = paddle.to_tensor(p.label_ids)
+
+        probs = F.softmax(preds, axis=1)
+        metric = Accuracy()
+        metric.reset()
+        result = metric.compute(preds, label)
+        metric.update(result)
+        accu = metric.accumulate()
+        metric.reset()
+        return {"accuracy": accu}
+
+    trainer = Trainer(
+        model=model,
+        criterion=criterion,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics, )
+
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        metrics = train_result.metrics
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+
+    # Evaluate and tests model
+    if training_args.do_eval:
+        eval_metrics = trainer.evaluate()
+        trainer.log_metrics("eval", eval_metrics)
+
+    if training_args.do_predict:
+        test_ret = trainer.predict(test_dataset)
+        trainer.log_metrics("test", test_ret.metrics)
+        if test_ret.label_ids is None:
+            paddle.save(
+                test_ret.predictions,
+                os.path.join(training_args.output_dir, "test_results.pdtensor"),
+            )
+
+    # export inference model
+    if training_args.do_export:
+        input_spec = [
+            paddle.static.InputSpec(
+                shape=[None, None], dtype="int64"),  # input_ids
+            paddle.static.InputSpec(
+                shape=[None, None], dtype="int64")  # segment_ids
+        ]
+        trainer.export_model(
+            input_spec=input_spec,
+            load_best_model=True,
+            output_dir=model_args.export_model_dir)
+
+
+# Run the fine-tuning pipeline only when the file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -16,14 +16,8 @@
 import time
 import json
 import os
-import sys
-from functools import partial
 
-import numpy as np
 import paddle
-import paddlenlp as ppnlp
-from paddlenlp.data import Pad, Stack, Tuple
-from paddlenlp.utils.log import logger
 from paddlenlp.trainer import Trainer
 from paddlenlp.trainer.trainer_utils import PredictionOutput