
Commit 6f0535a

Calc CHID acc by considering the global probability (#1891)
* compute acc by considering the global probability
* remove debug var
* remove debug var
* update readme results
* fix cmrc2018 max answer length
* update readme
* add lr, bs in readme
1 parent 07c0085 commit 6f0535a

3 files changed (+98, -46 lines)

examples/benchmark/clue/README.md

Lines changed: 18 additions & 13 deletions
@@ -9,29 +9,31 @@
Fine-tuning various Chinese pre-trained models gives the following results on the CLUE dev sets:


-| Model                 | AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL   | C<sup>3</sup> |
-| --------------------- | ----- | ----- | ------- | ----- | ----- | ----------- | ----- | ------------- |
-| RoBERTa-wwm-ext-large | 76.20 | 59.50 | 62.10   | 84.02 | 79.15 | 90.79       | 82.03 | 75.79         |
+| Model                 | AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL   | CMRC2018    | CHID  | C<sup>3</sup> |
+| --------------------- | ----- | ----- | ------- | ----- | ----- | ----------- | ----- | ----------- | ----- | ------------- |
+| RoBERTa-wwm-ext-large | 75.32 | 59.33 | 61.91   | 83.87 | 78.81 | 91.78       | 81.80 | 70.67/90.61 | 85.83 | 74.90         |


-The AFQMC, TNEWS, IFLYTEK, CMNLI, OCNLI, CLUEWSC2020, CSL, and C<sup>3</sup> tasks are all evaluated with Accuracy.
-The first 7 are classification tasks and the last 1 is a reading-comprehension task; the training procedures for the two task types are introduced separately below.
+The AFQMC, TNEWS, IFLYTEK, CMNLI, OCNLI, CLUEWSC2020, CSL, CHID, and C<sup>3</sup> tasks are all evaluated with Accuracy. CMRC2018 is evaluated with EM/F1.
+The first 7 are classification tasks and the last 3 are reading-comprehension tasks; the training procedures for the two task types are introduced separately below.

**NOTE: The evaluation protocol is as follows**
1. All of the tasks above tune hyperparameters via Grid Search. Classification tasks evaluate on the dev set every 100 training steps, reading-comprehension tasks evaluate on the dev set once per epoch, and the best dev-set result is the number reported in the table.

2. Grid Search ranges for the classification tasks: batch_size: 16, 32, 64; learning rates: 1e-5, 2e-5, 3e-5, 5e-5. Because CLUEWSC2020 is sensitive to batch_size, batch_size = 8 is additionally searched when evaluating CLUEWSC2020.

-3. Grid Search ranges for the reading-comprehension tasks: batch_size: 24, 32; learning rates: 1e-5, 2e-5, 3e-5.
+3. Grid Search ranges for the reading-comprehension tasks: batch_size: 24, 32; learning rates: 1e-5, 2e-5, 3e-5. All reading-comprehension tasks are trained on multiple GPUs, and the batch_size in the Grid Search is the total batch size summed over all cards.

-4. The epoch, max_seq_length, and warmup proportion used for each task are listed in the table below
+4. The fixed hyperparameters used for each task are listed in the table below

| TASK              | AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL  | CMRC2018 | CHID | C<sup>3</sup> |
| ----------------- | ----- | ----- | ------- | ----- | ----- | ----------- | ---- | -------- | ---- | ------------- |
| epoch             | 3     | 3     | 3       | 2     | 5     | 50          | 5    | 2        | 3    | 8             |
| max_seq_length    | 128   | 128   | 128     | 128   | 128   | 128         | 128  | 512      | 64   | 512           |
-| warmup_proportion | 0.1   | 0.1   | 0.1     | 0.1   | 0.1   | 0.1         | 0.1  | 0.1      | 0.06 | 0.05          |
-
+| warmup_proportion | 0.1   | 0.1   | 0.1     | 0.1   | 0.1   | 0.1         | 0.1  | 0.1      | 0.06 | 0.1           |
+| num_cards         | 1     | 1     | 1       | 1     | 1     | 1           | 1    | 2        | 4    | 4             |
+| learning_rate     | 1e-5  | 3e-5  | 3e-5    | 1e-5  | 1e-5  | 1e-5        | 2e-5 | 3e-5     | 1e-5 | 2e-5          |
+| batch_size        | 32    | 32    | 32      | 16    | 16    | 16          | 16   | 32       | 24   | 24            |


## Reproduce the model results with one command
@@ -100,26 +102,29 @@ eval loss: 2.476962, acc: 0.1697, eval done total : 25.794789791107178 s
```

### Launch a CLUE reading-comprehension task
-Taking the CLUE C<sup>3</sup> task as an example, launch fine-tuning for a CLUE task as follows:
+Taking the CLUE C<sup>3</sup> task as an example, launch multi-GPU fine-tuning for a CLUE task as follows:

```shell

cd mrc

mkdir roberta-wwm-ext-large
MODEL_PATH=roberta-wwm-ext-large
-BATCH_SIZE=24
+BATCH_SIZE=6
LR=2e-5

-python -u run_c3.py \
+python -m paddle.distributed.launch --gpus "0,1,2,3" run_c3.py \
    --model_name_or_path ${MODEL_PATH} \
    --batch_size ${BATCH_SIZE} \
    --learning_rate ${LR} \
    --max_seq_length 512 \
    --num_train_epochs 8 \
-    --warmup_proportion 0.05 \
+    --do_train \
+    --warmup_proportion 0.1 \
+    --gradient_accumulation_steps 3 \

```
+Note that if GPU memory cannot hold the given `batch_size`, you can pass the `gradient_accumulation_steps` argument to simulate that `batch_size`.
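The accumulation trick is simple to picture. A minimal sketch of the idea as a hypothetical Paddle training loop (not the benchmark script itself; `model`, `loss_fct`, `optimizer`, and `data_loader` are assumed to exist):

```python
import paddle

def train_with_accumulation(model, loss_fct, optimizer, data_loader,
                            accumulation_steps=3):
    # Run `accumulation_steps` small forward/backward passes before each
    # optimizer step, so the update approximates a batch that is
    # `accumulation_steps` times larger than what fits in GPU memory.
    model.train()
    for step, (input_ids, segment_ids, labels) in enumerate(data_loader):
        logits = model(input_ids=input_ids, token_type_ids=segment_ids)
        # Scale the loss so the accumulated gradient matches the mean
        # gradient of the larger simulated batch.
        loss = loss_fct(logits, labels) / accumulation_steps
        loss.backward()  # gradients accumulate across backward() calls
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.clear_grad()
```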

## Participate in the CLUE competition

examples/benchmark/clue/mrc/run_chid.py

Lines changed: 70 additions & 26 deletions
@@ -24,11 +24,9 @@
import numpy as np

import paddle
-from paddle.metric import Accuracy
import paddle.nn as nn

from datasets import load_dataset
-
from paddlenlp.data import Pad, Stack, Tuple, Dict
from paddlenlp.transformers import AutoModelForMultipleChoice, AutoTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
@@ -140,19 +138,64 @@ def set_seed(args):
    paddle.seed(args.seed)


+def calc_global_pred_results(logits):
+    logits = np.array(logits)
+    # [num_choices, tag_size]
+    logits = np.transpose(logits)
+    tmp = []
+    for i, row in enumerate(logits):
+        for j, col in enumerate(row):
+            tmp.append((i, j, col))
+    else:
+        choice = set(range(i + 1))
+        blanks = set(range(j + 1))
+    tmp = sorted(tmp, key=lambda x: x[2], reverse=True)
+    results = []
+    for i, j, v in tmp:
+        if (j in blanks) and (i in choice):
+            results.append((i, j))
+            blanks.remove(j)
+            choice.remove(i)
+    results = sorted(results, key=lambda x: x[1], reverse=False)
+    results = [i for i, j in results]
+    return results
+
+
@paddle.no_grad()
-def evaluate(model, loss_fct, metric, data_loader):
+def evaluate(model, data_loader, do_predict=False):
    model.eval()
-    metric.reset()
+    right_num, total_num = 0, 0
+    all_results = []
    for step, batch in enumerate(data_loader):
-        input_ids, segment_ids, labels = batch
+        if do_predict:
+            input_ids, segment_ids, example_ids = batch
+        else:
+            input_ids, segment_ids, labels, example_ids = batch
        logits = model(input_ids=input_ids, token_type_ids=segment_ids)
-        loss = loss_fct(logits, labels)
-        correct = metric.compute(logits, labels)
-        metric.update(correct)
-        res = metric.accumulate()
+        batch_num = example_ids.shape[0]
+        l = 0
+        r = batch_num - 1
+        batch_results = []
+        for i in range(batch_num - 1):
+            if example_ids[i] != example_ids[i + 1]:
+                r = i
+                batch_results.extend(
+                    calc_global_pred_results(logits[l:r + 1, :]))
+                l = i + 1
+        if l <= batch_num - 1:
+            batch_results.extend(
+                calc_global_pred_results(logits[l:batch_num, :]))
+        if do_predict:
+            all_results.extend(batch_results)
+        else:
+            right_num += np.sum(np.array(batch_results) == labels.numpy())
+            total_num += labels.shape[0]
    model.train()
-    return res
+    if not do_predict:
+        acc = right_num / total_num
+        print("acc", right_num, total_num, acc)
+        return acc
+    return all_results


def run(args):
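The heart of this commit: rather than scoring each blank independently via `paddle.metric.Accuracy`, `evaluate` now groups consecutive rows that share an `example_id` (all blanks of one CHID passage) and decodes them jointly with `calc_global_pred_results`, which sorts every (candidate, blank) score and greedily assigns each idiom candidate to at most one blank. A hypothetical mini-example with made-up scores:

```python
import numpy as np

# One passage with 2 blanks and 3 shared idiom candidates. Rows are
# blanks, columns are candidate scores -- the same slice of `logits`
# that evaluate() passes in. The numbers are invented.
scores = np.array([
    [0.90, 0.75, 0.10],   # blank 0 prefers candidate 0
    [0.80, 0.30, 0.20],   # blank 1 also prefers candidate 0
])

# A per-blank argmax reuses candidate 0 for both blanks:
print(np.argmax(scores, axis=1).tolist())    # [0, 0]

# Global decoding assigns candidate 0 to blank 0 (0.90 is the highest
# score overall) and the best remaining candidate to blank 1:
print(calc_global_pred_results(scores))      # [0, 1]
```

Since the blanks within a CHID passage draw on a shared candidate list, enforcing a one-to-one assignment uses information that a per-blank argmax throws away.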
@@ -242,9 +285,14 @@ def add_tokens_for_around(tokens, pos, num_tokens):
    num_tokens = max_tokens_for_doc - 5
    num_examples = len(examples.data["candidates"])
    if do_predict:
-        result = {"input_ids": [], "token_type_ids": []}
+        result = {"input_ids": [], "token_type_ids": [], "example_ids": []}
    else:
-        result = {"input_ids": [], "token_type_ids": [], "labels": []}
+        result = {
+            "input_ids": [],
+            "token_type_ids": [],
+            "labels": [],
+            "example_ids": []
+        }
    for idx in range(num_examples):
        candidate = 0
        options = examples.data['candidates'][idx]
@@ -316,6 +364,7 @@ def add_tokens_for_around(tokens, pos, num_tokens):
        # Final shape of input_ids: [batch_size, num_choices, seq_len]
        result["input_ids"].append(new_data["input_ids"])
        result["token_type_ids"].append(new_data["token_type_ids"])
+        result["example_ids"].append(idx)
        if not do_predict:
            label = examples.data["answers"][idx]["candidate_id"][
                candidate]
@@ -350,7 +399,8 @@ def add_tokens_for_around(tokens, pos, num_tokens):
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
-        'labels': Stack(dtype="int64")  # label
+        'labels': Stack(dtype="int64"),  # label
+        'example_ids': Stack(dtype="int64"),  # example id
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
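For reference, the `Dict`/`Pad`/`Stack` collators above batch dict-style samples into a tuple of arrays in key order; `Pad(axis=1, ...)` pads each sample's `[num_choices, seq_len]` matrix along the sequence axis. A rough, hypothetical illustration (tiny invented samples, pad ids hard-coded to 0):

```python
from paddlenlp.data import Dict, Pad, Stack

samples = [
    {"input_ids": [[1, 2, 3], [4, 5, 6]],     # 2 choices, seq_len 3
     "token_type_ids": [[0, 0, 0], [0, 0, 0]],
     "labels": 1, "example_ids": 0},
    {"input_ids": [[7, 8], [9, 10]],          # 2 choices, seq_len 2
     "token_type_ids": [[0, 0], [0, 0]],
     "labels": 0, "example_ids": 1},
]

batchify_fn = Dict({
    "input_ids": Pad(axis=1, pad_val=0),      # pad seq_len up to 3
    "token_type_ids": Pad(axis=1, pad_val=0),
    "labels": Stack(dtype="int64"),
    "example_ids": Stack(dtype="int64"),
})

input_ids, token_type_ids, labels, example_ids = batchify_fn(samples)
print(input_ids.shape)       # (2, 2, 3): [batch_size, num_choices, seq_len]
print(example_ids.tolist())  # [0, 1] -- lets evaluate() find passage bounds
```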
@@ -397,15 +447,14 @@ def add_tokens_for_around(tokens, pos, num_tokens):
        grad_clip=grad_clip)

    loss_fct = nn.CrossEntropyLoss()
-    metric = Accuracy()

    model.train()
    global_step = 0
    best_acc = 0.0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
-            input_ids, segment_ids, labels = batch
+            input_ids, segment_ids, labels, example_ids = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = loss_fct(logits, labels)
            if args.gradient_accumulation_steps > 1:
@@ -424,7 +473,7 @@ def add_tokens_for_around(tokens, pos, num_tokens):
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
                tic_eval = time.time()
-                acc = evaluate(model, loss_fct, metric, dev_data_loader)
+                acc = evaluate(model, dev_data_loader)
                print("eval acc: %.5f, eval done total : %s s" %
                      (acc, time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0 and acc > best_acc:
@@ -445,13 +494,13 @@ def add_tokens_for_around(tokens, pos, num_tokens):
            batch_size=len(test_ds),
            remove_columns=column_names,
            num_proc=1)
-
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)

        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
+            'example_ids': Stack(dtype="int64"),  # example id
        }): fn(samples)

        test_data_loader = paddle.io.DataLoader(
@@ -462,15 +511,10 @@ def add_tokens_for_around(tokens, pos, num_tokens):

        result = {}
        idx = 623377
-        for step, batch in enumerate(test_data_loader):
-            input_ids, segment_ids = batch
-            with paddle.no_grad():
-                logits = model(input_ids, segment_ids)
-            preds = paddle.argmax(logits, axis=1).numpy().tolist()
-            for pred in preds:
-                result["#idiom" + str(idx)] = pred
-                idx += 1
-
+        preds = evaluate(model, test_data_loader, do_predict=True)
+        for pred in preds:
+            result["#idiom" + str(idx)] = pred
+            idx += 1
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(
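The prediction branch now reuses `evaluate(..., do_predict=True)` instead of the hand-rolled argmax loop, then writes the CHID submission mapping keyed by `#idiom` ids counting up from 623377. Roughly, with invented prediction values:

```python
# Hypothetical predictions for the first three test blanks.
preds = [3, 0, 7]

result = {}
idx = 623377  # id of the first CHID test blank, as in the script
for pred in preds:
    result["#idiom" + str(idx)] = pred
    idx += 1
# result == {'#idiom623377': 3, '#idiom623378': 0, '#idiom623379': 7}
```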

examples/benchmark/clue/mrc/run_cmrc.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def parse_args():
    parser.add_argument(
        "--max_query_length", type=int, default=64, help="Max query length.")
    parser.add_argument(
-        "--max_answer_length", type=int, default=30, help="Max answer length.")
+        "--max_answer_length", type=int, default=50, help="Max answer length.")
    parser.add_argument(
        "--do_lower_case",
        action='store_false',
@@ -184,12 +184,15 @@ def evaluate(model, raw_dataset, data_loader, args, do_eval=True):
        raw_dataset, data_loader.dataset, (all_start_logits, all_end_logits),
        False, args.n_best_size, args.max_answer_length)

-    # Can also write all_nbest_json and scores_diff_json files if needed
-    if args.do_predict:
-        with open('cmrc2018_predict.json', "w", encoding='utf-8') as writer:
-            writer.write(
-                json.dumps(
-                    all_predictions, ensure_ascii=False, indent=4) + "\n")
+    mode = 'validation' if do_eval else 'test'
+    if do_eval:
+        filename = 'prediction_validation.json'
+    else:
+        filename = 'cmrc2018_predict.json'
+    with open(filename, "w", encoding='utf-8') as writer:
+        writer.write(
+            json.dumps(
+                all_predictions, ensure_ascii=False, indent=4) + "\n")
    if do_eval:
        squad_evaluate(
            examples=[raw_data for raw_data in raw_dataset],
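For context, `squad_evaluate` produces the EM/F1 pair reported in the README's CMRC2018 cell. A simplified, hypothetical sketch of the two metrics (single reference, character-level F1, no normalization; helper names are invented and this is not the library's implementation):

```python
def exact_match(pred: str, gold: str) -> float:
    # EM: 1 if the prediction equals the reference exactly.
    return float(pred.strip() == gold.strip())

def char_f1(pred: str, gold: str) -> float:
    # F1 over the multiset intersection of characters, which suits
    # Chinese answers where there is no whitespace tokenization.
    gold_chars = list(gold)
    common = 0
    for ch in pred:
        if ch in gold_chars:
            gold_chars.remove(ch)
            common += 1
    if common == 0:
        return 0.0
    precision = common / len(pred)
    recall = common / len(gold)
    return 2 * precision * recall / (precision + recall)

print(exact_match("北京大学", "北京大学"))          # 1.0
print(round(char_f1("北京大学堂", "北京大学"), 3))  # 0.889
```

Raising `max_answer_length` from 30 to 50 lets the n-best decoding keep longer spans, which mainly affects recall on long CMRC2018 answers.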
