Merge pull request PaddlePaddle#1105 from leeyy2020/few_shot_rdrop

tianxin · web-flow · commit 73933974ea9e · 2021-10-08T15:19:28.000+08:00
add R-Drop for few-shot learning
diff --git a/examples/few_shot/README.md b/examples/few_shot/README.md
@@ -11,9 +11,12 @@ Few-Shot Learning 旨在研究如何从少量有监督的训练样本中学习
 
 | 算法 | 预训练模型  | Score  | eprstmt  | bustm  | ocnli  | csldcp  | tnews  |  cluewsc | iflytek | csl | chid |
 | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |------------ | ------------ | ---------- |
-| P-tuning  | ERNIE1.0  | 55.70 | 83.28  | 63.43  | 35.36  | 60.54  | 50.02  | 54.51  | 50.14 | 54.93 | 41.16 |
-| EFL       | ERNIE1.0  | 54.47 | 84.10  | 60.10  | 35.12  | 56.61  | 56.57  | 53.59  | 46.37 | 61.21 | 36.56 |
-| PET       | ERNIE1.0  | 56.63 | 86.88  | 61.90  | 36.90  | 61.10  | 56.51  | 55.02  | 50.31 | 59.72 | 41.35 |
+| P-tuning  | ERNIE-1.0  | 55.70 | 83.28  | 63.43  | 35.36  | 60.54  | 50.02  | 54.51  | 50.14 | 54.93 | 41.16 |
+| P-tuning+R-Drop  | ERNIE-1.0  | 56.23 | 83.11  | 64.56  | 35.71  | 61.88  | 57.51  | 54  | 52 | 56.3 | 41 |
+| EFL       | ERNIE-1.0  | 54.47 | 84.10  | 60.10  | 35.12  | 56.61  | 56.57  | 53.59  | 46.37 | 61.21 | 36.56 |
+| EFL+R-Drop       | ERNIE-1.0  | 56.94 | 87 | 62.75  | 37.54  | 53.98 | 56.77  | 56.87  | 48.54 | 62.19 | 46.85 |
+| PET       | ERNIE-1.0  | 56.63 | 86.88  | 61.90  | 36.90  | 61.10  | 56.51  | 55.02  | 50.31 | 59.72 | 41.35 |
+| PET+R-Drop   | ERNIE-1.0  | 57.37 | 87.54  | 63.66  | 36.46  | 62.5  | 58.91  | 56.25  | 53.46 | 57.22 | 40.31 |
 
 ## Models
 - [P-tuning](./p-tuning)
diff --git a/examples/few_shot/efl/README.md b/examples/few_shot/efl/README.md
@@ -39,14 +39,16 @@ python -u -m paddle.distributed.launch --gpus "0" \
     --batch_size 32 \
     --learning_rate 5E-5 \
     --epochs 10 \
-    --max_seq_length 512
+    --max_seq_length 512 \
+    --rdrop_coef 0
 ```
 参数含义说明
 - `task_name`: FewCLUE 中的数据集名字
 - `negative_num`:  负样本采样个数，对于多分类任务，负样本数量对效果影响很大。负样本数量参数取值范围为 [1, class_num - 1]
 - `device`: 使用 cpu/gpu 进行训练
 - `save_dir`: 模型存储路径
 - `max_seq_length`: 文本的最大截断长度
+- `rdrop_coef`: R-Drop 策略 Loss 的权重系数，默认为 0；若为 0，则不使用 R-Drop 策略
 
 模型每训练 1 个 epoch,  会在验证集上进行评估，并针对测试集进行预测存储到预测结果文件。
 
diff --git a/examples/few_shot/efl/train.py b/examples/few_shot/efl/train.py
@@ -106,6 +106,12 @@ def parse_args():
         type=int,
         default=100000,
         help="Inteval steps to save checkpoint")
+    parser.add_argument(
+        "--rdrop_coef", 
+        default=0.0, 
+        type=float, 
+        help="The coefficient of KL-Divergence loss in R-Drop paper, for more detail please refer to https://arxiv.org/abs/2106.14448), if rdrop_coef > 0 then R-Drop works")
+
     return parser.parse_args()
 
 
@@ -210,7 +216,7 @@ def do_train():
         apply_decay_param_fun=lambda x: x in decay_params)
 
     criterion = paddle.nn.loss.CrossEntropyLoss()
-
+    rdrop_loss = ppnlp.losses.RDropLoss()
     global_step = 0
     tic_train = time.time()
     for epoch in range(1, args.epochs + 1):
@@ -222,7 +228,14 @@ def do_train():
             prediction_scores = model(
                 input_ids=src_ids, token_type_ids=token_type_ids)
 
-            loss = criterion(prediction_scores, labels)
+            if args.rdrop_coef > 0:
+                prediction_scores_2 = model(
+                    input_ids=src_ids, token_type_ids=token_type_ids)
+                ce_loss = (criterion(prediction_scores, labels) + criterion(prediction_scores_2, labels)) * 0.5
+                kl_loss = rdrop_loss(prediction_scores, prediction_scores_2)
+                loss = ce_loss + kl_loss * args.rdrop_coef
+            else:
+                loss = criterion(prediction_scores, labels)
 
             global_step += 1
             if global_step % 10 == 0 and rank == 0:
diff --git a/examples/few_shot/p-tuning/README.md b/examples/few_shot/p-tuning/README.md
@@ -36,14 +36,16 @@ python -u -m paddle.distributed.launch --gpus "0" \
     --batch_size 32 \
     --learning_rate 5E-5 \
     --epochs 10 \
-    --max_seq_length 512
+    --max_seq_length 512 \
+    --rdrop_coef 0
 ```
 参数含义说明
 - `task_name`: FewCLUE 中的数据集名字
 - `p_embedding_num`: P-embedding 的个数
 - `device`: 使用 cpu/gpu 进行训练
 - `save_dir`: 模型存储路径
 - `max_seq_length`: 文本的最大截断长度
+- `rdrop_coef`: R-Drop 策略 Loss 的权重系数，默认为 0；若为 0，则不使用 R-Drop 策略
 
 模型每训练 1 个 epoch, 会在验证集和测试集上进行评估。
 
diff --git a/examples/few_shot/p-tuning/ptuning.py b/examples/few_shot/p-tuning/ptuning.py
@@ -51,6 +51,7 @@
 parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
 parser.add_argument('--save_steps', type=int, default=10000, help="Inteval steps to save checkpoint")
+parser.add_argument("--rdrop_coef", default=0.0, type=float, help="The coefficient of KL-Divergence loss in R-Drop paper, for more detail please refer to https://arxiv.org/abs/2106.14448), if rdrop_coef > 0 then R-Drop works")
 
 args = parser.parse_args()
 # yapf: enable
@@ -153,6 +154,7 @@ def do_train():
         print("warmup from:{}".format(args.init_from_ckpt))
 
     mlm_loss_fn = ErnieMLMCriterion()
+    rdrop_loss = ppnlp.losses.RDropLoss()
 
     num_training_steps = len(train_data_loader) * args.epochs
 
@@ -187,7 +189,16 @@ def do_train():
                 token_type_ids=token_type_ids,
                 masked_positions=masked_positions)
 
-            loss = mlm_loss_fn(prediction_scores, masked_lm_labels)
+            if args.rdrop_coef > 0:
+                prediction_scores_2 = model(
+                    input_ids=src_ids,
+                    token_type_ids=token_type_ids,
+                    masked_positions=masked_positions)
+                ce_loss = (mlm_loss_fn(prediction_scores, masked_lm_labels) + mlm_loss_fn(prediction_scores_2, masked_lm_labels)) * 0.5
+                kl_loss = rdrop_loss(prediction_scores, prediction_scores_2)
+                loss = ce_loss + kl_loss * args.rdrop_coef
+            else:
+                loss = mlm_loss_fn(prediction_scores, masked_lm_labels)
 
             global_step += 1
             if global_step % 10 == 0 and rank == 0:
diff --git a/examples/few_shot/pet/README.md b/examples/few_shot/pet/README.md
@@ -39,13 +39,15 @@ python -u -m paddle.distributed.launch --gpus "0" \
 	--epochs 10 \
 	--max_seq_length 512 \
 	--language_model "ernie-1.0" \
+    --rdrop_coef 0
 ```
 参数含义说明
 - `task_name`: FewCLUE 中的数据集名字
 - `device`: 使用 cpu/gpu 进行训练
 - `pattern_id` 完形填空的模式
 - `save_dir`: 模型存储路径
 - `max_seq_length`: 文本的最大截断长度
+- `rdrop_coef`: R-Drop 策略 Loss 的权重系数，默认为 0；若为 0，则不使用 R-Drop 策略
 
 模型每训练 1 个 epoch,  会在验证集上进行评估
 
diff --git a/examples/few_shot/pet/pet.py b/examples/few_shot/pet/pet.py
@@ -167,7 +167,7 @@ def do_train(args):
         print("warmup from:{}".format(args.init_from_ckpt))
 
     mlm_loss_fn = ErnieMLMCriterion()
-    cross_loss_fn = paddle.nn.CrossEntropyLoss()
+    rdrop_loss = ppnlp.losses.RDropLoss()
     max_test_acc = 0.0
     global_step = 0
     tic_train = time.time()
@@ -195,7 +195,16 @@ def do_train(args):
                 input_ids=src_ids,
                 token_type_ids=token_type_ids,
                 masked_positions=new_masked_positions)
-            loss = mlm_loss_fn(prediction_scores, masked_lm_labels)
+            if args.rdrop_coef > 0:
+                prediction_scores_2 = model(
+                    input_ids=src_ids,
+                    token_type_ids=token_type_ids,
+                    masked_positions=new_masked_positions)
+                ce_loss = (mlm_loss_fn(prediction_scores, masked_lm_labels) + mlm_loss_fn(prediction_scores_2, masked_lm_labels)) * 0.5
+                kl_loss = rdrop_loss(prediction_scores, prediction_scores_2)
+                loss = ce_loss + kl_loss * args.rdrop_coef
+            else:
+                loss = mlm_loss_fn(prediction_scores, masked_lm_labels)
 
             global_step += 1
             if global_step % 10 == 0 and rank == 0:
@@ -307,5 +316,11 @@ def do_train(args):
         default='ernie-1.0',
         choices=['ernie-1.0'],
         help="Language model")
+    parser.add_argument(
+        "--rdrop_coef", 
+        default=0.0, 
+        type=float, 
+        help="The coefficient of KL-Divergence loss in R-Drop paper, for more detail please refer to https://arxiv.org/abs/2106.14448), if rdrop_coef > 0 then R-Drop works")
+
     args = parser.parse_args()
     do_train(args)