@@ -29,8 +29,6 @@
 parser.add_argument('--warmup_proportion', default=0.1, type=float, help='Linear warmup proportion over the training process.')
 parser.add_argument('--use_amp', default=False, type=bool, help='Enable mixed precision training.')
 parser.add_argument('--epochs', default=1, type=int, help='Total number of training epochs.')
-parser.add_argument('--eval_mention', default=True, type=bool, help='.')
-parser.add_argument('--update_tokenizer', default=True, type=bool, help='Update the word tokenizer during training.')
 parser.add_argument('--seed', default=1000, type=int, help='Random seed.')
 parser.add_argument('--save_dir', default='./checkpoint', type=str, help='The output directory where the model checkpoints will be written.')
 
@@ -105,7 +103,7 @@ def do_train():
     'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
     'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),
     'position_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
-    'mask': Pad(axis=0, pad_val=0, dtype='float32'),
+    'attention_mask': Pad(axis=0, pad_val=0, dtype='float32'),
     'label_oth': Pad(axis=0, pad_val=pad_label_id[0], dtype='int64'),
     'label_sym': Pad(axis=0, pad_val=pad_label_id[1], dtype='int64')
 }): fn(samples)
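For context on the renamed field: the dict above is the per-field collation table of a PaddleNLP batchify function (the surrounding `lambda samples, fn=Dict({...}): fn(samples)` wrapper is only partly visible in the hunk). A minimal sketch, not taken from this PR, of how `paddlenlp.data.Dict`/`Pad` pad each field to the longest sequence in a batch; the sample values and `pad_token_id` below are made up:

```python
# Hypothetical, self-contained sketch; field names follow the hunk above.
from paddlenlp.data import Dict, Pad

pad_token_id = 0  # stand-in for tokenizer.pad_token_id

batchify_fn = lambda samples, fn=Dict({
    'input_ids': Pad(axis=0, pad_val=pad_token_id, dtype='int64'),
    'attention_mask': Pad(axis=0, pad_val=0, dtype='float32'),
}): fn(samples)

samples = [
    {'input_ids': [101, 2023, 102], 'attention_mask': [1, 1, 1]},
    {'input_ids': [101, 102], 'attention_mask': [1, 1]},
]
input_ids, attention_mask = batchify_fn(samples)
# input_ids      -> (2, 3) int64 array, second row padded with pad_token_id
# attention_mask -> (2, 3) float32 array, second row padded with 0
```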
@@ -164,8 +162,6 @@ def do_train():
         with paddle.amp.auto_cast(
                 args.use_amp,
                 custom_white_list=['layer_norm', 'softmax', 'gelu'], ):
-            att_mask = paddle.unsqueeze(masks, axis=2)
-            att_mask = paddle.matmul(att_mask, att_mask, transpose_y=True)
             logits = model(input_ids, token_type_ids, position_ids, masks)
 
             loss_oth = criterion(logits[0], paddle.unsqueeze(label_oth, 2))
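On the second change: the two deleted lines expanded the 1-D padding mask into a full attention matrix before the forward pass; after this diff the padded `attention_mask` produced by the collation above is handed to the model unchanged. A small sketch of what the removed expansion computed (the toy mask and shapes are assumptions, not taken from the PR):

```python
import paddle

# Toy padded mask: 1 marks real tokens, 0 marks padding -> shape [batch, seq_len]
masks = paddle.to_tensor([[1., 1., 1., 0.],
                          [1., 1., 0., 0.]])

att_mask = paddle.unsqueeze(masks, axis=2)                      # [batch, seq_len, 1]
att_mask = paddle.matmul(att_mask, att_mask, transpose_y=True)  # [batch, seq_len, seq_len]
# att_mask[b, i, j] == 1 only when positions i and j are both real tokens;
# the PR drops this outer product and lets the model consume `masks` directly.
```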