@@ -80,19 +80,12 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    src_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]

-    # # Step2: gen p_token_ids
-    # p_tokens = ["[unused{}]".format(i) for i in range(p_embedding_num)]
-    # p_token_ids = tokenizer.convert_tokens_to_ids(p_tokens)
-
-    # Step3: Insert "[MASK]" to src_ids based on start_mask_position
+    # Step2: Insert "[MASK]" to src_ids based on start_mask_position
    src_ids = src_ids[0:start_mask_position] + mask_ids + src_ids[
        start_mask_position:]
    token_type_ids = token_type_ids[0:start_mask_position] + [0] * len(
        mask_ids) + token_type_ids[start_mask_position:]

-    # Stpe4: Insert P-tokens at begin of sentence
-    # src_ids = p_token_ids + src_ids
-
    # calculate mask_positions
    mask_positions = [
        index + start_mask_position for index in range(label_length)
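
The surviving Step2 is the heart of the p-tuning encoding: a run of `[MASK]` token ids the length of the label is spliced into the encoded sentence at `start_mask_position`, and `mask_positions` records where those ids landed so the MLM head can be read out at exactly those offsets. The same splice recurs in `convert_chid_example` below. A minimal self-contained sketch with invented token ids (103 standing in for `[MASK]`):

    # Toy version of the Step2 splice; all ids are made up.
    src_ids = [101, 2769, 4263, 102]   # [CLS] tok tok [SEP]
    mask_ids = [103, 103]              # label_length == 2, 103 ~ [MASK]
    start_mask_position = 1            # right after [CLS]

    src_ids = src_ids[0:start_mask_position] + mask_ids + src_ids[start_mask_position:]
    mask_positions = [i + start_mask_position for i in range(len(mask_ids))]

    assert src_ids == [101, 103, 103, 2769, 4263, 102]
    assert mask_positions == [1, 2]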
@@ -143,129 +136,6 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    return src_ids, token_type_ids, mask_positions, mask_lm_labels


-def convert_cluewsc_example(example,
-                            tokenizer,
-                            max_seq_length=512,
-                            is_test=False):
-    """
-    Args:
-        example(obj:`list(str)`): The list of text to be converted to ids.
-        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
-            which contains most of the methods. Users should refer to the superclass for more information regarding methods.
-        max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
-            Sequences longer than this will be truncated, sequences shorter will be padded.
-        p_embedding_num(obj:`int`) The number of p-embedding.
-    Returns:
-        input_ids(obj:`list[int]`): The list of query token ids.
-        token_type_ids(obj: `list[int]`): List of query sequence pair mask.
-        mask_positions(obj: `list[int]`): The list of mask_positions.
-        mask_lm_labels(obj: `list[int]`): The list of mask_lm_labels.
-    """
-
-    # Replace <unk> with '[MASK]'
-
-    # Step1: gen mask ids
-    if is_test:
-        label_length = example["label_length"]
-    else:
-        text_label = example["text_label"]
-        label_length = len(text_label)
-
-    mask_tokens = ["[MASK]"] * label_length
-    mask_ids = tokenizer.convert_tokens_to_ids(mask_tokens)
-
-    sentence1 = example["sentence1"]
-    if "<unk>" in sentence1:
-        start_mask_position = sentence1.index("<unk>") + 1
-        sentence1 = sentence1.replace("<unk>", "")
-        encoded_inputs = tokenizer(text=sentence1, max_seq_len=max_seq_length)
-        src_ids = encoded_inputs["input_ids"]
-        token_type_ids = encoded_inputs["token_type_ids"]
-
-        # # Step2: gen p_token_ids
-        # p_tokens = ["[unused{}]".format(i) for i in range(p_embedding_num)]
-        # p_token_ids = tokenizer.convert_tokens_to_ids(p_tokens)
-
-        # Step3: Insert "[MASK]" to src_ids based on start_mask_position
-        src_ids = src_ids[0:start_mask_position] + mask_ids + src_ids[
-            start_mask_position:]
-        token_type_ids = token_type_ids[0:start_mask_position] + [0] * len(
-            mask_ids) + token_type_ids[start_mask_position:]
-
-        # Stpe4: Insert P-tokens at begin of sentence
-        # src_ids = p_token_ids + src_ids
-
-        # calculate mask_positions
-        mask_positions = [
-            index + start_mask_position for index in range(label_length)
-        ]
-    else:
-        sentence2 = example['sentence2']
-        start_mask_position = sentence2.index("<unk>") + 1
-        sentence2 = sentence2.replace("<unk>", "")
-
-        encoded_inputs = tokenizer(text=sentence2, max_seq_len=max_seq_length)
-        src_ids = encoded_inputs["input_ids"]
-        token_type_ids = encoded_inputs["token_type_ids"]
-        src_ids = src_ids[0:start_mask_position] + mask_ids + src_ids[
-            start_mask_position:]
-        token_type_ids = token_type_ids[0:start_mask_position] + [0] * len(
-            mask_ids) + token_type_ids[start_mask_position:]
-
-        encoded_inputs = tokenizer(text=sentence1, max_seq_len=max_seq_length)
-        sentence1_src_ids = encoded_inputs["input_ids"][1:]
-        src_ids = sentence1_src_ids + src_ids
-        token_type_ids += [1] * len(src_ids)
-        mask_positions = [
-            index + start_mask_position + len(sentence1)
-            for index in range(label_length)
-        ]
-
-    token_type_ids = [0] * len(src_ids)
-
-    assert len(src_ids) == len(
-        token_type_ids), "length src_ids, token_type_ids must be equal"
-
-    length = len(src_ids)
-    if length > 512:
-        src_ids = src_ids[:512]
-        token_type_ids = token_type_ids[:512]
-
-    if is_test:
-        import jieba.posseg as pseg
-        judge = 0
-
-        def isname(single_word_string):
-            pair_word_list = pseg.lcut(single_word_string)
-            for eve_word, cixing in pair_word_list:
-                if cixing == "nr":
-                    return True
-            return False
-
-        text_ori = example["target"]["span1_text"]
-        text_daici = example["target"]["span2_text"]
-        if isname(text_ori) and text_daici == "它":
-            judge = 1
-        if ("妈" in text_ori or "姨" in text_ori or "婆" in text_ori or
-                "太太" in text_ori or "妻" in text_ori or "姐" in text_ori or
-                "妹" in text_ori) and ("他" in text_daici):
-            judge = 1
-        if ("爸" in text_ori or "叔" in text_ori or "公" in text_ori or
-                "夫" in text_ori or "哥" in text_ori or
-                "弟" in text_ori) and ("她" in text_daici):
-            judge = 1
-        # print(paddle.to_tensor(judge, dtype="int64"))
-        return src_ids, token_type_ids, mask_positions, judge
-    else:
-        mask_lm_labels = tokenizer(
-            text=text_label, max_seq_len=max_seq_length)["input_ids"][1:-1]
-        assert len(mask_lm_labels) == len(
-            mask_positions
-        ) == label_length, "length of mask_lm_labels:{} mask_positions:{} label_length:{} not equal".format(
-            mask_lm_labels, mask_positions, text_label)
-        return src_ids, token_type_ids, mask_positions, mask_lm_labels
-
-
def convert_chid_example(example, tokenizer, max_seq_length=512, is_test=False):
    """
    Args:
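
The deleted `convert_cluewsc_example` also carried a test-time heuristic worth noting: it called jieba's part-of-speech tagger and treated `span1_text` as a person name whenever any segment got the POS flag "nr", then compared that against the pronoun in `span2_text`. A standalone sketch of that check (requires the `jieba` package; the sample strings are invented):

    import jieba.posseg as pseg

    def isname(single_word_string):
        # jieba tags person names with the POS flag "nr"
        for word, flag in pseg.lcut(single_word_string):
            if flag == "nr":
                return True
        return False

    print(isname("李小明"))  # likely True: segmented as a person name
    print(isname("桌子"))    # False: a common noun ("table")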
@@ -283,17 +153,12 @@ def convert_chid_example(example, tokenizer, max_seq_length=512, is_test=False):
        mask_lm_labels(obj: `list[int]`): The list of mask_lm_labels.
    """
    # FewClue Task `Chid`'s label position must be calculated via the special token "淠"
-    # FewClue Task `Chid`' label's position must be calculated by special token: "龜"

    seg_tokens = tokenizer.tokenize(example["sentence1"])

    # find insert position of `[MASK]`
-    # start_mask_position = seg_tokens.index("淠") + 1
-    # seg_tokens.remove("淠")
-    # start_mask_position = seg_tokens.index("龜") + 1
-    # seg_tokens.remove("龜")
-    start_mask_position = seg_tokens.index("[UNK]") + 1
-    seg_tokens.remove("[UNK]")
+    start_mask_position = seg_tokens.index("淠") + 1
+    seg_tokens.remove("淠")

    sentence1 = "".join(seg_tokens)
    candidates = example["candidates"]
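
The sentinel lookup above works because `#idiom#` in the raw `chid` text has already been swapped for the rarely-used character "淠" (see `transform_chid` below), which the ERNIE tokenizer keeps as a single token; its index therefore pinpoints where the `[MASK]` run must go, and the `+ 1` plausibly offsets for the `[CLS]` token prepended when the cleaned sentence is re-encoded. A self-contained sketch, using a character-level stand-in for the real wordpiece tokenizer:

    def char_tokenize(text):
        # stand-in for tokenizer.tokenize(); any tokenizer works here as long
        # as the sentinel "淠" comes out as exactly one token
        return list(text)

    content = "他的成绩#idiom#，大家都很佩服。"       # invented sample
    sentence1 = content.replace("#idiom#", "淠")      # what transform_chid does
    seg_tokens = char_tokenize(sentence1)
    start_mask_position = seg_tokens.index("淠") + 1  # +1 for [CLS] added at encoding time
    seg_tokens.remove("淠")
    sentence1 = "".join(seg_tokens)                   # sentinel stripped, position recorded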
@@ -317,19 +182,12 @@ def convert_chid_example(example, tokenizer, max_seq_length=512, is_test=False):
    mask_tokens = ["[MASK]"] * label_length
    mask_ids = tokenizer.convert_tokens_to_ids(mask_tokens)

-    # Step2: gen p_token_ids
-    # p_tokens = ["[unused{}]".format(i) for i in range(p_embedding_num)]
-    # p_token_ids = tokenizer.convert_tokens_to_ids(p_tokens)
-
-    # Step3: Insert "[MASK]" to src_ids based on start_mask_position
+    # Step2: Insert "[MASK]" to src_ids based on start_mask_position
    src_ids = src_ids[0:start_mask_position] + mask_ids + src_ids[
        start_mask_position:]
    token_type_ids = token_type_ids[0:start_mask_position] + [0] * len(
        mask_ids) + token_type_ids[start_mask_position:]

-    # Stpe4: Insert P-tokens at begin of sentence
-    # src_ids = p_token_ids + src_ids
-
    # calculate mask_positions
    mask_positions = [
        index + start_mask_position for index in range(label_length)
@@ -577,12 +435,6 @@ def transform_csldcp(example,

        if pattern_id == 0:
            example["sentence1"] = u'这篇关于<unk>的文章讲了' + example["content"]
-        # elif pattern_id == 1:
-        #     example["sentence1"] = example["content"] + u'这是一篇关于<unk>的文章'
-        # elif pattern_id == 2:
-        #     example["sentence1"] = example["content"] + u'这是和<unk>有关的文章'
-        # elif pattern_id == 3:
-        #     example["sentence1"] = example["content"] + u'这些与<unk>有关'
        elif pattern_id == 1:
            example["sentence1"] = example["content"] + u'和<unk>息息相关'
        elif pattern_id == 2:
@@ -599,12 +451,6 @@ def transform_csldcp(example,
        example['text_label'] = normalized_label
        if pattern_id == 0:
            example["sentence1"] = u'这篇关于<unk>的文章讲了' + example["content"]
-        # elif pattern_id == 1:
-        #     example["sentence1"] = example["content"] + u'这是一篇关于<unk>的文章'
-        # elif pattern_id == 2:
-        #     example["sentence1"] = example["content"] + u'这是和<unk>有关的文章'
-        # elif pattern_id == 3:
-        #     example["sentence1"] = example["content"] + u'这些与<unk>有关'
        elif pattern_id == 1:
            example["sentence1"] = example["content"] + u'和<unk>息息相关'
        elif pattern_id == 2:
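
Both branches build the prompt the same way: `pattern_id` picks a Chinese template, and `<unk>` marks the slot where the normalized discipline label will later be masked out by `convert_example`. A small illustration with an invented abstract:

    content = "本文研究了图神经网络在推荐系统中的应用。"  # made-up sample

    pattern_0 = u'这篇关于<unk>的文章讲了' + content
    pattern_1 = content + u'和<unk>息息相关'

    # "<unk>" is the anchor convert_example locates via index("<unk>") + 1
    # and replaces with a run of [MASK] ids the length of the label
    print(pattern_0)
    print(pattern_1)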
@@ -661,9 +507,7 @@ def transform_chid(example,

    if is_test:
        example["label_length"] = 4
-        # example["sentence1"] = example["content"].replace("#idiom#", "淠")
-        # example["sentence1"] = example["content"].replace("#idiom#", "龜")
-        example["sentence1"] = example["content"].replace("#idiom#", "蠅")
+        example["sentence1"] = example["content"].replace("#idiom#", "淠")
        del example["content"]

        return example
@@ -675,11 +519,7 @@ def transform_chid(example,
    # Note: `#idiom#` represents an idiom placeholder, which must be replaced with a rarely-used Chinese character
    # so the label's position can be recovered after the text is processed by the tokenizer
    #ernie
-    # example["sentence1"] = example["content"].replace("#idiom#", "淠")
-    #albert
-    # example["sentence1"] = example["content"].replace("#idiom#", "龜")
-    #macbert
-    example["sentence1"] = example["content"].replace("#idiom#", "蠅")
+    example["sentence1"] = example["content"].replace("#idiom#", "淠")
    del example["content"]

    return example
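
The `#ernie` marker kept above records why the sentinel differed per backbone: each pretrained vocabulary is different, and the trick only holds if the chosen character never occurs in real inputs and tokenizes to exactly one piece. A quick check of that property (a sketch; the model name is an assumption):

    from paddlenlp.transformers import ErnieTokenizer

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")  # assumed backbone
    for sentinel in ["淠", "龜", "蠅"]:
        pieces = tokenizer.tokenize(sentinel)
        print(sentinel, pieces, len(pieces) == 1)  # want exactly one piece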
@@ -744,4 +584,4 @@ def transform_cluewsc(example,
    "csldcp": transform_csldcp,
    "cluewsc": transform_cluewsc,
    "chid": transform_chid
-    }
+}
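
This closing hunk ends the task-name-to-transform mapping, which lets the training script dispatch preprocessing by task. A hypothetical use inside this file (the dict's name, `TRANSFORM_FUNC`, is an assumption, since the assignment target sits outside the hunk):

    # pick the per-task preprocessor, then map it over the raw dataset
    transform_fn = TRANSFORM_FUNC["chid"]
    # each transform rewrites a raw FewCLUE example into the prompt form
    # consumed by the convert_*_example functions earlier in the file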