
Commit 97b52bc

Author: tianxin

add ernie_matching point-wise & pair-wise (#404)
* add ernie_matching point-wise & pair-wise
* Rename some Class
* fix some typo
1 parent 2e64244 commit 97b52bc

File tree

6 files changed: +919 −0 lines changed
Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import numpy as np

from paddlenlp.datasets import MapDataset


def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)

    shuffle = True if mode == 'train' else False
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)


def read_text_pair(data_path):
    """Reads tab-separated query/title pairs, skipping malformed lines."""
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = line.rstrip().split("\t")
            if len(data) != 2:
                continue
            yield {'query': data[0], 'title': data[1]}


def convert_pointwise_example(example,
                              tokenizer,
                              max_seq_length=512,
                              is_test=False):
    query, title = example["query"], example["title"]

    encoded_inputs = tokenizer(
        text=query, text_pair=title, max_seq_len=max_seq_length)

    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]

    if not is_test:
        label = np.array([example["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids


def convert_pairwise_example(example,
                             tokenizer,
                             max_seq_length=512,
                             phase="train"):
    if phase == "train":
        query, pos_title, neg_title = example["query"], example[
            "title"], example["neg_title"]

        pos_inputs = tokenizer(
            text=query, text_pair=pos_title, max_seq_len=max_seq_length)
        neg_inputs = tokenizer(
            text=query, text_pair=neg_title, max_seq_len=max_seq_length)

        pos_input_ids = pos_inputs["input_ids"]
        pos_token_type_ids = pos_inputs["token_type_ids"]
        neg_input_ids = neg_inputs["input_ids"]
        neg_token_type_ids = neg_inputs["token_type_ids"]

        return (pos_input_ids, pos_token_type_ids, neg_input_ids,
                neg_token_type_ids)
    else:
        query, title = example["query"], example["title"]

        inputs = tokenizer(
            text=query, text_pair=title, max_seq_len=max_seq_length)

        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        if phase == "eval":
            return input_ids, token_type_ids, example["label"]
        elif phase == "predict":
            return input_ids, token_type_ids
        else:
            raise ValueError("not supported phase: {}".format(phase))


def gen_pair(dataset, pool_size=100):
    """
    Generate triplets randomly from a pointwise dataset.

    Args:
        dataset: A `MapDataset` whose examples hold two texts and a label:
            example["query"], example["title"], example["label"].
        pool_size: The number of positive examples pooled together before
            negative titles are sampled from the pool.

    Returns:
        dataset: A `MapDataset` whose examples hold three texts:
            example["query"], example["title"] (positive) and
            example["neg_title"] (randomly sampled negative).
    """

    if len(dataset) < pool_size:
        pool_size = len(dataset)

    new_examples = []
    pool = []
    tmp_examples = []

    for example in dataset:
        label = example["label"]

        # Filter out negative examples; only positive pairs are kept.
        if label == 0:
            continue

        tmp_examples.append(example)
        pool.append(example["title"])

        # Once the pool is full, shuffle the pooled titles and assign one
        # to each pooled example as its randomly drawn negative title.
        if len(pool) >= pool_size:
            np.random.shuffle(pool)
            for idx, example in enumerate(tmp_examples):
                example["neg_title"] = pool[idx]
                new_examples.append(example)
            tmp_examples = []
            pool = []
        else:
            continue
    return MapDataset(new_examples)
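
For orientation, a minimal usage sketch of the helpers above, as if run in the same module; the three toy examples are invented for illustration and are not part of the commit:

from paddlenlp.datasets import MapDataset

raw = MapDataset([
    {"query": "q1", "title": "t1", "label": 1},
    {"query": "q2", "title": "t2", "label": 1},
    {"query": "q3", "title": "t3", "label": 0},  # label 0: dropped by gen_pair
])

# Pool the positive titles, shuffle them, and attach one to each example
# as a randomly drawn neg_title.
triplets = gen_pair(raw, pool_size=2)
for ex in triplets:
    print(ex["query"], ex["title"], ex["neg_title"])

Note that with a small pool an example can draw its own title back as its negative; the code does not exclude that case.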
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class PointwiseMatching(nn.Layer):
    def __init__(self, pretrained_model, dropout=None):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

        # num_labels = 2 (similar or dissimilar)
        self.classifier = nn.Linear(self.ptm.config["hidden_size"], 2)

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
                                    attention_mask)

        cls_embedding = self.dropout(cls_embedding)
        logits = self.classifier(cls_embedding)
        probs = F.softmax(logits)

        return probs


class PairwiseMatching(nn.Layer):
    def __init__(self, pretrained_model, dropout=None, margin=0.1):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        self.margin = margin

        # hidden_size -> 1: a single scalar similarity score
        self.similarity = nn.Linear(self.ptm.config["hidden_size"], 1)

    def predict(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
                                    attention_mask)

        cls_embedding = self.dropout(cls_embedding)
        sim_score = self.similarity(cls_embedding)
        sim_score = F.sigmoid(sim_score)

        return sim_score

    def forward(self,
                pos_input_ids,
                neg_input_ids,
                pos_token_type_ids=None,
                neg_token_type_ids=None,
                pos_position_ids=None,
                neg_position_ids=None,
                pos_attention_mask=None,
                neg_attention_mask=None):
        _, pos_cls_embedding = self.ptm(pos_input_ids, pos_token_type_ids,
                                        pos_position_ids, pos_attention_mask)

        _, neg_cls_embedding = self.ptm(neg_input_ids, neg_token_type_ids,
                                        neg_position_ids, neg_attention_mask)

        pos_embedding = self.dropout(pos_cls_embedding)
        neg_embedding = self.dropout(neg_cls_embedding)

        pos_sim = self.similarity(pos_embedding)
        neg_sim = self.similarity(neg_embedding)

        pos_sim = F.sigmoid(pos_sim)
        neg_sim = F.sigmoid(neg_sim)

        labels = paddle.full(
            shape=[pos_cls_embedding.shape[0]], fill_value=1.0, dtype='float32')

        loss = F.margin_ranking_loss(
            pos_sim, neg_sim, labels, margin=self.margin)

        return loss
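
A worked note on the objective: with the labels fixed to 1.0, paddle.nn.functional.margin_ranking_loss reduces to max(0, neg_sim - pos_sim + margin), so the positive pair must outscore the negative pair by at least margin before the loss reaches zero. A standalone numpy sketch of that reduction (illustrative only, not part of the commit):

import numpy as np

def margin_ranking_loss(pos_sim, neg_sim, margin=0.1):
    # Hinge on the score gap: zero loss once pos_sim >= neg_sim + margin.
    return np.maximum(0.0, neg_sim - pos_sim + margin)

print(margin_ranking_loss(0.9, 0.2))  # 0.0 -> ranking already satisfied
print(margin_ranking_loss(0.5, 0.6))  # 0.2 -> negative outranks the positive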
Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import argparse
import os

import numpy as np
import paddle
import paddlenlp as ppnlp
from paddlenlp.datasets import load_dataset
from paddlenlp.data import Tuple, Pad

from data import create_dataloader, read_text_pair
from data import convert_pairwise_example as convert_example
from model import PairwiseMatching

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True, help="The full path of the input file.")
parser.add_argument("--params_path", type=str, required=True, help="The path of the model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. "
    "Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for prediction.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run the model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable


def predict(model, data_loader):
    """
    Predicts similarity scores for text pairs.

    Args:
        model (obj:`PairwiseMatching`): A model that scores the similarity of a text pair.
        data_loader (obj:`paddle.io.DataLoader`): Batches of processed text-pair ids: [input_ids, token_type_ids].

    Returns:
        results (obj:`np.ndarray`): Similarity probabilities of the text pairs.
    """
    batch_probs = []

    model.eval()

    with paddle.no_grad():
        for batch_data in data_loader:
            input_ids, token_type_ids = batch_data

            input_ids = paddle.to_tensor(input_ids)
            token_type_ids = paddle.to_tensor(token_type_ids)

            batch_prob = model.predict(
                input_ids=input_ids, token_type_ids=token_type_ids).numpy()

            batch_probs.append(batch_prob)

    batch_probs = np.concatenate(batch_probs, axis=0)

    return batch_probs


if __name__ == "__main__":
    paddle.set_device(args.device)

    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        phase="predict")

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment_ids
    ): [data for data in fn(samples)]

    valid_ds = load_dataset(
        read_text_pair, data_path=args.input_file, lazy=False)

    valid_data_loader = create_dataloader(
        valid_ds,
        mode='predict',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        "ernie-1.0")

    model = PairwiseMatching(pretrained_model)

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)
    else:
        raise ValueError(
            "Please set --params_path with a correct pretrained model file")

    y_probs = predict(model, valid_data_loader)

    # Reload the raw pairs (the first load was consumed by the trans_fn
    # mapping) so each prediction prints next to its original text pair.
    valid_ds = load_dataset(
        read_text_pair, data_path=args.input_file, lazy=False)
    for idx, prob in enumerate(y_probs):
        text_pair = valid_ds[idx]
        text_pair["pred_prob"] = prob[0]
        print(text_pair)
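
A hypothetical invocation of the script above — its filename is not shown in this view, and both paths below are placeholders:

python predict_pairwise.py \
    --device gpu \
    --params_path ./checkpoints/model_state.pdparams \
    --input_file ./test.tsv \
    --batch_size 32

The input file is expected to hold one tab-separated query/title pair per line (see read_text_pair); each printed result is the input pair with a pred_prob field appended.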
