
Commit 843c69a

Add recall inference similarity (#1507)

* add recall inference similarity
* update examples
* update readme
* update dir name

1 parent 3012065 commit 843c69a

File tree: 6 files changed, +137 −40 lines

applications/neural_search/recall/domain_adaptive_pretraining/data_tools/process_data.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

applications/neural_search/recall/in_batch_negative/README.md

Lines changed: 9 additions & 1 deletion
@@ -412,10 +412,16 @@ sh scripts/export_model.sh
 
 ### Paddle Inference Prediction
 
+Prediction can either extract feature vectors or compute the similarity of two texts.
+
 Modify the id2corpus samples:
 
 ```
+# Extract feature vectors
 id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
+# Compute similarity
+corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
 
 ```
 
@@ -429,14 +435,16 @@ python deploy/python/predict.py --model_dir=./output
 ```
 sh deploy.sh
 ```
-The output is as follows
+The final output is the 256-dimensional feature vector and the cosine similarity of each sentence pair
 
 ```
 (1, 256)
 [[-0.0394925  -0.04474756 -0.065534    0.00939134  0.04359895  0.14659195
  -0.0091779  -0.07303623  0.09413272 -0.01255222 -0.08685658  0.02762237
   0.10138468  0.00962821  0.10888419  0.04553023  0.05898942  0.00694253
 ....
+
+[0.959269642829895, 0.04725276678800583]
 ```
 
 ## Reference
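The two bracketed scores at the end of the sample output are the pairwise cosine similarities that the new `predict` method computes with scipy (see the predict.py diff below). A minimal sketch of just that computation, assuming dummy 256-dimensional embeddings in place of real model output:

```python
# Sketch: pairwise cosine similarity as used in deploy/python/predict.py.
# scipy's cosine() is a distance, so similarity = 1 - distance.
import numpy as np
from scipy import spatial

rng = np.random.default_rng(0)
query_logits = rng.standard_normal((2, 256))  # stand-in query embeddings
title_logits = rng.standard_normal((2, 256))  # stand-in title embeddings

result = [float(1 - spatial.distance.cosine(q, t))
          for q, t in zip(query_logits, title_logits)]
print(result)  # one score per pair, in [-1, 1]
```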

applications/neural_search/recall/in_batch_negative/deploy/python/predict.py

Lines changed: 59 additions & 3 deletions
@@ -20,6 +20,7 @@
 import paddle
 import paddlenlp as ppnlp
 from scipy.special import softmax
+from scipy import spatial
 from paddle import inference
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.datasets import load_dataset
@@ -172,7 +173,7 @@ def __init__(self,
                          warmup=0,
                          logger=logger)
 
-    def predict(self, data, tokenizer):
+    def extract_embedding(self, data, tokenizer):
         """
         Predicts the data labels.
 
@@ -182,7 +183,7 @@ def predict(self, data, tokenizer):
                 which contains most of the methods. Users should refer to the superclass for more information regarding methods.
 
         Returns:
-            results(obj:`dict`): All the predictions labels.
+            results(obj:`dict`): All the feature vectors.
         """
         if args.benchmark:
             self.autolog.times.start()
@@ -213,6 +214,57 @@ def predict(self, data, tokenizer):
 
         return logits
 
+    def predict(self, data, tokenizer):
+        """
+        Predicts the similarity of text pairs.
+
+        Args:
+            data (obj:`List(List(str))`): The batch data, each element a [query, title] pair of raw texts.
+            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
+                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
+
+        Returns:
+            results(obj:`list`): The cosine similarity of each text pair.
+        """
+        if args.benchmark:
+            self.autolog.times.start()
+
+        examples = []
+        for idx, text in enumerate(data):
+            input_ids, segment_ids = convert_example(
+                {idx: text[0]}, tokenizer)
+            title_ids, title_segment_ids = convert_example({idx: text[1]}, tokenizer)
+            examples.append((input_ids, segment_ids, title_ids, title_segment_ids))
+
+        batchify_fn = lambda samples, fn=Tuple(
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query segment ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title segment ids
+        ): fn(samples)
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(examples)
+        self.input_handles[0].copy_from_cpu(query_ids)
+        self.input_handles[1].copy_from_cpu(query_segment_ids)
+        self.predictor.run()
+        query_logits = self.output_handle.copy_to_cpu()
+
+        self.input_handles[0].copy_from_cpu(title_ids)
+        self.input_handles[1].copy_from_cpu(title_segment_ids)
+        self.predictor.run()
+        title_logits = self.output_handle.copy_to_cpu()
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        if args.benchmark:
+            self.autolog.times.end(stamp=True)
+        result = [float(1 - spatial.distance.cosine(arr1, arr2)) for arr1, arr2 in zip(query_logits, title_logits)]
+        return result
+
 
 if __name__ == "__main__":
     # Define predictor to do prediction.
@@ -225,6 +277,10 @@ def predict(self, data, tokenizer):
     tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
     id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
     corpus_list = [{idx: text} for idx, text in id2corpus.items()]
-    res = predictor.predict(corpus_list, tokenizer)
+    res = predictor.extract_embedding(corpus_list, tokenizer)
     print(res.shape)
     print(res)
+    corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+                   ['中西方语言与文化的差异', '飞桨致力于让深度学习技术的创新与应用更简单']]
+    res = predictor.predict(corpus_list, tokenizer)
+    print(res)
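The `batchify_fn` above collates the four fields of each example with `Tuple` and `Pad` from `paddlenlp.data`. A minimal sketch of that collation pattern on toy ids (two fields instead of four, and `pad_val=0` standing in for `tokenizer.pad_token_id`):

```python
# Sketch: Tuple applies one collate fn per field of each example tuple;
# Pad right-pads every sequence in the batch to the longest length.
from paddlenlp.data import Pad, Tuple

examples = [
    ([1, 2, 3], [0, 0, 0]),              # (input_ids, segment_ids), length 3
    ([4, 5, 6, 7, 8], [0, 0, 0, 0, 0]),  # a longer example, length 5
]
batchify_fn = Tuple(
    Pad(axis=0, pad_val=0),  # pad input_ids
    Pad(axis=0, pad_val=0),  # pad segment_ids
)
input_ids, segment_ids = batchify_fn(examples)
print(input_ids.shape)  # (2, 5): the shorter example is padded to length 5
```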

applications/neural_search/recall/milvus/milvus_recall.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def search(self, vectors, collection_name, partition_tag=None):
 if __name__ == '__main__':
     import random
     client = RecallByMilvus()
-    collection_name = 'test1'
+    collection_name = 'literature_search'
     partition_tag = 'partition_3'
     embeddings = [[random.random() for _ in range(128)] for _ in range(2)]
     status, resultes = client.search(

applications/neural_search/recall/simcse/README.md

Lines changed: 9 additions & 1 deletion
@@ -398,10 +398,16 @@ sh scripts/export_model.sh
 
 ### Paddle Inference Prediction
 
+Prediction can either extract feature vectors or compute the similarity of two texts.
+
 Modify the id2corpus samples:
 
 ```
+# Extract feature vectors
 id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
+# Compute similarity
+corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
 
 ```
 Then use Paddle Inference:
@@ -414,14 +420,16 @@ python deploy/python/predict.py --model_dir=./output
 ```
 sh deploy.sh
 ```
-The final output is the 256-dimensional feature vector
+The final output is the 256-dimensional feature vector and the cosine similarity of each sentence pair
 
 ```
 (1, 256)
 [[-6.70653731e-02 -6.46873191e-03 -6.78317575e-03  1.66618153e-02
   7.20006898e-02 -9.79136024e-03 -1.38439541e-03  4.37440872e-02
   4.78115827e-02  1.33881137e-01  1.82927139e-02  3.23656537e-02
 .......
+
+[0.5649663209915161, 0.03284594044089317]
 ```
 
 
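Note that the pair scores in these outputs are cosine similarities, not softmax probabilities: for L2-normalized embeddings the same number is simply a dot product. A small numpy check of that identity on illustrative vectors:

```python
# Check: 1 - spatial.distance.cosine(a, b) == dot(a/|a|, b/|b|).
import numpy as np
from scipy import spatial

a = np.array([0.3, -0.1, 0.8])
b = np.array([0.2, 0.4, 0.5])

sim_scipy = 1 - spatial.distance.cosine(a, b)
sim_dot = float(np.dot(a / np.linalg.norm(a), b / np.linalg.norm(b)))
assert np.isclose(sim_scipy, sim_dot)
print(sim_scipy)  # a score in [-1, 1], not a probability
```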

applications/neural_search/recall/simcse/deploy/python/predict.py

Lines changed: 59 additions & 2 deletions
@@ -20,6 +20,7 @@
 import paddle
 import paddlenlp as ppnlp
 from scipy.special import softmax
+from scipy import spatial
 from paddle import inference
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.datasets import load_dataset
@@ -168,7 +169,7 @@ def __init__(self,
                          warmup=0,
                          logger=logger)
 
-    def predict(self, data, tokenizer):
+    def extract_embedding(self, data, tokenizer):
         """
         Predicts the data labels.
 
@@ -178,7 +179,7 @@ def predict(self, data, tokenizer):
                 which contains most of the methods. Users should refer to the superclass for more information regarding methods.
 
         Returns:
-            results(obj:`dict`): All the predictions labels.
+            results(obj:`dict`): All the feature vectors.
         """
         if args.benchmark:
             self.autolog.times.start()
@@ -209,6 +210,57 @@ def predict(self, data, tokenizer):
 
         return logits
 
+    def predict(self, data, tokenizer):
+        """
+        Predicts the similarity of text pairs.
+
+        Args:
+            data (obj:`List(List(str))`): The batch data, each element a [query, title] pair of raw texts.
+            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
+                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
+
+        Returns:
+            results(obj:`list`): The cosine similarity of each text pair.
+        """
+        if args.benchmark:
+            self.autolog.times.start()
+
+        examples = []
+        for idx, text in enumerate(data):
+            input_ids, segment_ids = convert_example(
+                {idx: text[0]}, tokenizer)
+            title_ids, title_segment_ids = convert_example({idx: text[1]}, tokenizer)
+            examples.append((input_ids, segment_ids, title_ids, title_segment_ids))
+
+        batchify_fn = lambda samples, fn=Tuple(
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query segment ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title segment ids
+        ): fn(samples)
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(examples)
+        self.input_handles[0].copy_from_cpu(query_ids)
+        self.input_handles[1].copy_from_cpu(query_segment_ids)
+        self.predictor.run()
+        query_logits = self.output_handle.copy_to_cpu()
+
+        self.input_handles[0].copy_from_cpu(title_ids)
+        self.input_handles[1].copy_from_cpu(title_segment_ids)
+        self.predictor.run()
+        title_logits = self.output_handle.copy_to_cpu()
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        if args.benchmark:
+            self.autolog.times.end(stamp=True)
+        result = [float(1 - spatial.distance.cosine(arr1, arr2)) for arr1, arr2 in zip(query_logits, title_logits)]
+        return result
+
 
 if __name__ == "__main__":
     # Define predictor to do prediction.
@@ -221,6 +273,11 @@ def predict(self, data, tokenizer):
     tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
     id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
     corpus_list = [{idx: text} for idx, text in id2corpus.items()]
+    res = predictor.extract_embedding(corpus_list, tokenizer)
     res = predictor.predict(corpus_list, tokenizer)
     print(res.shape)
     print(res)
+    corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+                   ['中西方语言与文化的差异', '飞桨致力于让深度学习技术的创新与应用更简单']]
+    res = predictor.predict(corpus_list, tokenizer)
+    print(res)
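Both predict.py changes reuse one Paddle Inference predictor for two forward passes, embedding the query batch and then the title batch through the same handles. A minimal sketch of that pattern, assuming an exported model under ./output (the file names here mirror the export step and are assumptions, not part of this diff):

```python
# Sketch: one inference predictor, two forward passes.
# Assumed paths: ./output/inference.pdmodel and ./output/inference.pdiparams.
from paddle import inference

config = inference.Config("./output/inference.pdmodel",
                          "./output/inference.pdiparams")
predictor = inference.create_predictor(config)
input_handles = [predictor.get_input_handle(name)
                 for name in predictor.get_input_names()]
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

def embed(ids, segment_ids):
    # Copy one batch in, run the graph, copy the embeddings out.
    input_handles[0].copy_from_cpu(ids)
    input_handles[1].copy_from_cpu(segment_ids)
    predictor.run()
    return output_handle.copy_to_cpu()

# query_logits = embed(query_ids, query_segment_ids)
# title_logits = embed(title_ids, title_segment_ids)
```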
