
Commit 843c69a

Add recall inference similarity (#1507)

* add recall inference similarity
* update examples
* update readme
* update dir name

1 parent 3012065 commit 843c69a

File tree: 6 files changed, +137 −40 lines

applications/neural_search/recall/domain_adaptive_pretraining/data_tools/process_data.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

applications/neural_search/recall/in_batch_negative/README.md

Lines changed: 9 additions & 1 deletion
@@ -412,10 +412,16 @@ sh scripts/export_model.sh
 
 ### Paddle Inference Prediction
 
+Prediction can either extract feature vectors or compute the similarity of two texts.
+
 Modify the id2corpus samples:
 
 ```
+# Extract feature vectors
 id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
+# Compute similarity
+corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
 
 ```
 
@@ -429,14 +435,16 @@ python deploy/python/predict.py --model_dir=./output
 ```
 sh deploy.sh
 ```
-The output is as follows
+The final output is the 256-dimensional feature vector and the cosine similarity of each sentence pair
 
 ```
 (1, 256)
 [[-0.0394925  -0.04474756 -0.065534    0.00939134  0.04359895  0.14659195
  -0.0091779  -0.07303623  0.09413272 -0.01255222 -0.08685658  0.02762237
   0.10138468  0.00962821  0.10888419  0.04553023  0.05898942  0.00694253
 ....
+
+[0.959269642829895, 0.04725276678800583]
 ```
 
 ## Reference
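The two bracketed scores at the end of the sample output are the pairwise cosine similarities that the new `predict` method computes with scipy (see the predict.py diff below). A minimal sketch of just that computation, assuming dummy 256-dimensional embeddings in place of real model output:

```python
# Sketch: pairwise cosine similarity as used in deploy/python/predict.py.
# scipy's cosine() is a distance, so similarity = 1 - distance.
import numpy as np
from scipy import spatial

rng = np.random.default_rng(0)
query_logits = rng.standard_normal((2, 256))  # stand-in query embeddings
title_logits = rng.standard_normal((2, 256))  # stand-in title embeddings

result = [float(1 - spatial.distance.cosine(q, t))
          for q, t in zip(query_logits, title_logits)]
print(result)  # one score per pair, in [-1, 1]
```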

applications/neural_search/recall/in_batch_negative/deploy/python/predict.py

Lines changed: 59 additions & 3 deletions
@@ -20,6 +20,7 @@
 import paddle
 import paddlenlp as ppnlp
 from scipy.special import softmax
+from scipy import spatial
 from paddle import inference
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.datasets import load_dataset
@@ -172,7 +173,7 @@ def __init__(self,
                          warmup=0,
                          logger=logger)
 
-    def predict(self, data, tokenizer):
+    def extract_embedding(self, data, tokenizer):
         """
         Predicts the data labels.
 
@@ -182,7 +183,7 @@ def predict(self, data, tokenizer):
                 which contains most of the methods. Users should refer to the superclass for more information regarding methods.
 
         Returns:
-            results(obj:`dict`): All the predictions labels.
+            results(obj:`dict`): All the feature vectors.
         """
         if args.benchmark:
             self.autolog.times.start()
@@ -213,6 +214,57 @@ def predict(self, data, tokenizer):
 
         return logits
 
+    def predict(self, data, tokenizer):
+        """
+        Predicts the similarity of text pairs.
+
+        Args:
+            data (obj:`List(List(str))`): The batch data, each element a [query, title] pair of raw texts.
+            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
+                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
+
+        Returns:
+            results(obj:`list`): The cosine similarity of each text pair.
+        """
+        if args.benchmark:
+            self.autolog.times.start()
+
+        examples = []
+        for idx, text in enumerate(data):
+            input_ids, segment_ids = convert_example(
+                {idx: text[0]}, tokenizer)
+            title_ids, title_segment_ids = convert_example({idx: text[1]}, tokenizer)
+            examples.append((input_ids, segment_ids, title_ids, title_segment_ids))
+
+        batchify_fn = lambda samples, fn=Tuple(
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query segment ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title segment ids
+        ): fn(samples)
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(examples)
+        self.input_handles[0].copy_from_cpu(query_ids)
+        self.input_handles[1].copy_from_cpu(query_segment_ids)
+        self.predictor.run()
+        query_logits = self.output_handle.copy_to_cpu()
+
+        self.input_handles[0].copy_from_cpu(title_ids)
+        self.input_handles[1].copy_from_cpu(title_segment_ids)
+        self.predictor.run()
+        title_logits = self.output_handle.copy_to_cpu()
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        if args.benchmark:
+            self.autolog.times.end(stamp=True)
+        result = [float(1 - spatial.distance.cosine(arr1, arr2)) for arr1, arr2 in zip(query_logits, title_logits)]
+        return result
+
 
 if __name__ == "__main__":
     # Define predictor to do prediction.
@@ -225,6 +277,10 @@ def predict(self, data, tokenizer):
     tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
     id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
     corpus_list = [{idx: text} for idx, text in id2corpus.items()]
-    res = predictor.predict(corpus_list, tokenizer)
+    res = predictor.extract_embedding(corpus_list, tokenizer)
     print(res.shape)
     print(res)
+    corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+                   ['中西方语言与文化的差异', '飞桨致力于让深度学习技术的创新与应用更简单']]
+    res = predictor.predict(corpus_list, tokenizer)
+    print(res)
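The `batchify_fn` above collates the four fields of each example with `Tuple` and `Pad` from `paddlenlp.data`. A minimal sketch of that collation pattern on toy ids (two fields instead of four, and `pad_val=0` standing in for `tokenizer.pad_token_id`):

```python
# Sketch: Tuple applies one collate fn per field of each example tuple;
# Pad right-pads every sequence in the batch to the longest length.
from paddlenlp.data import Pad, Tuple

examples = [
    ([1, 2, 3], [0, 0, 0]),              # (input_ids, segment_ids), length 3
    ([4, 5, 6, 7, 8], [0, 0, 0, 0, 0]),  # a longer example, length 5
]
batchify_fn = Tuple(
    Pad(axis=0, pad_val=0),  # pad input_ids
    Pad(axis=0, pad_val=0),  # pad segment_ids
)
input_ids, segment_ids = batchify_fn(examples)
print(input_ids.shape)  # (2, 5): the shorter example is padded to length 5
```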

applications/neural_search/recall/milvus/milvus_recall.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def search(self, vectors, collection_name, partition_tag=None):
 if __name__ == '__main__':
     import random
     client = RecallByMilvus()
-    collection_name = 'test1'
+    collection_name = 'literature_search'
     partition_tag = 'partition_3'
     embeddings = [[random.random() for _ in range(128)] for _ in range(2)]
     status, resultes = client.search(

applications/neural_search/recall/simcse/README.md

Lines changed: 9 additions & 1 deletion
@@ -398,10 +398,16 @@ sh scripts/export_model.sh
 
 ### Paddle Inference Prediction
 
+Prediction can either extract feature vectors or compute the similarity of two texts.
+
 Modify the id2corpus samples:
 
 ```
+# Extract feature vectors
 id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
+# Compute similarity
+corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
 
 ```
 Then use Paddle Inference:
@@ -414,14 +420,16 @@ python deploy/python/predict.py --model_dir=./output
 ```
 sh deploy.sh
 ```
-The final output is the 256-dimensional feature vector
+The final output is the 256-dimensional feature vector and the cosine similarity of each sentence pair
 
 ```
 (1, 256)
 [[-6.70653731e-02 -6.46873191e-03 -6.78317575e-03  1.66618153e-02
   7.20006898e-02 -9.79136024e-03 -1.38439541e-03  4.37440872e-02
   4.78115827e-02  1.33881137e-01  1.82927139e-02  3.23656537e-02
 .......
+
+[0.5649663209915161, 0.03284594044089317]
 ```
 
 
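Note that the pair scores in these outputs are cosine similarities, not softmax probabilities: for L2-normalized embeddings the same number is simply a dot product. A small numpy check of that identity on illustrative vectors:

```python
# Check: 1 - spatial.distance.cosine(a, b) == dot(a/|a|, b/|b|).
import numpy as np
from scipy import spatial

a = np.array([0.3, -0.1, 0.8])
b = np.array([0.2, 0.4, 0.5])

sim_scipy = 1 - spatial.distance.cosine(a, b)
sim_dot = float(np.dot(a / np.linalg.norm(a), b / np.linalg.norm(b)))
assert np.isclose(sim_scipy, sim_dot)
print(sim_scipy)  # a score in [-1, 1], not a probability
```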

applications/neural_search/recall/simcse/deploy/python/predict.py

Lines changed: 59 additions & 2 deletions
@@ -20,6 +20,7 @@
 import paddle
 import paddlenlp as ppnlp
 from scipy.special import softmax
+from scipy import spatial
 from paddle import inference
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.datasets import load_dataset
@@ -168,7 +169,7 @@ def __init__(self,
                          warmup=0,
                          logger=logger)
 
-    def predict(self, data, tokenizer):
+    def extract_embedding(self, data, tokenizer):
         """
         Predicts the data labels.
 
@@ -178,7 +179,7 @@ def predict(self, data, tokenizer):
                 which contains most of the methods. Users should refer to the superclass for more information regarding methods.
 
         Returns:
-            results(obj:`dict`): All the predictions labels.
+            results(obj:`dict`): All the feature vectors.
         """
         if args.benchmark:
             self.autolog.times.start()
@@ -209,6 +210,57 @@ def predict(self, data, tokenizer):
 
         return logits
 
+    def predict(self, data, tokenizer):
+        """
+        Predicts the similarity of text pairs.
+
+        Args:
+            data (obj:`List(List(str))`): The batch data, each element a [query, title] pair of raw texts.
+            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
+                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
+
+        Returns:
+            results(obj:`list`): The cosine similarity of each text pair.
+        """
+        if args.benchmark:
+            self.autolog.times.start()
+
+        examples = []
+        for idx, text in enumerate(data):
+            input_ids, segment_ids = convert_example(
+                {idx: text[0]}, tokenizer)
+            title_ids, title_segment_ids = convert_example({idx: text[1]}, tokenizer)
+            examples.append((input_ids, segment_ids, title_ids, title_segment_ids))
+
+        batchify_fn = lambda samples, fn=Tuple(
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query segment ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title input ids
+            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title segment ids
+        ): fn(samples)
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(examples)
+        self.input_handles[0].copy_from_cpu(query_ids)
+        self.input_handles[1].copy_from_cpu(query_segment_ids)
+        self.predictor.run()
+        query_logits = self.output_handle.copy_to_cpu()
+
+        self.input_handles[0].copy_from_cpu(title_ids)
+        self.input_handles[1].copy_from_cpu(title_segment_ids)
+        self.predictor.run()
+        title_logits = self.output_handle.copy_to_cpu()
+
+        if args.benchmark:
+            self.autolog.times.stamp()
+
+        if args.benchmark:
+            self.autolog.times.end(stamp=True)
+        result = [float(1 - spatial.distance.cosine(arr1, arr2)) for arr1, arr2 in zip(query_logits, title_logits)]
+        return result
+
 
 if __name__ == "__main__":
     # Define predictor to do prediction.
@@ -221,6 +273,11 @@ def predict(self, data, tokenizer):
     tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
     id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
     corpus_list = [{idx: text} for idx, text in id2corpus.items()]
+    res = predictor.extract_embedding(corpus_list, tokenizer)
     res = predictor.predict(corpus_list, tokenizer)
     print(res.shape)
     print(res)
+    corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
+                   ['中西方语言与文化的差异', '飞桨致力于让深度学习技术的创新与应用更简单']]
+    res = predictor.predict(corpus_list, tokenizer)
+    print(res)
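Both predict.py changes reuse one Paddle Inference predictor for two forward passes, embedding the query batch and then the title batch through the same handles. A minimal sketch of that pattern, assuming an exported model under ./output (the file names here mirror the export step and are assumptions, not part of this diff):

```python
# Sketch: one inference predictor, two forward passes.
# Assumed paths: ./output/inference.pdmodel and ./output/inference.pdiparams.
from paddle import inference

config = inference.Config("./output/inference.pdmodel",
                          "./output/inference.pdiparams")
predictor = inference.create_predictor(config)
input_handles = [predictor.get_input_handle(name)
                 for name in predictor.get_input_names()]
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

def embed(ids, segment_ids):
    # Copy one batch in, run the graph, copy the embeddings out.
    input_handles[0].copy_from_cpu(ids)
    input_handles[1].copy_from_cpu(segment_ids)
    predictor.run()
    return output_handle.copy_to_cpu()

# query_logits = embed(query_ids, query_segment_ids)
# title_logits = embed(title_ids, title_segment_ids)
```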
