
Commit 99673ad

Fix the WR bug and recall script bug (#2381)
* Fix the WR bug and recall script bug
* Cast pad type to int64
* Cast the input type to int64
* Cast pad to int64
1 parent 25b41ea commit 99673ad
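
The common thread in the diffs below is pinning every batchify collator to int64. paddlenlp.data.Pad pads a batch of variable-length id lists into a rectangular array; without an explicit dtype the result inherits NumPy's platform default integer width (notably int32 on Windows), which mismatches the int64 inputs the ERNIE models and exported inference graphs expect. A minimal sketch of the recurring pattern, with placeholder pad ids standing in for the scripts' tokenizer attributes:

# Sketch of the recurring fix, not any one script's exact code; the pad
# ids below are placeholders for tokenizer.pad_token_id / pad_token_type_id.
from paddlenlp.data import Pad, Stack, Tuple

pad_token_id = 0
pad_token_type_id = 0

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=pad_token_id, dtype="int64"),       # input_ids
    Pad(axis=0, pad_val=pad_token_type_id, dtype="int64"),  # token_type_ids
    Stack(dtype="int64"),                                   # label
): [data for data in fn(samples)]

samples = [([1, 2, 3], [0, 0, 0], 1),
           ([4, 5], [0, 0], 0)]
input_ids, token_type_ids, labels = batchify_fn(samples)
print(input_ids.dtype, input_ids.shape)  # int64 (2, 3); the short row is padded with 0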

File tree

18 files changed: +42 -79 lines changed


applications/neural_search/ranking/ernie_matching/evaluate.py

Lines changed: 2 additions & 2 deletions

@@ -132,8 +132,8 @@ def do_train():
         phase="eval")

     batchify_fn_eval = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pair_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # pair_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
         Stack(dtype="int64")  # label
     ): [data for data in fn(samples)]

applications/neural_search/ranking/ernie_matching/predict_pairwise.py

Lines changed: 2 additions & 2 deletions

@@ -94,8 +94,8 @@ def predict(model, data_loader):
         phase="predict")

     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment_ids
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # segment_ids
     ): [data for data in fn(samples)]

     valid_ds = load_dataset(

applications/neural_search/ranking/ernie_matching/train_pairwise.py

Lines changed: 6 additions & 6 deletions

@@ -148,15 +148,15 @@ def do_train():
         phase="eval")

     batchify_fn_train = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pos_pair_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # pos_pair_segment
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # neg_pair_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id)  # neg_pair_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pos_pair_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pos_pair_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # neg_pair_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64")  # neg_pair_segment
     ): [data for data in fn(samples)]

     batchify_fn_eval = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pair_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # pair_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
         Stack(dtype="int64")  # label
     ): [data for data in fn(samples)]

applications/neural_search/recall/in_batch_negative/deploy/python/web_service.py

Lines changed: 2 additions & 2 deletions

@@ -55,8 +55,8 @@ def preprocess(self, input_dicts, data_id, log_id):
                 self.tokenizer)
             examples.append((input_ids, segment_ids))
         batchify_fn = lambda samples, fn=Tuple(
-            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # input
-            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # segment
+            Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),  # input
+            Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),  # segment
         ): fn(samples)
         input_ids, segment_ids = batchify_fn(examples)
         feed_dict = {}
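
A side note on this serving preprocessor: both collators pad with self.tokenizer.pad_token_id, including the segment ids. For the ERNIE tokenizers these applications use, pad_token_id and pad_token_type_id are both 0, so the padded values coincide either way.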

applications/neural_search/recall/in_batch_negative/evaluate.py

Lines changed: 0 additions & 2 deletions

@@ -72,8 +72,6 @@ def recall(rs, N=10):
             relevance_labels = []

         text, recalled_text, cosine_sim = line.rstrip().split("\t")
-        if text == recalled_text:
-            continue
         if text2similar[text] == recalled_text:
             relevance_labels.append(1)
         else:
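
Why the two deleted lines matter: each line of the recall result file is one ranked candidate for a query, and recall(rs, N) consumes one 0/1 relevance list per query in rank order. Skipping rows whose candidate text equals the query text removed positions from the ranked list, silently promoting every later candidate into the top N and distorting Recall@N. A hypothetical sketch of the metric under that conventional definition (not the repo's exact code):

import numpy as np

def recall_at_n(rs, N=10):
    # One 0/1 relevance list per query, ordered by retrieval rank;
    # a query scores 1 if its relevant item appears in the top N.
    return np.mean([1.0 if np.sum(r[:N]) > 0 else 0.0 for r in rs])

rs = [[0, 1, 0, 0],   # query 1: hit at rank 2
      [0, 0, 0, 1]]   # query 2: hit at rank 4
print(recall_at_n(rs, N=2))  # 0.5
# Dropping a rank-1 row for query 2 (what the removed `continue` did)
# would shift its hit up a rank and change top-N membership.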

applications/neural_search/recall/in_batch_negative/recall.py

Lines changed: 2 additions & 2 deletions

@@ -82,8 +82,8 @@
         max_seq_length=args.max_seq_length)

     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # text_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # text_segment
     ): [data for data in fn(samples)]

     pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(

applications/neural_search/recall/in_batch_negative/train_batch_neg.py

Lines changed: 4 additions & 4 deletions

@@ -98,10 +98,10 @@ def do_train():
         max_seq_length=args.max_seq_length)

     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tilte_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # tilte_segment
     ): [data for data in fn(samples)]

     train_data_loader = create_dataloader(

applications/neural_search/recall/simcse/evaluate.py

Lines changed: 0 additions & 4 deletions

@@ -68,14 +68,10 @@ def recall(rs, N=10):
             relevance_labels = []

         text, recalled_text, cosine_sim = line.rstrip().split("\t")
-        if text == recalled_text:
-            continue
         if text2similar[text] == recalled_text:
             relevance_labels.append(1)
         else:
             relevance_labels.append(0)
-        # print(len(rs))
-        # print(rs[:50])

     recall_N = []
     recall_num = [1, 5, 10, 20, 50]
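
Same ranked-list fix as in in_batch_negative/evaluate.py above: candidate rows are no longer skipped, so every recalled item keeps its rank position; the leftover debug prints go as well.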

applications/neural_search/recall/simcse/recall.py

Lines changed: 2 additions & 2 deletions

@@ -70,8 +70,8 @@
         max_seq_length=args.max_seq_length)

     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # text_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # text_segment
     ): [data for data in fn(samples)]

     pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(

applications/neural_search/recall/simcse/train.py

Lines changed: 4 additions & 33 deletions

@@ -66,35 +66,6 @@ def set_seed(seed):
     np.random.seed(seed)
     paddle.seed(seed)

-def do_evaluate(model, tokenizer, data_loader, with_pooler=False):
-    model.eval()
-
-    total_num = 0
-    spearman_corr = 0.0
-    sims = []
-    labels = []
-
-    for batch in data_loader:
-        query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids, label = batch
-        total_num += len(label)
-
-        query_cls_embedding = model.get_pooled_embedding(
-            query_input_ids, query_token_type_ids, with_pooler=with_pooler)
-
-        title_cls_embedding = model.get_pooled_embedding(title_input_ids, title_token_type_ids, with_pooler=with_pooler)
-
-        cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
-
-        sims.append(cosine_sim.numpy())
-        labels.append(label.numpy())
-
-    sims = np.concatenate(sims, axis=0)
-    labels = np.concatenate(labels, axis=0)
-
-    spearman_corr = stats.spearmanr(labels, sims).correlation
-    model.train()
-    return spearman_corr, total_num
-
 def do_train():
     paddle.set_device(args.device)
     rank = paddle.distributed.get_rank()

@@ -121,10 +92,10 @@ def do_train():
         max_seq_length=args.max_seq_length)

     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tilte_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query_segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title_input
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # tilte_segment
     ): [data for data in fn(samples)]
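
For context, the deleted do_evaluate scored SimCSE checkpoints by the Spearman correlation between predicted cosine similarities and gold similarity labels, via scipy's stats.spearmanr as the removed lines show. A minimal standalone sketch of that metric:

import numpy as np
from scipy import stats

# Spearman correlation is rank-based: any order-preserving transform
# of the similarities leaves the score unchanged.
labels = np.array([0.1, 0.5, 0.9, 0.3])
sims = np.array([0.2, 0.6, 0.95, 0.4])
print(stats.spearmanr(labels, sims).correlation)       # 1.0 (identical ranking)
print(stats.spearmanr(labels, sims ** 3).correlation)  # still 1.0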
