-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathx_oag_bert.py
More file actions
51 lines (46 loc) · 2.01 KB
/
x_oag_bert.py
File metadata and controls
51 lines (46 loc) · 2.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from cogdl.oag import oagbert
import torch
import json
import numpy as np
from tqdm import tqdm
# Feature extraction: each of dpv12_train.json, dpv12_valid.json and
# dpv12_test.json is processed once to produce its feature matrix.
# Load the model; note that the oagbert-v2 checkpoint is chosen deliberately.
tokenizer, model = oagbert("oagbert-v2")
model.cuda()
# Inference only: evaluation mode keeps train-time layers such as dropout
# (if the network has any) from randomising the extracted embeddings.
model.eval()
# Dataset splits to process; each one is read from data/dpv12_<role>.json.
roles = ["train", "valid", "test"]
def _names(entries):
    """Return the ``name`` value of every dict in *entries*.

    ``None`` (e.g. a paper without a field-of-study list) yields ``[]``.
    """
    return [entry.get("name") for entry in entries or []]


def _as_batch(values, device):
    """Wrap *values* in a LongTensor, add a leading batch axis, move to *device*."""
    return torch.LongTensor(values).unsqueeze(0).to(device)


# Put the inputs on whatever device the model weights already live on.
device = next(model.parameters()).device

# Inference only: no_grad() avoids building an autograd graph for every paper.
with torch.no_grad():
    for role in roles:
        with open("data/dpv12_{}.json".format(role), "r", encoding="utf-8") as fin:
            data = json.load(fin)

        embedding = []
        for item in tqdm(data):
            # build_inputs returns eight values; the masked-LM labels, masked
            # positions and span count are not needed for a plain forward pass.
            (
                input_ids,
                input_masks,
                token_type_ids,
                _masked_lm_labels,
                position_ids,
                position_ids_second,
                _masked_positions,
                _num_spans,
            ) = model.build_inputs(
                title=item["title"],
                abstract=item["abstract"],
                authors=_names(item["authors"]),
                concepts=_names(item["fos"]),
            )

            # Run a forward pass through the model.
            # NOTE: position_ids_second now receives the value produced by
            # build_inputs; previously position_ids was passed here twice.
            sequence_output, pooled_output = model.bert.forward(
                input_ids=_as_batch(input_ids, device),
                token_type_ids=_as_batch(token_type_ids, device),
                attention_mask=_as_batch(input_masks, device),
                output_all_encoded_layers=False,
                checkpoint_activations=False,
                position_ids=_as_batch(position_ids, device),
                position_ids_second=_as_batch(position_ids_second, device),
            )

            # Drop the size-1 batch axis and keep the pooled vector on the host.
            pooled_output = torch.squeeze(pooled_output)
            embedding.append(pooled_output.cpu().detach().numpy())

        # One row per paper, written to data/tx_<role>.npy.
        em = np.array(embedding)
        np.save("data/tx_{}.npy".format(role), em)