-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredict.py
More file actions
128 lines (104 loc) · 4.65 KB
/
predict.py
File metadata and controls
128 lines (104 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# from dataloader import CreateDataset
# from bert3head.model import BertLinear3HEAD
# import torch
# model = BertLinear3HEAD("vinai/phobert-base-v2")
# model.load_state_dict(torch.load('models/linear/3head-boosting2.pt'))
# model = model.cuda()
# def remove_number(text):
# new_text = ''
# for char in text:
# if not char.isdigit():
# new_text += char
# return new_text
# import numpy as np
# import pandas as pd
# from tqdm import tqdm
# df = pd.read_csv('label_gemini_vi_v2.csv')
# df.dropna(inplace=True)
# print(df.shape)
# df.reset_index(inplace=True)
# df['text'] = df['text'].apply(lambda x: remove_number(x))
# dataloader = CreateDataset(df['text'], df['sentiment'], df['classification'],"vinai/phobert-base-v2",128, shuffle=False).label()
# sentences_index = []
# sentiments = []
# classifications = []
# model.eval()
# with torch.no_grad():
# for sentence, input_ids, attention_mask in tqdm(dataloader):
# sentences_index.extend(sentence.detach().cpu().numpy())
# input_ids = input_ids.cuda()
# attention_mask = attention_mask.cuda()
# sen, clas = model(input_ids, attention_mask)
# sen = sen.detach().cpu().numpy()
# clas = clas.detach().cpu().numpy()
# sen = sen.argmax(axis=1).flatten()
# clas = clas.argmax(axis=1).flatten()
# sentiments.extend(sen)
# classifications.extend(clas)
# sentences_index = np.array(sentences_index)
# sentences = df['text'][sentences_index]
# sentiments = np.array(sentiments)
# classifications = np.array(classifications)
# new_df = pd.DataFrame({'text':sentences, 'sentiment':sentiments, 'classification':classifications})
# new_df.to_csv('label_gemini_vi_v2_linear3head_v2.csv', index=False)
# import torch
# from trainer.base_trainer import Trainer
# from utils.dataloader import CreateDataset
# import pandas as pd
# test_set = pd.read_csv('dataset/test_set.csv')
# task = 'sentiment'
# bert_name = 'xlm-roberta-base'
# trainer = Trainer(bert_name, task)
# test_data_loader = CreateDataset(test_set['text'], test_set['sentiment'],test_set['classification'], bert_name, batch_size=128).todataloader()
# bert_name = bert_name.split('/')[-1]
# acc, f1m, f1w = trainer.evaluate(test_data_loader, save_name=f"{bert_name}-{task}")
# print(f'Final Prediction\n Model :{bert_name} | {task}\nAcc: {acc}, f1m: {f1m}, f1w: {f1w}')
import torch
# from trainer.mlm_head_trainer import Trainer
from trainer.head_trainer import Trainer
from architecture.bert2head.model import BertLinear2HEAD
from utils.dataloader import CreateDataset
import pandas as pd
import gc
# Validation-set inference script.
#
# Restores a BertLinear2HEAD checkpoint, runs it over the validation split and
# writes the ground-truth and predicted labels of both heads (sentiment and
# classification) side by side to a CSV file.

bert_name = 'vinai/phobert-base-v2'
batch_size = 128
checkpoint_path = 'models/linear/phobert-base-v2-epoch6.pt'
output_csv = 'phobert-base-v2-epoch6_val.csv'

# Use the GPU when one is present, otherwise fall back to the CPU so the
# script can still run on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 'vinai' backbones read the *_processed.csv variants of each split
# (presumably text pre-processed for PhoBERT -- TODO confirm).
# NOTE(review): only val_set is consumed below; train_set and test_set are
# loaded but never used in this script.
if 'vinai' in bert_name:
    train_set = pd.read_csv('dataset/train_set_processed.csv')
    test_set = pd.read_csv('dataset/test_set_processed.csv')
    val_set = pd.read_csv('dataset/val_set_processed.csv')
else:
    train_set = pd.read_csv('dataset/train_set.csv')
    test_set = pd.read_csv('dataset/test_set.csv')
    val_set = pd.read_csv('dataset/val_set.csv')

dataloader = CreateDataset(
    val_set['text'],
    val_set['sentiment'],
    val_set['classification'],
    bert_name,
    batch_size=batch_size,
).todataloader()

# Earlier Trainer-based evaluation, kept for reference:
# trainer=Trainer(bert_name, train_data_loader, val_data_loader)
# bert_name = bert_name.split('/')[-1]
# valid_loss, valid_accs, valid_f1s = trainer.eval(test_data_loader, f"{bert_name}-epoch17")
# print(f'\tVal.acc se : {valid_accs[0]*100:.2f}% | Val.acc ca : {valid_accs[1]*100:.2f}%')
# print(f'\tVal.F1m se : {valid_f1s[0]*100:.2f} | Val.F1m ca : {valid_f1s[1]*100:.2f}')
# print(f'\tVal.F1w se : {valid_f1s[2]*100:.2f} | Val.F1w ca : {valid_f1s[3]*100:.2f}')

model = BertLinear2HEAD(bert_name)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU run can also be restored on a CPU-only host.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model = model.to(device)
model.eval()

sentiments = []       # predicted sentiment label per sample
classifications = []  # predicted classification label per sample
sen_true = []         # ground-truth sentiment label per sample
clas_true = []        # ground-truth classification label per sample

with torch.no_grad():
    # Each batch unpacks into the two model inputs followed by the two
    # ground-truth label tensors.
    for input_ids, attention_mask, b_sent, b_class in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # The model returns one output per head.
        sen, clas = model(input_ids, attention_mask)

        # Pick the highest-scoring index along axis 1 for each head
        # (assumes outputs are (batch, num_classes) -- TODO confirm).
        sen = sen.detach().cpu().numpy().argmax(axis=1).flatten()
        clas = clas.detach().cpu().numpy().argmax(axis=1).flatten()

        sentiments.extend(sen)
        classifications.extend(clas)
        sen_true.extend(b_sent.cpu().numpy())
        clas_true.extend(b_class.cpu().numpy())

df = pd.DataFrame({
    'sentiment': sen_true,
    'classification': clas_true,
    'sentiment_pred': sentiments,
    'classification_pred': classifications,
})
df.to_csv(output_csv, index=False)