-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlp.py
More file actions
68 lines (57 loc) · 2.08 KB
/
nlp.py
File metadata and controls
68 lines (57 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import csv
from ocsvm import train_ocsvm
import torch
from sklearn import decomposition
import argparse
import numpy as np
# Command-line arguments for the BERT + OCSVM outlier-detection experiment.
# (The original description said 'PyTorch CIFAR10 Training' — copy-paste
# residue from an unrelated script.)
parser = argparse.ArgumentParser(description='BERT + OCSVM outlier detection on NLP review datasets')
parser.add_argument('--gamma', default=0.1, type=float, help='gamma parameter for OCSVM')
args = parser.parse_args()
def read_dataset(csv_path, delimiter):
    """Read a delimited text file and return all rows.

    Args:
        csv_path: Path to the CSV/TSV file.
        delimiter: Single-character field delimiter (e.g. ',' or '\t').

    Returns:
        A list of rows, each row a list of string fields.
    """
    # newline="" is required by the csv module so quoted fields containing
    # embedded newlines are parsed correctly; utf-8 avoids depending on the
    # machine's locale encoding.
    with open(csv_path, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=delimiter)
        return list(reader)
# --- Feature extraction: embed review texts with BERT's pooled output ---
amazon_review_data = read_dataset("../../nlp_dataset/yelp_review_polarity_csv/test.csv", ',')
print(amazon_review_data[0][1])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
# Pick the device first, then move the model.  The original called
# model.cuda() unconditionally BEFORE the availability check, which crashes
# on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # disable dropout so the embeddings are deterministic


def _embed_rows(rows, count):
    """Return BERT pooler_output vectors for the text column rows[i][1], i < count.

    Malformed rows (missing text column) and inputs the model rejects
    (e.g. sequences longer than the model limit) are skipped, mirroring
    the original best-effort behaviour.
    """
    embeddings = []
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        for i in range(min(count, len(rows))):
            try:
                inputs = tokenizer(rows[i][1], return_tensors="pt").to(device)
                outputs = model(**inputs)
                embeddings.append(outputs.pooler_output.cpu().numpy()[0])
            except (IndexError, RuntimeError):
                # IndexError: row has no text column; RuntimeError: input too
                # long / device error.  Narrowed from the original bare except.
                continue
    return embeddings


inlier_data = _embed_rows(amazon_review_data, 4000)
print("Number of inlier data: ", len(inlier_data))

imdb_review_data = read_dataset("../../nlp_dataset/IMDB_data/test.csv", '\t')
print(imdb_review_data[0])
outlier_data = _embed_rows(imdb_review_data, 300)
print("Inference done!")
# --- Dimensionality reduction + one-class SVM evaluation ---
all_data = np.concatenate((np.array(inlier_data), np.array(outlier_data)))
print(all_data.shape)

# Project the 768-d BERT embeddings down to 10 PCA components for the OCSVM.
pca = decomposition.PCA(n_components=10)
X = pca.fit_transform(all_data)

outlier_indices = train_ocsvm(X, gamma=args.gamma)

# Rows are ordered [inliers..., outliers...], so the true outliers occupy
# indices >= len(inlier_data).  The original compared with '>' which
# off-by-one excluded the first outlier row from the true-positive count.
n_inliers = len(inlier_data)
tp = len(np.where(outlier_indices >= n_inliers)[0])
n_true_outliers = len(all_data) - n_inliers

# Guard against division by zero (no outliers present / none predicted).
recall = tp / n_true_outliers if n_true_outliers else 0.0
precision = tp / len(outlier_indices) if len(outlier_indices) else 0.0
print(recall)
print(precision)