-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIdentifier.py
More file actions
63 lines (49 loc) · 2.84 KB
/
Identifier.py
File metadata and controls
63 lines (49 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn import model_selection
class Identifier:
    """Spam / not-spam classifier built on an SVM over the Spambase data set.

    On construction the Spambase CSV is loaded, rows containing missing values
    are dropped, and the frame is reduced to the feature subset listed in
    ``SELECTED_COLUMNS`` (the last entry, ``is_spam``, is the label).
    """

    # Names for every column of the raw file, in file order (57 features + label).
    COLUMN_NAMES = (
        "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our",
        "word_freq_over", "word_freq_remove", "word_freq_internet", "word_freq_order", "word_freq_mail",
        "word_freq_receive", "word_freq_will", "word_freq_people", "word_freq_report", "word_freq_addresses",
        "word_freq_free", "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
        "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money", "word_freq_hp", "word_freq_hpl",
        "word_freq_george", "word_freq_650", "word_freq_lab", "word_freq_labs", "word_freq_telnet",
        "word_freq_857", "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology",
        "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct", "word_freq_cs",
        "word_freq_meeting", "word_freq_original", "word_freq_project", "word_freq_re", "word_freq_edu",
        "word_freq_table", "word_freq_conference", "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!",
        "char_freq_$", "char_freq_#", "capital_run_length_average", "capital_run_length_longest",
        "capital_run_length_total", "is_spam",
    )

    # Columns actually kept for modelling; "is_spam" is the target.
    SELECTED_COLUMNS = (
        "word_freq_all", "word_freq_our", "word_freq_remove", "word_freq_receive",
        "word_freq_credit", "word_freq_000", "word_freq_free",
        "word_freq_email", "word_freq_george", "word_freq_data", "word_freq_415", "word_freq_original",
        "word_freq_edu", "char_freq_;", "char_freq_!", "capital_run_length_average",
        "capital_run_length_longest", "capital_run_length_total", "is_spam",
    )

    # Human-readable labels indexed by the predicted class (0 or 1).
    CLASS_LABELS = ("Not Spam!", "Spam")

    def __init__(self, data_path="Data/spambase.data"):
        """Create the model, the scaler and load the data set.

        Args:
            data_path: Location of the header-less Spambase CSV file.
        """
        self.svc = svm.SVC()
        self.scaler = preprocessing.StandardScaler()
        self.data = self._load_data(data_path)

    @classmethod
    def _load_data(cls, data_path):
        """Read the raw CSV and return the cleaned, column-reduced frame."""
        # The file carries no header row: header=None keeps the first sample
        # as data instead of letting pandas consume it as column names.
        frame = pd.read_csv(data_path, header=None, names=list(cls.COLUMN_NAMES))
        frame = frame.dropna()
        return frame[list(cls.SELECTED_COLUMNS)]

    def train(self, x_train, y_train):
        """Fit the SVM on the given feature matrix and labels."""
        self.svc.fit(x_train, y_train)

    def predict(self, x_list):
        """Classify the samples in ``x_list`` and describe the first one.

        NOTE: no scaling is applied here. If the model was trained on the
        output of ``getLists`` (which is standardized), callers presumably
        need to pass features transformed with ``self.scaler`` as well.

        Returns:
            ``"Spam"`` or ``"Not Spam!"`` for the first sample in ``x_list``.
        """
        predicted = self.svc.predict(x_list)
        # int() guards against a float-typed label, which cannot index a tuple.
        return self.CLASS_LABELS[int(predicted[0])]

    def getLists(self, test_size=0.1, random_state=None):
        """Split the data into standardized train / test sets.

        Args:
            test_size: Fraction of the rows held out for testing.
            random_state: Optional seed for a reproducible split.

        Returns:
            ``(x_train, x_test, y_train, y_test)``; the feature parts are
            standardized with a scaler fitted on the training part only.
        """
        data = self.getData()
        features = data.drop("is_spam", axis=1)
        labels = data["is_spam"]
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )
        # Fit on the training split only so test statistics do not leak in.
        x_train = self.scaler.fit_transform(x_train)
        x_test = self.scaler.transform(x_test)
        return x_train, x_test, y_train, y_test

    def getData(self):
        """Return the cleaned data frame (selected features plus ``is_spam``)."""
        return self.data

    def evaluate_accuracy(self, x_test, y_test):
        """Return the SVM's score (as reported by ``svc.score``) on the given set."""
        return self.svc.score(x_test, y_test)