# model_testing.py
# (Removed: GitHub page chrome and line-number gutter residue from a web scrape —
#  "Notifications", "Expand file tree", and the bare numbers 1-168 are not code
#  and make the file invalid Python.)
import sklearn as sk
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# BUG FIX: was `import matplotlib as plt`. The conventional `plt` alias refers to
# the pyplot submodule; `matplotlib` itself does not expose the plotting API, so
# any later `plt.plot(...)` would fail with AttributeError.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the five cross-validation folds.
# Each entry is [train_df, fold_number, holdout_df].
# NOTE(review): the train files are read with index_col=0 but the holdout files
# are not — presumably intentional (holdout keeps its first column as data);
# confirm against the CSV layout.
data1 = [pd.read_csv('./csv_files/train1.csv', index_col=0), 1, pd.read_csv('./csv_files/holdout1.csv')]
data2 = [pd.read_csv('./csv_files/train2.csv', index_col=0), 2, pd.read_csv('./csv_files/holdout2.csv')]
data3 = [pd.read_csv('./csv_files/train3.csv', index_col=0), 3, pd.read_csv('./csv_files/holdout3.csv')]
data4 = [pd.read_csv('./csv_files/train4.csv', index_col=0), 4, pd.read_csv('./csv_files/holdout4.csv')]
data5 = [pd.read_csv('./csv_files/train5.csv', index_col=0), 5, pd.read_csv('./csv_files/holdout5.csv')]
# Getting baseline accuracy: the class proportions in each holdout set.
# FIX: the original evaluated these as bare expressions (results are discarded
# when run as a script), and fold 1 computed `1 - proportion_td` while folds 2-5
# computed `proportion_td`. Print both proportions, consistently, for every fold.
for _fold in (data1, data2, data3, data4, data5):
    _holdout = _fold[2]
    _n_td = _holdout[_holdout["Diagnosis"] == 0].shape[0]
    _n_sz = _holdout[_holdout["Diagnosis"] == 1].shape[0]
    _prop_td = _n_td / (_n_td + _n_sz)
    print(f"fold {_fold[1]}: proportion td = {_prop_td:.3f}, proportion sz = {1 - _prop_td:.3f}")
################## Creating a loop, so I won't have to do the below code 5 times ##################
# Collect the per-fold datasets so the modelling code below runs once per fold.
datasets = [data1, data2, data3, data4, data5]
# Pre-allocated containers for the per-fold results.
classif_reports = ["", "", "", "", ""]
conf_mtxs = []
# Linear SVM; class_weight='balanced' compensates for unequal class sizes.
model = SVC(kernel='linear', class_weight='balanced')
# One row per (predictor, fold) coefficient, filled in inside the loop.
SVM_coef = pd.DataFrame(columns=['predictor_name', 'coef', 'fold'])
# Seed the predictions table with the first fold's holdout metadata
# (ID, sex, true diagnosis); per-fold prediction columns are added in the loop.
first_holdout = datasets[0][2]
predictions = pd.DataFrame({
    'ID': first_holdout.loc[:, 'ID'],
    'sex': first_holdout.loc[:, 'Gender'],
    'diagnosis_real': first_holdout.loc[:, 'Diagnosis'],
})
# Fit one SVM per fold, predict its holdout set, and save coefficients,
# classification report and confusion matrix for each fold.
# (Indentation reconstructed — the scraped source had lost it; the per-fold
# saving uses `n`, so it belongs inside the loop.)
for train, n, holdout in datasets:
    # Split train/holdout into predictors (columns 4+) and the Diagnosis target,
    # indexed by ID so predictions can later be mapped back to participants/genders.
    x = train.iloc[:, 4:]
    y = train.loc[:, ['ID', 'Diagnosis']].set_index('ID')
    x_holdout = holdout.iloc[:, 4:]
    y_holdout = holdout.loc[:, ['ID', 'Diagnosis']].set_index('ID')
    # Fit the data onto the model. FIX: pass a 1-D target (.values.ravel()) —
    # fitting on a (n, 1) DataFrame triggers sklearn's DataConversionWarning.
    model.fit(x, y.values.ravel())
    # Predict the holdout set (on the basis of predictor variables), using the fitted model.
    y_predictions = model.predict(x_holdout)
    print(y_predictions)  # progress/debug output
    # Add this fold's predictions to the dataframe under a unique column name.
    new_col_name = "".join(["diagnosis_predic_", str(n)])
    predictions[new_col_name] = y_predictions
    # Record the linear-SVM coefficients with their predictor names and fold number.
    coefs = model.coef_[0]
    coefs_names = np.asarray(list(x_holdout))
    fold = np.repeat(n, len(coefs))
    # FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported equivalent with identical result.
    SVM_coef = pd.concat(
        [SVM_coef, pd.DataFrame({'predictor_name': coefs_names, 'coef': coefs, 'fold': fold})],
        ignore_index=True,
    )
    # Per-fold evaluation.
    report = classification_report(y_holdout, y_predictions, output_dict=True)
    classif_reports[n - 1] = report
    matrixx = confusion_matrix(y_holdout, y_predictions)
    conf_mtxs.append(matrixx)
    # Saving data: file names for this fold. The coefficient file is cumulative
    # and rewritten each iteration, so the final write holds all folds.
    # (Removed a stray no-op `classif_reports[4]` inspection expression.)
    svm_coef_name = "".join(['./performance/holdout/', "coefs_all_models", ".csv"])
    conf_matrix_name = "".join(['./performance/holdout/models/', "confusion_matrix", str(n), ".csv"])
    classification_report_name = "".join(['./performance/holdout/models/', "classification_report", str(n), ".csv"])
    # Convert this fold's report dict / matrix to DataFrames for writing.
    classif_report_fold = pd.DataFrame(classif_reports[n - 1])
    conf_matrix_fold = pd.DataFrame(conf_mtxs[n - 1])
    # Label the confusion-matrix rows and columns ('td'/'sz' per the project's
    # class labels; exact meaning not established here — presumably the two
    # diagnosis groups).
    conf_matrix_fold.columns = ['predict_td', 'predict_sz']
    conf_matrix_fold.index = ['true_td', 'true_sz']
    # Write coefficients, confusion matrix and classification report to .csv.
    SVM_coef.to_csv(svm_coef_name, sep=',', index=True)
    conf_matrix_fold.to_csv(conf_matrix_name, sep=',', index=True)
    classif_report_fold.to_csv(classification_report_name, sep=',', index=True)
# The end
# The end
# Fixing "predictions" dataframe
count_of_1_predictions = predictions.iloc[:,-5:].apply(pd.Series.value_counts, axis=1)[1].fillna(0)
predictions['diagnosis_predic_ensemble'] = [0 if x < 3 else 1 for x in count_of_1_predictions] # For each row, get a count of 1's and 0's in the new columns
predcitions.to_csv('performance/holdout/ensemble/performance.csv', sep = ',', index = True)
# Performance of the ensemble: overall, then per sex. The original repeated the
# same four statements three times; one data-driven loop keeps them consistent.
predictions_female = predictions[predictions['sex'] == 'F']
predictions_male = predictions[predictions['sex'] == 'M']
_ensemble_outputs = [
    # (subset, classification-report path, confusion-matrix path)
    (predictions,
     './performance/holdout/ensemble/classification_report.csv',
     './performance/holdout/ensemble/confusion_matrix.csv'),
    (predictions_female,
     './performance/holdout/ensemble/sex/female_classification_report.csv',
     './performance/holdout/ensemble/sex/female_confusion_matrix.csv'),
    (predictions_male,
     './performance/holdout/ensemble/sex/male_classification_report.csv',
     './performance/holdout/ensemble/sex/male_confusion_matrix.csv'),
]
for _subset, _report_path, _matrix_path in _ensemble_outputs:
    # Compare true diagnosis against the majority-vote ensemble prediction.
    _report = pd.DataFrame(
        classification_report(_subset['diagnosis_real'], _subset['diagnosis_predic_ensemble'], output_dict=True))
    _matrix = pd.DataFrame(
        confusion_matrix(_subset['diagnosis_real'], _subset['diagnosis_predic_ensemble']))
    _report.to_csv(_report_path, sep=",", index=True)
    _matrix.to_csv(_matrix_path, sep=",", index=True)
# Full per-fold + ensemble prediction table.
predictions.to_csv('./performance/holdout/all_predictions_holdout.csv')
####################### Results #######################
# Read the saved result files back in for inspection.
# FIX: the original evaluated bare pd.read_csv(...) expressions, whose results
# are discarded when run as a script; print each file so the output is visible.
# Predictions
performance = pd.read_csv("performance/holdout/ensemble/performance.csv")
print(performance)
_result_files = [
    # Ensemble all
    "performance/holdout/ensemble/confusion_matrix.csv",
    "performance/holdout/ensemble/classification_report.csv",
    # Ensemble female
    "performance/holdout/ensemble/sex/female_confusion_matrix.csv",
    "performance/holdout/ensemble/sex/female_classification_report.csv",
    # Ensemble male
    "performance/holdout/ensemble/sex/male_confusion_matrix.csv",
    "performance/holdout/ensemble/sex/male_classification_report.csv",
    # Submodels: coefs, then per-fold reports and matrices (folds 1-5)
    "performance/holdout/coefs_all_models.csv",
    "performance/holdout/models/classification_report1.csv",
    "performance/holdout/models/confusion_matrix1.csv",
    "performance/holdout/models/classification_report2.csv",
    "performance/holdout/models/confusion_matrix2.csv",
    "performance/holdout/models/classification_report3.csv",
    "performance/holdout/models/confusion_matrix3.csv",
    "performance/holdout/models/classification_report4.csv",
    "performance/holdout/models/confusion_matrix4.csv",
    "performance/holdout/models/classification_report5.csv",
    "performance/holdout/models/confusion_matrix5.csv",
]
for _path in _result_files:
    print(_path)
    print(pd.read_csv(_path))
# Calculating baseline accuracies for the two sexes from the ensemble confusion matrices.
# BUG FIX: the original referenced undefined names `female` and `male` (NameError),
# and discarded the results. Load the saved per-sex confusion matrices first and
# print the baselines. When read back, column 0 holds the saved index
# ('true_td'/'true_sz'), so the counts sit in iloc[:, 1:3]; the baseline is
# 1 - (true-sz count / total), i.e. the proportion of the 'td' class.
female = pd.read_csv("performance/holdout/ensemble/sex/female_confusion_matrix.csv")
male = pd.read_csv("performance/holdout/ensemble/sex/male_confusion_matrix.csv")
for _label, _cm in (("female", female), ("male", male)):
    _sz_total = _cm.iloc[1, 1] + _cm.iloc[1, 2]
    _td_total = _cm.iloc[0, 1] + _cm.iloc[0, 2]
    print(f"{_label} baseline accuracy: {1 - _sz_total / (_sz_total + _td_total):.3f}")