Skip to content

Commit a80eadb

Browse files
Format code with yapf, black, autopep8 and isort
1 parent 3c4360c commit a80eadb

File tree

2 files changed

+129
-102
lines changed

2 files changed

+129
-102
lines changed

experimentation/general.py

Lines changed: 102 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,10 @@
1212
from math import floor
1313
from os import walk
1414

15-
current = os.path.dirname(os.path.realpath(__file__))
16-
parent = os.path.dirname(current)
17-
sys.path.append(parent)
18-
1915
import numpy as np
2016
import pandas as pd
2117
import yagmail
22-
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
18+
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
2319
from sklearn.model_selection import StratifiedKFold
2420
from sklearn.naive_bayes import GaussianNB
2521
from sklearn.neighbors import KNeighborsClassifier
# Make the project root importable BEFORE pulling in the local packages:
# ``instance_selection`` and ``semisupervised`` live in the parent
# directory, so this path manipulation must stay ABOVE those imports.
# (The auto-formatting commit moved it below them, which breaks the
# imports at runtime -- restored here, with isort told to leave it alone.)
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)

from instance_selection import ENN, LSSm  # noqa: E402  # isort:skip
from semisupervised import DensityPeaks  # noqa: E402  # isort:skip

# Run-scoped configuration: every execution gets its own timestamped
# results CSV and log file.
time_str = time.strftime("%Y%m%d-%H%M%S")
k = 3  # neighbours for KNeighborsClassifier
folds = 10  # StratifiedKFold splits
precision = 0.05  # fraction of training samples that KEEP their label
file_name = "experiments"
csv_results = os.path.join(".", "results", file_name + "_" + time_str + ".csv")
log_file = os.path.join(".", "logs", "_".join([file_name, time_str]) + ".log")

# Create the output directories up front so FileHandler / the results CSV
# do not fail on a fresh checkout.
os.makedirs(os.path.join(".", "results"), exist_ok=True)
os.makedirs(os.path.join(".", "logs"), exist_ok=True)

# Log everything to both the run's log file and stdout.
logging.basicConfig(
    level=logging.DEBUG,
    format=" %(asctime)s :: %(levelname)-8s :: %(message)s",
    handlers=[logging.FileHandler(log_file), logging.StreamHandler(sys.stdout)],
)
4445

4546

4647
def search_datasets(folder):
    """Load every dataset CSV found in *folder*.

    Side effect: (re)creates the results CSV (module-level ``csv_results``)
    and writes its header row, so the per-fold rows appended later by
    ``main`` have column names.

    Parameters
    ----------
    folder : str
        Directory containing the dataset CSV files (no header row expected).

    Returns
    -------
    dict
        Mapping of file name -> ``pandas.DataFrame`` with the raw data.
        Empty when *folder* does not exist (only an error is logged; the
        caller then simply has nothing to iterate).
    """
    if os.path.isdir(folder):
        logging.info(f"Looking up for datasets in {folder}")
    else:
        # Deliberately non-fatal: walk() below yields no files and an
        # empty dict is returned.
        logging.error(f"{folder} does not exist")

    datasets_found = next(walk(folder), (None, None, []))[2]
    datasets_found.sort()
    logging.info(f"Found {len(datasets_found)} - {datasets_found}")

    header = [
        "dataset",
        "percent labeled",
        "fold",
        "base",
        "filter",
        "f1-score",
        "mean squared error",
        "accuracy score",
    ]

    # Make sure the destination directory exists, then write the header.
    # newline="" is required by the csv module to avoid blank rows on
    # Windows; the context manager closes the file (no explicit close()).
    os.makedirs(os.path.dirname(csv_results), exist_ok=True)
    with open(csv_results, "w", newline="") as save:
        csv.writer(save).writerow(header)

    datasets = dict.fromkeys(datasets_found)
    for dataset in datasets_found:
        datasets[dataset] = pd.read_csv(
            os.path.join(folder, dataset), header=None)
    logging.debug("Datasets ready to be used")

    return datasets
7980

8081

8182
def main(datasets):
    """Run the full experimentation grid and append results to the CSV.

    For every (dataset, base classifier, filter method) combination, runs a
    stratified ``folds``-fold cross validation in which a fraction
    ``1 - precision`` of each training split is unlabeled (label set to -1)
    and a semi-supervised ``DensityPeaks.STDPNF`` model is fitted.  One row
    per fold is appended to the module-level ``csv_results`` file.

    Parameters
    ----------
    datasets : dict
        Mapping of dataset name -> DataFrame whose last column is the
        class label (as produced by ``search_datasets``).
    """
    logging.info("Starting main...")
    # Fixed seed (hex-encoded date) so folds are reproducible across runs.
    random_state = 0x24032022
    skf = StratifiedKFold(n_splits=folds, shuffle=True,
                          random_state=random_state)
    # classifiers_params[i] carries the constructor kwargs for classifiers[i]
    # (parallel lists, paired via enumerate below).
    classifiers = [KNeighborsClassifier, DecisionTreeClassifier, GaussianNB]
    classifiers_params = [
        {"n_neighbors": k, "n_jobs": -1},
        {"random_state": random_state},
        {},
    ]
    # "ENANE" is passed as a plain string sentinel; the others are classes
    # from the instance_selection package.
    filters = [ENN, LSSm, "ENANE"]

    for dataset, values in datasets.items():
        logging.info(
            f"\n\nCurrent dataset: {dataset} - Shape: " f"{values.shape}")
        for n_classifier, classifier in enumerate(classifiers):
            classifier_name = classifier.__name__
            for filter_method in filters:
                # Filters may be a class or the "ENANE" string; derive a
                # printable name either way.
                filter_name = (
                    filter_method
                    if isinstance(filter_method, str)
                    else filter_method.__name__
                )
                # Last column is the target, everything before it features.
                samples = values.iloc[:, :-1]
                y = values.iloc[:, -1]
                y_df = pd.DataFrame(y.tolist())
                for fold, (train_index, test_index) in enumerate(skf.split(samples, y)):
                    t_start = time.time()
                    logging.info(
                        f"Dataset: {dataset} -- Classifier: "
                        f"{classifier_name} -- Filter: {filter_name} "
                        f"-- Fold: {fold}"
                    )
                    # Deep copies so the unlabeling below never mutates the
                    # shared `samples`/`y_df` frames across folds.
                    x_train = samples.iloc[train_index, :].copy(deep=True)
                    x_test = samples.iloc[test_index, :].copy(deep=True)
                    y_train = y_df.iloc[train_index, :].copy(deep=True)
                    y_test = y_df.iloc[test_index, :].copy(deep=True)

                    # Randomly hide labels for (1 - precision) of the
                    # training split.  train_index values double as row
                    # labels here because y_df has a default RangeIndex.
                    unlabeled_indexes = np.random.choice(
                        train_index,
                        floor(len(x_train) * (1 - precision)),
                        replace=False,
                    )

                    # NOTE(review): `.at` is documented for single labels;
                    # passing an array relies on a pandas fallback -- confirm
                    # this still works on the pinned pandas version (`.loc`
                    # is the documented accessor for array indexers).
                    y_train.at[unlabeled_indexes] = -1

                    # Project semi-supervised self-training model; its
                    # contract (fit/predict) is defined in the
                    # `semisupervised` package, not visible here.
                    model = DensityPeaks.STDPNF(
                        classifier=classifier,
                        classifier_params=classifiers_params[n_classifier],
                        filtering=True,
                        filter_method=filter_method,
                    )
                    try:
                        model.fit(x_train, y_train)
                        y_pred = model.predict(x_test)
                        f1 = f1_score(y_true=y_test, y_pred=y_pred,
                                      average="weighted")
                        mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
                        acc = accuracy_score(y_true=y_test, y_pred=y_pred)

                        logging.info(
                            f"\tf1: {f1:.2f} -- mse: {mse:.2f} -- acc:" f" {acc:.2f}"
                        )
                    except Exception:
                        # Best-effort: a failed fold is recorded with empty
                        # metric cells so the grid keeps running.
                        f1 = mse = acc = ""
                        logging.exception("Failed")
                    t_end = time.time()
                    logging.info(
                        f"\t\tElapsed: {(t_end - t_start) / 60:.2f} minutes")
                    # Append this fold's row immediately so partial results
                    # survive a crash mid-run.
                    with open(csv_results, "a") as save:
                        w = csv.writer(save)
                        w.writerow(
                            [
                                dataset,
                                precision,
                                fold,
                                classifier_name,
                                filter_name,
                                f1,
                                mse,
                                acc,
                            ]
                        )
166+
167+
168+
if __name__ == "__main__":
    # SECURITY: the original committed a plaintext SMTP password to the
    # repo -- it must be rotated.  Credentials are now read from the
    # environment, falling back to the legacy values so existing runs
    # keep working until the env vars are set.
    mail = os.environ.get("YAGMAIL_USER", "ntoolsecure")
    passwd = os.environ.get("YAGMAIL_PASSWORD", "qfj3nfr_jnt7ATZ8jgh")
    yag = yagmail.SMTP(user=mail, password=passwd)
    t_start_g = time.time()
    try:
        logging.info("--- Starting ---")
        datasets_folder = os.path.join("..", "datasets", "UCI-Experimentation")
        datasets_dfs = search_datasets(datasets_folder)

        main(datasets_dfs)

        logging.info("--- Process completed ---")
        attach = [csv_results, log_file]
        t_end_g = time.time()
        logging.info(f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes")
        # NOTE(review): no `to=` recipient is given, so yagmail sends to
        # the account itself -- confirm that is the intended destination.
        yag.send(
            subject="self_training_validation " "COMPLETED",
            contents="self_training_validation has been completed.\n"
            f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes",
            attachments=attach,
        )
    except Exception:
        # Top-level boundary: anything fatal is logged and emailed with the
        # log attached, then the script ends normally.
        t_end_g = time.time()
        content = (
            f"FATAL ERROR - Check the attached log\n"
            f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes"
        )

        yag.send(
            subject="self_training_validation " "ERROR",
            contents=content,
            attachments=[log_file],
        )
        logging.exception("--- Process has broken ---")
        logging.info(f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes")
    logging.info("Email sent successfully")

experimentation/results.py

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,50 +11,52 @@
1111
import numpy as np
1212
import pandas as pd
1313

14-
if __name__ == '__main__':
14+
if __name__ == "__main__":
1515

16-
mse = 'mean squared error'
17-
pl = 'percent labeled'
16+
mse = "mean squared error"
17+
pl = "percent labeled"
1818

19-
folder = join('.', 'results', '')
20-
ranks_path = 'ranks'
21-
plots = 'plots'
19+
folder = join(".", "results", "")
20+
ranks_path = "ranks"
21+
plots = "plots"
2222
# A list of the percentages of the data that is labeled.
2323
precisions = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35]
2424
percent_precisions = [x * 100 for x in precisions]
25-
metrics = ['f1-score', mse, 'accuracy score']
25+
metrics = ["f1-score", mse, "accuracy score"]
2626
results_found = next(walk(folder), (None, None, []))[2]
2727
if len(results_found) != len(precisions) + 1:
2828
print(
2929
f"This script only works with {len(precisions) + 1} results in the "
30-
f"\'results\' folder.")
30+
f"'results' folder."
31+
)
3132
exit(1)
3233
dfs = []
3334
for index, r in enumerate(results_found):
3435
dfs.append(pd.read_csv(folder + results_found[index]))
3536

3637
df = pd.concat(dfs, ignore_index=True)
37-
df.drop(['fold', 'Unnamed: 0'], axis=1, inplace=True)
38+
df.drop(["fold", "Unnamed: 0"], axis=1, inplace=True)
3839

3940
classifiers = dfs[0].base.unique()
40-
filters = np.append(dfs[0]['filter'].unique(), 'base')
41-
datasets = dfs[0]['dataset'].unique()
41+
filters = np.append(dfs[0]["filter"].unique(), "base")
42+
datasets = dfs[0]["dataset"].unique()
4243

4344
ranks = {}
44-
vals = ['base', 'filter', pl, 'f1-score',
45-
'mean squared error', 'accuracy score']
45+
vals = ["base", "filter", pl, "f1-score",
46+
"mean squared error", "accuracy score"]
4647

4748
means = {}
4849
for classifier in classifiers:
4950
cl = []
5051
for dataset in datasets:
51-
rows = df[df['dataset'] == dataset]
52+
rows = df[df["dataset"] == dataset]
5253
for precision in precisions:
5354
temp = pd.DataFrame(index=filters, columns=metrics)
5455
temp[pl] = precision
5556
p_rows = rows.loc[
56-
(rows['base'] == classifier) & (rows[pl] == precision)]
57-
vals = p_rows.groupby(['filter']).mean()
57+
(rows["base"] == classifier) & (rows[pl] == precision)
58+
]
59+
vals = p_rows.groupby(["filter"]).mean()
5860

5961
for metric in metrics:
6062
dff = vals[metric].to_frame()
@@ -77,13 +79,12 @@
7779
rks[(precision, metric)] = np.ravel(vals.to_numpy())
7880
ranks[classifier] = rks
7981

80-
fig, axs = \
81-
plt.subplots(nrows=3, ncols=3, sharex='all', sharey='all', figsize=(
82-
12, 5))
82+
fig, axs = plt.subplots(
83+
nrows=3, ncols=3, sharex="all", sharey="all", figsize=(12, 5)
84+
)
8385

8486
for (i, classifier), axss in zip(enumerate(classifiers), axs):
85-
df_fin = pd.DataFrame(ranks.get(classifier), index=filters). \
86-
transpose()
87+
df_fin = pd.DataFrame(ranks.get(classifier), index=filters).transpose()
8788

8889
for (j, metric), ax in zip(enumerate(metrics), axss):
8990
vals = []
@@ -98,7 +99,7 @@
9899
ax.set_title(str(metric))
99100

100101
if j == 0:
101-
classifier = classifier.split('Classifier')[0]
102+
classifier = classifier.split("Classifier")[0]
102103
ax.set_ylabel(str(classifier))
103104

104105
# ax = df_f.plot(
@@ -110,14 +111,14 @@
110111
# )
111112
# plt.savefig(fname=join(plots, f'{classifier}_{metric}.png'),
112113
# dpi=300)
113-
df_fin.to_csv(join(ranks_path, f'{classifier}.csv'))
114+
df_fin.to_csv(join(ranks_path, f"{classifier}.csv"))
114115

115116
fig.legend(
116117
labels=filters,
117118
loc="center right",
118119
)
119120
plt.subplots_adjust(right=0.9)
120121

121-
plt.savefig(fname=join(plots, 'General.png'), dpi=300)
122-
df.to_csv(join(ranks_path, 'results.csv'), index=False)
123-
print('Plots generated and its CSV')
122+
plt.savefig(fname=join(plots, "General.png"), dpi=300)
123+
df.to_csv(join(ranks_path, "results.csv"), index=False)
124+
print("Plots generated and its CSV")

0 commit comments

Comments
 (0)