from math import floor
from os import walk

# NOTE: sys.path must be extended BEFORE the local-package imports below
# (instance_selection, semisupervised) or they fail to resolve when the
# parent directory is not already on the path.  An import-sorting pass had
# moved this block after those imports, which broke them; keep it here.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)

import numpy as np
import pandas as pd
import yagmail
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from instance_selection import ENN, LSSm
from semisupervised import DensityPeaks

# Experiment-wide configuration.
time_str = time.strftime("%Y%m%d-%H%M%S")  # timestamp used in output file names
k = 3  # neighbors for KNeighborsClassifier
folds = 10  # StratifiedKFold splits
precision = 0.05  # fraction of training samples that stay labeled
file_name = "experiments"
csv_results = os.path.join(".", "results", file_name + "_" + time_str + ".csv")
log_file = os.path.join(".", "logs", "_".join([file_name, time_str]) + ".log")

# Log everything both to a per-run file and to stdout.
logging.basicConfig(
    level=logging.DEBUG,
    format=" %(asctime)s :: %(levelname)-8s :: %(message)s",
    handlers=[logging.FileHandler(log_file), logging.StreamHandler(sys.stdout)],
)
44 | 45 |
|
45 | 46 |
|
def search_datasets(folder):
    """Locate dataset files in *folder* and load each one into a DataFrame.

    As a side effect, (re)creates the global results CSV ``csv_results``
    and writes its header row.

    Parameters
    ----------
    folder : str
        Directory scanned (non-recursively) for dataset files.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Mapping of file name -> DataFrame loaded with ``header=None``.
    """
    if os.path.isdir(folder):
        logging.info(f"Looking up for datasets in {folder}")
    else:
        logging.error(f"{folder} does not exist")

    # next(walk(...), (None, None, []))[2] yields the file names, or []
    # when the directory is missing, so the function degrades gracefully.
    datasets_found = next(walk(folder), (None, None, []))[2]
    datasets_found.sort()
    logging.info(f"Found {len(datasets_found)} - {datasets_found}")

    header = [
        "dataset",
        "percent labeled",
        "fold",
        "base",
        "filter",
        "f1-score",
        "mean squared error",
        "accuracy score",
    ]

    # newline="" is required by the csv module to avoid spurious blank
    # lines on Windows; the context manager closes the file, so the
    # previous explicit close() was redundant.
    with open(csv_results, "w", newline="") as save:
        csv.writer(save).writerow(header)

    datasets = dict.fromkeys(datasets_found)
    for dataset in datasets_found:
        datasets[dataset] = pd.read_csv(os.path.join(folder, dataset), header=None)
    logging.debug("Datasets ready to be used")

    return datasets
79 | 80 |
|
80 | 81 |
|
def main(datasets):
    """Run the semi-supervised experiment grid and append rows to csv_results.

    For every (dataset, classifier, filter, fold) combination: hide all but
    ``precision`` of the training labels, fit a DensityPeaks.STDPNF model,
    and record f1 / mse / accuracy (empty strings on failure).

    Parameters
    ----------
    datasets : dict[str, pandas.DataFrame]
        Mapping of dataset name -> DataFrame whose last column is the label.
    """
    logging.info("Starting main...")
    random_state = 0x24032022  # fixed seed for the CV splitter and trees
    skf = StratifiedKFold(n_splits=folds, shuffle=True,
                          random_state=random_state)
    classifiers = [KNeighborsClassifier, DecisionTreeClassifier, GaussianNB]
    # Positionally aligned with `classifiers` (indexed by n_classifier below).
    classifiers_params = [
        {"n_neighbors": k, "n_jobs": -1},
        {"random_state": random_state},
        {},
    ]
    # Filters are either classes or the string "ENANE" (handled by STDPNF).
    filters = [ENN, LSSm, "ENANE"]

    for dataset, values in datasets.items():
        logging.info(
            f"\n\nCurrent dataset: {dataset} - Shape: " f"{values.shape}")
        for n_classifier, classifier in enumerate(classifiers):
            classifier_name = classifier.__name__
            for filter_method in filters:
                filter_name = (
                    filter_method
                    if isinstance(filter_method, str)
                    else filter_method.__name__
                )
                # Last column is the target; everything else is a feature.
                samples = values.iloc[:, :-1]
                y = values.iloc[:, -1]
                y_df = pd.DataFrame(y.tolist())
                for fold, (train_index, test_index) in enumerate(skf.split(samples, y)):
                    t_start = time.time()
                    logging.info(
                        f"Dataset: {dataset} -- Classifier: "
                        f"{classifier_name} -- Filter: {filter_name} "
                        f"-- Fold: {fold}"
                    )
                    # Deep copies so the -1 relabeling below cannot leak
                    # back into `values` across iterations.
                    x_train = samples.iloc[train_index, :].copy(deep=True)
                    x_test = samples.iloc[test_index, :].copy(deep=True)
                    y_train = y_df.iloc[train_index, :].copy(deep=True)
                    y_test = y_df.iloc[test_index, :].copy(deep=True)

                    # NOTE(review): np.random.choice is not seeded, so the
                    # unlabeled subset differs between runs even though the
                    # CV splitter is seeded — confirm if reproducibility of
                    # this step is required.
                    unlabeled_indexes = np.random.choice(
                        train_index,
                        floor(len(x_train) * (1 - precision)),
                        replace=False,
                    )

                    # Mark the chosen rows as unlabeled (-1 is presumably
                    # the "no label" sentinel expected by STDPNF — TODO
                    # confirm).  NOTE(review): `.at` is documented for
                    # single-label access; an array indexer here relies on
                    # pandas accepting it — verify against the pandas
                    # version in use.
                    y_train.at[unlabeled_indexes] = -1

                    model = DensityPeaks.STDPNF(
                        classifier=classifier,
                        classifier_params=classifiers_params[n_classifier],
                        filtering=True,
                        filter_method=filter_method,
                    )
                    try:
                        model.fit(x_train, y_train)
                        y_pred = model.predict(x_test)
                        f1 = f1_score(y_true=y_test, y_pred=y_pred,
                                      average="weighted")
                        mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
                        acc = accuracy_score(y_true=y_test, y_pred=y_pred)

                        logging.info(
                            f"\tf1: {f1:.2f} -- mse: {mse:.2f} -- acc:" f" {acc:.2f}"
                        )
                    except Exception:
                        # Best-effort: record empty metrics and keep the
                        # grid running; the traceback goes to the log.
                        f1 = mse = acc = ""
                        logging.exception("Failed")
                    t_end = time.time()
                    logging.info(
                        f"\t\tElapsed: {(t_end - t_start) / 60:.2f} minutes")
                    # Append one result row per fold so partial progress
                    # survives a crash.
                    with open(csv_results, "a") as save:
                        w = csv.writer(save)
                        w.writerow(
                            [
                                dataset,
                                precision,
                                fold,
                                classifier_name,
                                filter_name,
                                f1,
                                mse,
                                acc,
                            ]
                        )
| 166 | + |
| 167 | + |
if __name__ == "__main__":
    # SECURITY NOTE(review): SMTP credentials are hardcoded in source.
    # Move them to environment variables / a secrets store and rotate
    # this password — it is now public in version control.
    mail = "ntoolsecure"
    passwd = "qfj3nfr_jnt7ATZ8jgh"
    yag = yagmail.SMTP(user=mail, password=passwd)
    t_start_g = time.time()
    try:
        logging.info("--- Starting ---")
        datasets_folder = os.path.join("..", "datasets", "UCI-Experimentation")
        datasets_dfs = search_datasets(datasets_folder)

        main(datasets_dfs)

        logging.info("--- Process completed ---")
        attach = [csv_results, log_file]
        t_end_g = time.time()
        logging.info(f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes")
        # NOTE(review): no `to=` recipient is passed (one appears to have
        # been removed here), so yagmail presumably delivers to the
        # sending account itself — confirm this is intended.
        yag.send(

            subject="self_training_validation " "COMPLETED",
            contents="self_training_validation has been completed.\n"
            f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes",
            attachments=attach,
        )
    except Exception as e:  # NOTE(review): `e` is unused; logging.exception
        # below already captures the traceback.
        t_end_g = time.time()
        content = (
            f"FATAL ERROR - Check the attached log\n"
            f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes"
        )

        # NOTE(review): if this send itself raises (e.g. network down),
        # the failure below would go unlogged — consider wrapping it.
        yag.send(

            subject="self_training_validation " "ERROR",
            contents=content,
            attachments=[log_file],
        )
        logging.exception("--- Process has broken ---")
        logging.info(f"Elapsed: {(t_end_g - t_start_g) / 60:.2f} minutes")
    logging.info("Email sent successfully")
0 commit comments