-
Notifications
You must be signed in to change notification settings - Fork 38
Exact MeLiF #56
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Exact MeLiF #56
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,4 +2,5 @@ | |
| ITMO_FS.egg-info | ||
| dist | ||
| build | ||
| common | ||
| common | ||
| *.DS_Store | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,267 @@ | ||
| import numpy as np | ||
|
|
||
| from frozenlist import FrozenList | ||
| from scipy.spatial import Delaunay | ||
| from sortedcontainers import SortedSet | ||
| from functools import partial, cmp_to_key | ||
| from sklearn.model_selection import cross_val_score | ||
|
|
||
|
|
||
class ExactMelif:
    """Feature selector that searches linear combinations of filter measures.

    Every feature is described by its vector of measure scores (a "plane").
    The planes are normalised, pre-filtered around the ``kappa``-th best
    feature of every measure, intersected pairwise and triangulated.  The
    centroid of each resulting simplex is used as a weight vector for the
    measures; the ``kappa`` best features under that weighted measure are
    evaluated with cross-validation and the best subset is kept.

    Parameters
    ----------
    measures : sequence of callables
        Each callable is invoked as ``measure(X, y)`` and must return one
        score per feature (column of ``X``).
    kappa : int
        Number of features to select; must satisfy
        ``1 <= kappa <= n_features`` when ``fit`` is called.
    estimator : object
        Estimator exposing ``fit``/``predict``; it is also handed to
        ``sklearn.model_selection.cross_val_score``.
    score : str or callable
        Passed unchanged as ``scoring`` to ``cross_val_score``.
    """

    _K_FOLD = 5
    _RANDOM_STATE = 42
    # Determinants smaller than this are treated as zero (parallel lines).
    _EPS = 1e-9

    def __init__(self, measures, kappa, estimator, score):
        self._measures = measures
        self._kappa = kappa
        self._estimator = estimator
        self._score = score

    def fit(self, X, y):
        """Select the best feature subset and fit the estimator on it.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_objects, n_features)
        y : array-like of length n_objects

        Returns
        -------
        self : ExactMelif
            Returned for call chaining; earlier callers that ignored the
            (previously ``None``) return value are unaffected.

        Raises
        ------
        ValueError
            If ``kappa`` is not in ``[1, n_features]``.
        """
        planes = self._get_planes(X, y)
        planes = self._normalize(planes)
        planes = self._kappa_filter(planes)

        intersections = self._get_intersections(planes)
        edges = self._get_edges(planes, intersections)

        self._best_feature_indices = self._get_best_feature_indices(edges, X, y)
        self._estimator.fit(self._best_sub_X(X), y)
        return self

    def predict(self, X):
        """Predict with the fitted estimator on the selected columns of X."""
        return self._estimator.predict(self._best_sub_X(X))

    def _get_planes(self, X, y):
        """Return an (n_features, n_measures) matrix of measure scores."""
        n_features = X.shape[1]
        planes = np.empty((n_features, len(self._measures)))

        for j, measure in enumerate(self._measures):
            # One column per measure, filled with a single array assignment.
            planes[:, j] = measure(X, y)

        return planes

    @staticmethod
    def _normalize(planes):
        """Min-max scale all scores jointly into [0, 1].

        The minimum and maximum are taken over the whole matrix, not per
        column.  When every score is identical the span is zero; a zero
        matrix is returned instead of dividing by zero.
        """
        planes = np.asarray(planes, dtype=float)
        minimum = planes.min()
        min_max_diff = planes.max() - minimum

        if min_max_diff == 0:
            return np.zeros(planes.shape)

        return (planes - minimum) / min_max_diff

    def _kappa_filter(self, planes):
        """Keep only planes lying between the kappa-th best features.

        For every measure the feature ranked ``kappa``-th from the top is
        recorded.  Then, for every measure, all features whose rank lies
        between the lowest- and highest-ranked recorded feature (inclusive)
        are kept.  Rows are returned in ascending order of the last measure.

        Raises
        ------
        ValueError
            If ``kappa`` is not in ``[1, n_features]``.
        """
        planes = np.asarray(planes)
        n_features = planes.shape[0]
        n_measures = len(self._measures)

        if not 1 <= self._kappa <= n_features:
            raise ValueError(
                f'kappa must be in [1, {n_features}], got {self._kappa}'
            )

        # Each ordering is a stable sort of the previous one, so ties are
        # broken by the ranking under the preceding measure.  The orderings
        # are stored so that the second pass does not have to sort again.
        order = np.arange(n_features)
        orders = []
        kappa_indices = set()
        for m in range(n_measures):
            order = order[np.argsort(planes[order, m], kind='stable')]
            orders.append(order)
            kappa_indices.add(int(order[-self._kappa]))

        kappa_array = np.fromiter(kappa_indices, dtype=int)
        keep = np.zeros(n_features, dtype=bool)
        for measure_order in orders:
            hits = np.flatnonzero(np.isin(measure_order, kappa_array))
            keep[measure_order[hits[0]:hits[-1] + 1]] = True

        return planes[order[keep[order]]]

    def _get_intersections(self, planes):
        """Compute pairwise intersection points of the planes.

        For planes ``i`` and ``j`` and every pair of axes ``(k, m)`` the two
        lines ``x/a + y/b = 1`` and ``x/c + y/d = 1`` (``a, b`` / ``c, d``
        being the planes' coordinates on those axes) are intersected.  Points
        outside the unit square and (near-)parallel pairs are skipped.

        Returns
        -------
        numpy.ndarray of shape (n_planes, n_planes), dtype=object
            Symmetric; cell ``[i][j]`` holds a list of points (each a list of
            ``dim`` floats).  Diagonal cells hold empty lists.
        """
        n_planes, dim = planes.shape
        intersections = np.empty((n_planes, n_planes), dtype=object)

        for i in range(n_planes):
            intersections[i][i] = []

            for j in range(i + 1, n_planes):
                plane_i = planes[i]
                plane_j = planes[j]

                # Exact duplicates are dropped by the set; the comparator
                # only determines the iteration order of the points.
                found = SortedSet(key=cmp_to_key(_double_list_cmp))
                for k in range(dim):
                    for m in range(k + 1, dim):
                        a, b = plane_i[k], plane_i[m]
                        c, d = plane_j[k], plane_j[m]

                        det = a * d - b * c
                        if abs(det) < self._EPS:
                            continue

                        coord_k = a * c * (d - b) / det
                        coord_m = b * d * (c - a) / -det

                        if coord_k < 0 or coord_k > 1 \
                                or coord_m < 0 or coord_m > 1:
                            continue

                        point = np.zeros(dim)
                        point[k] = coord_k
                        point[m] = coord_m
                        # Tuples are hashable, which the set requires.
                        found.add(tuple(point))

                points = [list(point) for point in found]
                intersections[i][j] = points
                intersections[j][i] = points

        return intersections

    def _get_edges(self, planes, intersections):
        """Triangulate every plane into simplices.

        The vertices of plane ``i`` are its axis intercepts (the rows of
        ``diag(planes[i])``) plus all its intersection points with the other
        planes.  Exactly three vertices form a single simplex; otherwise the
        vertices are passed to a Delaunay triangulation.

        Returns
        -------
        list of numpy.ndarray
            Each entry is an array of simplex vertices, one vertex per row.
        """
        n_planes = planes.shape[0]
        edges = []

        for i in range(n_planes):
            plane_points = list(np.diag(planes[i]).astype(float))

            for j in range(n_planes):
                if i != j:
                    plane_points.extend(intersections[i][j])

            plane_points = np.array(plane_points)
            if len(plane_points) == 3:
                edges.append(plane_points)
            else:
                triangulation = Delaunay(plane_points, qhull_options='QJ Pp')
                edges.extend(plane_points[triangulation.simplices])

        return edges

    def _get_best_feature_indices(self, edges, X, y):
        """Return the feature subset with the best cross-validated quality.

        Each simplex yields a candidate subset; subsets already evaluated
        are skipped.  On equal quality the first subset encountered wins.
        Returns ``None`` when ``edges`` is empty.
        """
        best_feature_indices = None
        best_quality = None
        seen = set()

        for edge in edges:
            master_measure = self._build_master_measure(edge)
            feature_indices, filtered_X = self._filter_X(master_measure, X, y)

            if feature_indices in seen:
                continue

            seen.add(feature_indices)
            quality = self._get_quality(filtered_X, y)

            if best_quality is None or best_quality < quality:
                best_feature_indices = feature_indices
                best_quality = quality

        return best_feature_indices

    def _build_master_measure(self, edge):
        """Build the weighted measure for the centroid of a simplex.

        Parameters
        ----------
        edge : numpy.ndarray of shape (n_points, n_measures)

        Returns
        -------
        callable
            ``master(X, y)`` returning, per feature, the sum of every
            measure's score weighted by the matching centroid coordinate.
        """
        alphas = edge.mean(axis=0)
        measures = self._measures

        def master(X, y):
            result = np.zeros(X.shape[1])
            for alpha, measure in zip(alphas, measures):
                result += alpha * np.asarray(measure(X, y), dtype=float)
            return result

        return master

    def _filter_X(self, master_measure, X, y):
        """Select the ``kappa`` highest-scoring features.

        Returns
        -------
        feature_indices : frozenset of int
        filtered_X : numpy.ndarray
            The selected columns of ``X`` in ascending column order.
        """
        n_features = np.shape(X)[1]
        scores = np.asarray(master_measure(X, y))

        # Stable ascending sort: the last kappa entries are the best ones.
        order = np.argsort(scores, kind='stable')
        # Clamp so that an oversized kappa selects every feature instead of
        # wrapping around with negative indices.
        start = max(n_features - self._kappa, 0)
        feature_indices = frozenset(int(i) for i in order[start:])

        return feature_indices, self._sub_X(feature_indices, X)

    def _get_quality(self, X, y):
        """Return the mean ``_K_FOLD``-fold cross-validation score on X."""
        return np.mean(cross_val_score(self._estimator, X, y, scoring=self._score, cv=self._K_FOLD))

    @staticmethod
    def _sub_X(feature_indices, X):
        """Return the columns of X listed in ``feature_indices``.

        Columns are returned in ascending index order; indices that do not
        exist in ``X`` are ignored.
        """
        X = np.asarray(X)
        columns = [i for i in range(X.shape[1]) if i in feature_indices]
        return X[:, columns]

    def _best_sub_X(self, X):
        """Return the columns of X chosen by the last call to ``fit``."""
        return self._sub_X(self._best_feature_indices, X)
|
|
||
|
|
||
def _double_list_cmp(a_list, b_list):
    """Compare two sequences of floats element by element.

    Walks ``a_list`` from the front and returns the result of
    ``_double_cmp`` for the first position where the elements differ;
    returns 0 when no position of ``a_list`` differs from ``b_list``.
    """
    for position, a_value in enumerate(a_list):
        outcome = _double_cmp(a_value, b_list[position])
        if outcome:
            return outcome
    return 0
|
|
||
|
|
||
| def _double_cmp(a, b): | ||
| if abs(a - b) < 1e-9: | ||
| return 0 | ||
| if a > b: | ||
| return 1 | ||
| return -1 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| import unittest | ||
|
|
||
| from sklearn.svm import SVC | ||
| from sklearn.metrics import f1_score | ||
| from sklearn.datasets import make_classification | ||
| from sklearn.model_selection import train_test_split | ||
| from ITMO_FS.ensembles import WeightBased | ||
| from ITMO_FS.hybrid.Melif import Melif | ||
| from ITMO_FS.filters import * | ||
|
|
||
| from cpu_melif import ExactMelif | ||
|
|
||
|
|
||
class ExactMelifTest(unittest.TestCase):
    """Runs MeLiF and Exact MeLiF on the same synthetic data set."""

    def test_compare(self):
        """Fit both selectors on one split and report their F1 scores.

        The data set and the train/test split are seeded so that repeated
        runs produce the same numbers.
        """
        random_state = 42
        measures = [pearson_corr, spearman_corr, anova]
        kappa = 5

        univariate_filters = list(map(lambda m: UnivariateFilter(m), measures))

        estimator = SVC(random_state=random_state)
        ensemble = WeightBased(univariate_filters)

        melif = Melif(ensemble, f1_score)
        exact_melif = ExactMelif(measures, kappa, estimator, 'f1_macro')

        X, y = make_classification(n_samples=100, n_classes=2, n_features=30, n_informative=kappa,
                                   random_state=random_state)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

        melif.fit(X_train, y_train, estimator, select_k_best(kappa))
        melif_score = f1_score(y_test, melif.predict(X_test))
        print(f'MeLiF: {melif_score}')

        exact_melif.fit(X_train, y_train)
        exact_melif_score = f1_score(y_test, exact_melif.predict(X_test))
        print(f'Exact MeLiF: {exact_melif_score}')

        # Sanity checks: both pipelines must produce a valid F1 score.
        self.assertTrue(0.0 <= melif_score <= 1.0)
        self.assertTrue(0.0 <= exact_melif_score <= 1.0)
|
|
||
| # def test_simple(self): | ||
| # def measure_1(f, _): | ||
| # return {1: .1, 11: .2, 21: .3, 31: .5, 41: .4, 51: .8, 61: .6, 71: .9, 81: .95, 91: 1}[f[0]] | ||
| # | ||
| # def measure_2(f, _): | ||
| # return {1: .2, 11: .3, 21: .1, 31: .5, 41: .8, 51: .4, 61: .6, 71: .9, 81: 1, 91: .95}[f[0]] | ||
| # | ||
| # def measure_3(f, _): | ||
| # return {1: .3, 11: .1, 21: .2, 31: .8, 41: .6, 51: .4, 61: .6, 71: .1, 81: .9, 91: .95}[f[0]] | ||
| # | ||
| # measures = [measure_1, measure_2, measure_3] | ||
| # kappa = 4 | ||
| # classifier = _MockClassifier(lambda X: []) | ||
| # quality = 'f1-macro' | ||
| # X = np.array([ | ||
| # [1, 11, 21, 31, 41, 51, 61, 71, 81, 91], | ||
| # [2, 12, 22, 32, 42, 52, 62, 72, 82, 92], | ||
| # [3, 13, 23, 33, 43, 53, 63, 73, 83, 93], | ||
| # [4, 14, 24, 34, 44, 54, 64, 74, 84, 94], | ||
| # [5, 15, 25, 35, 45, 55, 65, 75, 85, 95], | ||
| # [6, 16, 26, 36, 46, 56, 66, 76, 86, 96], | ||
| # [7, 17, 27, 37, 47, 57, 67, 77, 87, 97], | ||
| # [8, 18, 28, 38, 48, 58, 68, 78, 88, 98], | ||
| # [9, 19, 29, 39, 49, 59, 69, 79, 89, 99], | ||
| # [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] | ||
| # ]) | ||
| # y = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] | ||
| # | ||
| # melif = ExactMelif(measures, kappa, classifier, quality) | ||
| # melif.select([], []) | ||
|
|
||
|
|
||
| class _MockClassifier: | ||
| def __init__(self, predict_f): | ||
| self._predict_f = predict_f | ||
|
|
||
| def fit(self, X, y): | ||
| pass | ||
|
|
||
| def predict(self, X): | ||
| return self._predict_f(X) | ||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| unittest.main() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| # Kernel SVM with automatic kernel selection, from sklearn | ||
| # 5 datasets from ITMO_FS (Modelon) | ||
| # F1 score (macro) of the classifier | ||
| # 3- or 5-fold cross-validation | ||
| # 5-11 measures from ITMO_FS (Spearman, etc.) | ||
| # kappa=(10, 20, 50, 100), (5%, 10%, 15%) | ||
| # broken stick rule | ||
| from ITMO_FS.filters import * |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The functions need documentation that explains their arguments and types.