Skip to content

Commit e6a7ec6

Browse files
committed
ENH: Implemented performance speedup for binary ReliefF + bug fixes
1 parent 1679885 commit e6a7ec6

File tree

6 files changed

+622
-166
lines changed

6 files changed

+622
-166
lines changed

performance_tests.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar
2+
from sklearn.pipeline import make_pipeline
3+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
4+
from sklearn.impute import SimpleImputer
5+
from sklearn.model_selection import cross_val_score
6+
import pandas as pd
7+
import numpy as np
8+
import warnings
9+
10+
import timeit
11+
12+
# Silence all warnings for the duration of the benchmark run.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG before any DataFrame.sample call below.
np.random.seed(3249083)


def _load_quarter_sample(path, **read_kwargs):
    """Load a tab-separated dataset and return a random 25% of its rows.

    The label column is renamed from 'Class' to 'class' (a no-op when the
    file has no 'Class' column). Extra keyword arguments are forwarded to
    ``pd.read_csv``.
    """
    frame = pd.read_csv(path, sep='\t', **read_kwargs)
    frame = frame.rename(columns={'Class': 'class'})
    return frame.sample(frac=0.25)


# Binary endpoint, discrete features. Loaded in full: the 25% subsampling
# applied to the other datasets is left commented out for this one.
genetic_data = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
    sep='\t',
    compression='gzip',
)
# genetic_data = genetic_data.sample(frac=0.25)

# NOTE: the order of the calls below matters -- each one draws from the
# global NumPy RNG when sampling rows.
genetic_data_cont_endpoint = _load_quarter_sample(
    'data/GAMETES_Epistasis_2-Way_continuous_endpoint_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    compression='gzip',
)

genetic_data_mixed_attributes = _load_quarter_sample(
    'data/GAMETES_Epistasis_2-Way_mixed_attribute_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    compression='gzip',
)

genetic_data_missing_values = _load_quarter_sample(
    'data/GAMETES_Epistasis_2-Way_missing_values_0.1_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    compression='gzip',
)

# Plain-text file: no explicit compression argument is passed for this one.
genetic_data_multiclass = _load_quarter_sample('data/3Class_Datasets_Loc_2_01.txt')
38+
39+
40+
def _split_dataset(frame):
    """Split ``frame`` into (feature values, 'class' values, feature names).

    Every column except 'class' is treated as a feature.
    """
    predictors = frame.drop('class', axis=1)
    return predictors.values, frame['class'].values, list(predictors)


features, labels, headers = _split_dataset(genetic_data)

(features_cont_endpoint,
 labels_cont_endpoint,
 headers_cont_endpoint) = _split_dataset(genetic_data_cont_endpoint)

(features_mixed_attributes,
 labels_mixed_attributes,
 headers_mixed_attributes) = _split_dataset(genetic_data_mixed_attributes)

(features_missing_values,
 labels_missing_values,
 headers_missing_values) = _split_dataset(genetic_data_missing_values)

(features_multiclass,
 labels_multiclass,
 headers_multiclass) = _split_dataset(genetic_data_multiclass)
58+
59+
60+
61+
# Basic Parallelization Tests and Core binary data and discrete feature data testing (Focus on ReliefF only for efficiency)------------------------------------------------------------
62+
def test_relieff():
    """Benchmark case (binary endpoint, discrete features): fit ReliefF.

    Fits ReliefF with ``n_neighbors=10`` on the module-level ``features`` /
    ``labels`` arrays; ``n_jobs`` is left at its default.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(49082)

    estimator = ReliefF(n_neighbors=10, n_features_to_select=2)
    estimator.fit(features, labels)
68+
69+
70+
def test_relieff_parallel():
    """Benchmark case (binary endpoint, discrete features): fit ReliefF with ``n_jobs=-1``.

    Same configuration as ``test_relieff`` apart from the parallel setting.
    """
    # Note: a ReBATE algorithm cannot be parallelized together with both a
    # random forest and cross-validation at the same time. If the ReBATE
    # algorithm is parallelized, the cross-validation scoring cannot be.
    np.random.seed(49082)

    estimator = ReliefF(n_jobs=-1, n_neighbors=10, n_features_to_select=2)
    estimator.fit(features, labels)
77+
78+
79+
def test_relieffpercent():
    """Benchmark case (binary endpoint, discrete features): ReliefF with fractional neighbors.

    ``n_neighbors`` is passed as the float 0.1 (the "% neighbors" mode)
    instead of an integer count.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(49082)

    estimator = ReliefF(n_neighbors=0.1, n_features_to_select=2)
    estimator.fit(features, labels)
85+
86+
87+
def test_surf():
    """Benchmark case (binary endpoint, discrete features): fit SURF.

    Fits SURF on the module-level ``features`` / ``labels`` arrays;
    ``n_jobs`` is left at its default.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(240932)

    estimator = SURF(n_features_to_select=2)
    estimator.fit(features, labels)
93+
94+
95+
def test_surf_parallel():
    """Benchmark case (binary endpoint, discrete features): fit SURF with ``n_jobs=-1``.

    Same configuration as ``test_surf`` apart from the parallel setting.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(240932)

    estimator = SURF(n_jobs=-1, n_features_to_select=2)
    estimator.fit(features, labels)
101+
102+
103+
def test_surfstar():
    """Benchmark case (binary endpoint, discrete features): fit SURF*.

    Fits SURFstar on the module-level ``features`` / ``labels`` arrays;
    ``n_jobs`` is left at its default.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(9238745)

    estimator = SURFstar(n_features_to_select=2)
    estimator.fit(features, labels)
109+
110+
111+
def test_surfstar_parallel():
    """Benchmark case (binary endpoint, discrete features): fit SURF* with ``n_jobs=-1``.

    Same configuration as ``test_surfstar`` apart from the parallel setting.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(9238745)

    estimator = SURFstar(n_jobs=-1, n_features_to_select=2)
    estimator.fit(features, labels)
117+
118+
119+
def test_multisurfstar():
    """Benchmark case (binary endpoint, discrete features): fit MultiSURF*.

    Fits MultiSURFstar on the module-level ``features`` / ``labels`` arrays;
    ``n_jobs`` is left at its default.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(320931)

    estimator = MultiSURFstar(n_features_to_select=2)
    estimator.fit(features, labels)
125+
126+
127+
def test_multisurfstar_parallel():
    """Benchmark case (binary endpoint, discrete features): fit MultiSURF* with ``n_jobs=-1``.

    Same configuration as ``test_multisurfstar`` apart from the parallel
    setting.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(320931)

    estimator = MultiSURFstar(n_jobs=-1, n_features_to_select=2)
    estimator.fit(features, labels)
133+
134+
135+
def test_multisurf():
    """Benchmark case (binary endpoint, discrete features): fit MultiSURF.

    Fits MultiSURF on the module-level ``features`` / ``labels`` arrays;
    ``n_jobs`` is left at its default.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(320931)

    estimator = MultiSURF(n_features_to_select=2)
    estimator.fit(features, labels)
141+
142+
143+
def test_multisurf_parallel():
    """Benchmark case (binary endpoint, discrete features): fit MultiSURF with ``n_jobs=-1``.

    Same configuration as ``test_multisurf`` apart from the parallel setting.
    """
    # Fix the global NumPy seed for this case.
    np.random.seed(320931)

    estimator = MultiSURF(n_jobs=-1, n_features_to_select=2)
    estimator.fit(features, labels)
149+
150+
151+
# Benchmark cases timed by the driver below. Only the ReliefF cases are
# currently enabled; uncomment an entry to include it in the run.
test_cases = [
    test_relieff,
    test_relieff_parallel,
    test_relieffpercent,
    # test_surf,
    # test_surf_parallel,
    # test_surfstar,
    # test_surfstar_parallel,
    # test_multisurfstar,
    # test_multisurfstar_parallel,
    # test_multisurf,
    # test_multisurf_parallel
]

if __name__ == '__main__':
    # Collect one record per case and build the DataFrame once at the end.
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
    # it copied the whole frame on every call.
    timing_rows = []

    for test_case in test_cases:
        # Five independent runs, each executing the case exactly once.
        timing = timeit.repeat(test_case, number=1, repeat=5)
        # ignore the first test to avoid high initial overhead to compile numba
        # functions with small datasets
        timing = timing[1:]
        mean, std = np.mean(timing), np.std(timing)
        print(test_case.__name__, mean, std)
        timing_rows.append(
            {'test_case': test_case.__name__, 'mean': mean, 'std': std}
        )

    # Explicit columns keep the header stable even when no case is enabled.
    timing_df = pd.DataFrame(timing_rows, columns=['test_case', 'mean', 'std'])

    print(timing_df)

    timing_df.to_csv('timing_benchmarks.csv')

run_performance_benchmark.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Run the benchmark script under cProfile and write the profiling stats to perf_data.pstats.
python -m cProfile -o perf_data.pstats performance_tests.py
2+
# Convert perf_data.pstats to a dot graph with gprof2dot and render it to perfgraph.png with Graphviz `dot`.
gprof2dot -f pstats perf_data.pstats | dot -Tpng -o perfgraph.png

0 commit comments

Comments
 (0)