|
| 1 | +from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar |
| 2 | +from sklearn.pipeline import make_pipeline |
| 3 | +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor |
| 4 | +from sklearn.impute import SimpleImputer |
| 5 | +from sklearn.model_selection import cross_val_score |
| 6 | +import pandas as pd |
| 7 | +import numpy as np |
| 8 | +import warnings |
| 9 | + |
| 10 | +import timeit |
| 11 | + |
# Suppress every warning category for the whole benchmark run.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG before any DataFrame.sample() call below; the
# sample() calls run in a fixed order, so the drawn subsets depend on it.
np.random.seed(3249083)

# Binary endpoint, discrete features.
# NOTE: this dataset is used in full; the frac=0.25 subsample applied to the
# other datasets is disabled for this one.
genetic_data = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
    sep='\t',
    compression='gzip',
)

# Each remaining dataset has its 'Class' column renamed to 'class' and is
# reduced to a random 25% of its rows.
genetic_data_cont_endpoint = (
    pd.read_csv(
        'data/GAMETES_Epistasis_2-Way_continuous_endpoint_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
        sep='\t',
        compression='gzip',
    )
    .rename(columns={'Class': 'class'})
    .sample(frac=0.25)
)

genetic_data_mixed_attributes = (
    pd.read_csv(
        'data/GAMETES_Epistasis_2-Way_mixed_attribute_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
        sep='\t',
        compression='gzip',
    )
    .rename(columns={'Class': 'class'})
    .sample(frac=0.25)
)

genetic_data_missing_values = (
    pd.read_csv(
        'data/GAMETES_Epistasis_2-Way_missing_values_0.1_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
        sep='\t',
        compression='gzip',
    )
    .rename(columns={'Class': 'class'})
    .sample(frac=0.25)
)

genetic_data_multiclass = (
    pd.read_csv('data/3Class_Datasets_Loc_2_01.txt', sep='\t')
    .rename(columns={'Class': 'class'})
    .sample(frac=0.25)
)
| 38 | + |
| 39 | + |
# For every dataset: split the frame into a feature matrix (all columns
# except 'class'), a label vector (the 'class' column) and the list of
# feature column names.
features = genetic_data.drop('class', axis=1).values
labels = genetic_data['class'].values
headers = list(genetic_data.drop('class', axis=1).columns)

features_cont_endpoint = genetic_data_cont_endpoint.drop('class', axis=1).values
labels_cont_endpoint = genetic_data_cont_endpoint['class'].values
headers_cont_endpoint = list(
    genetic_data_cont_endpoint.drop('class', axis=1).columns)

features_mixed_attributes = genetic_data_mixed_attributes.drop('class', axis=1).values
labels_mixed_attributes = genetic_data_mixed_attributes['class'].values
headers_mixed_attributes = list(
    genetic_data_mixed_attributes.drop('class', axis=1).columns)

features_missing_values = genetic_data_missing_values.drop('class', axis=1).values
labels_missing_values = genetic_data_missing_values['class'].values
headers_missing_values = list(
    genetic_data_missing_values.drop('class', axis=1).columns)

features_multiclass = genetic_data_multiclass.drop('class', axis=1).values
labels_multiclass = genetic_data_multiclass['class'].values
headers_multiclass = list(
    genetic_data_multiclass.drop('class', axis=1).columns)
| 58 | + |
| 59 | + |
| 60 | + |
| 61 | +# Basic Parallelization Tests and Core binary data and discrete feature data testing (Focus on ReliefF only for efficiency)------------------------------------------------------------ |
def test_relieff():
    """Benchmark target: fit ReliefF on the binary-endpoint, discrete-feature data.

    Reseeds NumPy's global RNG, then fits ReliefF (2 features to select,
    10 neighbors) on the module-level ``features`` / ``labels`` arrays.
    Only ``fit`` is exercised; no pipeline or scoring is involved.
    """
    np.random.seed(49082)

    selector = ReliefF(n_neighbors=10, n_features_to_select=2)
    selector.fit(features, labels)
| 68 | + |
| 69 | + |
def test_relieff_parallel():
    """Benchmark target: fit ReliefF with ``n_jobs=-1`` on the binary-endpoint data.

    Same configuration as the serial ReliefF benchmark (2 features to
    select, 10 neighbors) but with ``n_jobs=-1``.
    """
    # NOTE(review): n_jobs=-1 presumably means "use all available cores"
    # (joblib convention) -- confirm against the skrebate documentation.
    np.random.seed(49082)

    selector = ReliefF(n_jobs=-1, n_neighbors=10, n_features_to_select=2)
    selector.fit(features, labels)
| 77 | + |
| 78 | + |
def test_relieffpercent():
    """Benchmark target: fit ReliefF with a fractional ``n_neighbors`` (0.1).

    Reseeds NumPy's global RNG, then fits ReliefF (2 features to select)
    on the module-level ``features`` / ``labels`` arrays.
    """
    # NOTE(review): a float n_neighbors is presumably interpreted as a
    # fraction of the training samples -- confirm against skrebate docs.
    np.random.seed(49082)

    selector = ReliefF(n_neighbors=0.1, n_features_to_select=2)
    selector.fit(features, labels)
| 85 | + |
| 86 | + |
def test_surf():
    """Benchmark target: fit SURF on the binary-endpoint, discrete-feature data.

    Reseeds NumPy's global RNG, then fits SURF (2 features to select) on
    the module-level ``features`` / ``labels`` arrays.
    """
    np.random.seed(240932)

    SURF(n_features_to_select=2).fit(features, labels)
| 93 | + |
| 94 | + |
def test_surf_parallel():
    """Benchmark target: fit SURF with ``n_jobs=-1`` on the binary-endpoint data.

    Same configuration as the serial SURF benchmark (2 features to
    select) but with ``n_jobs=-1``.
    """
    np.random.seed(240932)

    SURF(n_jobs=-1, n_features_to_select=2).fit(features, labels)
| 101 | + |
| 102 | + |
def test_surfstar():
    """Benchmark target: fit SURF* on the binary-endpoint, discrete-feature data.

    Reseeds NumPy's global RNG, then fits SURFstar (2 features to select)
    on the module-level ``features`` / ``labels`` arrays.
    """
    np.random.seed(9238745)

    SURFstar(n_features_to_select=2).fit(features, labels)
| 109 | + |
| 110 | + |
def test_surfstar_parallel():
    """Benchmark target: fit SURF* with ``n_jobs=-1`` on the binary-endpoint data.

    Same configuration as the serial SURF* benchmark (2 features to
    select) but with ``n_jobs=-1``.
    """
    np.random.seed(9238745)

    SURFstar(n_jobs=-1, n_features_to_select=2).fit(features, labels)
| 117 | + |
| 118 | + |
def test_multisurfstar():
    """Benchmark target: fit MultiSURF* on the binary-endpoint, discrete-feature data.

    Reseeds NumPy's global RNG, then fits MultiSURFstar (2 features to
    select) on the module-level ``features`` / ``labels`` arrays.
    """
    np.random.seed(320931)

    MultiSURFstar(n_features_to_select=2).fit(features, labels)
| 125 | + |
| 126 | + |
def test_multisurfstar_parallel():
    """Benchmark target: fit MultiSURF* with ``n_jobs=-1`` on the binary-endpoint data.

    Same configuration as the serial MultiSURF* benchmark (2 features to
    select) but with ``n_jobs=-1``.
    """
    np.random.seed(320931)

    MultiSURFstar(n_jobs=-1, n_features_to_select=2).fit(features, labels)
| 133 | + |
| 134 | + |
def test_multisurf():
    """Benchmark target: fit MultiSURF on the binary-endpoint, discrete-feature data.

    Reseeds NumPy's global RNG, then fits MultiSURF (2 features to select)
    on the module-level ``features`` / ``labels`` arrays.
    """
    np.random.seed(320931)

    MultiSURF(n_features_to_select=2).fit(features, labels)
| 141 | + |
| 142 | + |
def test_multisurf_parallel():
    """Benchmark target: fit MultiSURF with ``n_jobs=-1`` on the binary-endpoint data.

    Same configuration as the serial MultiSURF benchmark (2 features to
    select) but with ``n_jobs=-1``.
    """
    np.random.seed(320931)

    MultiSURF(n_jobs=-1, n_features_to_select=2).fit(features, labels)
| 149 | + |
| 150 | + |
# Callables timed by the __main__ driver, in execution order. Only the
# ReliefF variants are enabled; the SURF-family entries are kept here,
# commented out, so they can be switched back on.
test_cases = [
    test_relieff,
    test_relieff_parallel,
    test_relieffpercent,
    # test_surf,
    # test_surf_parallel,
    # test_surfstar,
    # test_surfstar_parallel,
    # test_multisurfstar,
    # test_multisurfstar_parallel,
    # test_multisurf,
    # test_multisurf_parallel
]
| 164 | + |
if __name__ == '__main__':
    # Time every enabled benchmark and write the summary to
    # 'timing_benchmarks.csv' (columns: test_case, mean, std).
    #
    # Rows are accumulated in a plain list and turned into a DataFrame once
    # at the end: DataFrame.append was deprecated in pandas 1.4 and removed
    # in pandas 2.0, and it copied the whole frame on every call anyway.
    timing_rows = []

    for test_case in test_cases:
        timing = timeit.repeat(test_case, number=1, repeat=5)
        # ignore the first test to avoid high initial overhead to compile numba
        # functions with small datasets
        timing = timing[1:]

        mean_seconds = np.mean(timing)
        std_seconds = np.std(timing)
        print(test_case.__name__, mean_seconds, std_seconds)

        timing_rows.append({
            'test_case': test_case.__name__,
            'mean': mean_seconds,
            'std': std_seconds,
        })

    # Passing `columns` keeps the header even when no test case is enabled.
    timing_df = pd.DataFrame(timing_rows, columns=['test_case', 'mean', 'std'])

    print(timing_df)

    timing_df.to_csv('timing_benchmarks.csv')
0 commit comments