-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathforward_selection.py
More file actions
75 lines (67 loc) · 2.93 KB
/
forward_selection.py
File metadata and controls
75 lines (67 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""Simple implementation of forward selection using statsmodels."""
import sys
import pandas as pd
import statsmodels.formula.api as smf
import utils
class CandidateModel:
    """Outcome of fitting a model with one extra candidate feature.

    Attributes:
        score: Ranking value for the fit. ``forward_selection`` stores the
            model's BIC here and prefers the lowest value.
        candidate: Name of the feature that was added for this fit.
        model: The fitted model object.
        cost: Cost of the feature subset, or ``None`` when not computed.
        coverage: Coverage summary of the feature subset, or ``None`` when
            not computed.
    """

    def __init__(self, score, candidate, model, cost, coverage):
        self.score = score
        self.candidate = candidate
        self.model = model
        self.cost = cost
        self.coverage = coverage

    def __repr__(self):
        # The fitted model is left out on purpose: its repr adds noise
        # without helping to tell candidates apart.
        return '{}(score={!r}, candidate={!r}, cost={!r}, coverage={!r})'.format(
            type(self).__name__,
            self.score,
            self.candidate,
            self.cost,
            self.coverage,
        )
def forward_selection(data, response, mutators=None, full_num=None, del_num=None, log=True):
    """Build a linear model by greedy forward selection on BIC.

    Starting with no features, every round fits one OLS model per remaining
    candidate column (added on top of the features selected so far) and
    accepts the candidate whose model has the lowest BIC. Selection stops as
    soon as the best candidate no longer lowers the BIC, or when no
    candidates are left.

    Credit: https://planspace.org/20150423-forward_selection_with_statsmodels/

    Args:
        data: Table holding the response column and the candidate feature
            columns; passed to ``smf.ols`` as ``data``.
        response: Name of the response column in ``data``.
        mutators: Forwarded to ``utils.get_data_for_subset``; only used when
            ``full_num`` is given.
        full_num: When not ``None``, a cost (median of the ``'num'`` column)
            and a coverage summary (mean and std of the ``'cov'`` column) are
            computed for every candidate subset, BIC ties are broken by the
            lower cost, and this value is the denominator of the
            "of FULL" percentage in the log.
        del_num: Optional denominator of the "of DELETION" percentage in
            the log.
        log: When true, print progress to stdout.

    Returns:
        A ``(models, selected)`` tuple: the fitted model of every accepted
        step, in order, and the names of the selected features, in order.

    Raises:
        ValueError: If ``response`` is not a column of ``data``.
    """
    candidates = list(data.columns)
    if response not in candidates:
        raise ValueError('response {!r} is not a column of data'.format(response))
    candidates.remove(response)
    if log:
        print('{} data points'.format(len(data)))
        print('Initial candidates: {}'.format(candidates))
    initial_mutators = len(candidates)
    track_cost = full_num is not None

    selected = []
    models = []
    # BIC is a float, so infinity is the natural "nothing accepted yet" value.
    current_score = float('inf')

    while candidates:
        round_results = []
        for candidate in candidates:
            features = selected + [candidate]
            formula = '{} ~ {} + 1'.format(response, ' + '.join(features))
            cost = None
            coverage = None
            if track_cost:
                subset_data = utils.get_data_for_subset(mutators, subset=features)
                # NOTE(review): subset_data is not re-indexed to data's rows;
                # an earlier `.loc[data.index]` step was disabled. Confirm
                # this is intended.
                cost = subset_data['num'].median()
                cov = subset_data['cov']
                coverage = (cov.mean(), cov.std())
            model = smf.ols(formula, data=data).fit()
            round_results.append(
                CandidateModel(model.bic, candidate, model, cost, coverage))

        # Lowest BIC wins; when costs are tracked, equal BICs are resolved in
        # favour of the cheaper subset. min() keeps the first of full ties.
        if track_cost:
            best = min(round_results, key=lambda c: (c.score, c.cost))
        else:
            best = min(round_results, key=lambda c: c.score)

        # Stop as soon as the best candidate fails to improve the score.
        if not current_score > best.score:
            break

        candidates.remove(best.candidate)
        selected.append(best.candidate)
        current_score = best.score
        models.append(best.model)

        if log:
            print('Add {}.'.format(best.candidate))
            if best.cost is not None:
                print('{} mutants.\t{:.2%} of FULL.'.format(best.cost, best.cost / full_num), end=' ')
                if del_num is not None:
                    print('{:.2%} of DELETION.'.format(best.cost / del_num))
                print('\tCoverage: {}'.format(best.coverage))
            print('\tR^2 = {:.2%}.'.format(best.model.rsquared_adj))

    if log:
        print('Selected {} / {} mutators'.format(len(selected), initial_mutators))
    return models, selected