@@ -38,7 +38,9 @@ def neglog(weights, x, y):
 
 
 class SequentialSelector:
-    def __init__(self, model, n_features_to_select=None, direction='forward', scoring=None):
+    def __init__(self, model, n_features_to_select=None,
+                 direction='forward', scoring=None,
+                 train=None, test=None):
         """
         Sequential feature selection for neural models
 
@@ -67,52 +69,96 @@ def __init__(self, model, n_features_to_select=None, direction='forward', scorin
         self.scoring = scoring
         self.delta_scores = pd.DataFrame(index=self.model.clu_ids)
         self.trlabels = self.design.trlabels
-        self.train = np.isin(self.trlabels, self.model.traininds).flatten()
-        self.test = ~self.train
+        if train is None:
+            self.train = np.isin(self.trlabels, self.model.traininds).flatten()
+        else:
+            self.train = np.isin(self.trlabels, train).flatten()
+        if test is None:
+            self.test = ~self.train
+        else:
+            self.test = np.isin(self.trlabels, test).flatten()
         self.features = np.array(list(self.design.covar.keys()))
 
-    def fit(self, progress=False):
+    def fit(self, train_idx=None, full_scores=False, progress=False):
         """
         Fit the sequential feature selection
-
         Parameters
         ----------
+        train_idx : array-like
+            indices of trials to use in the training set. If the model passed to the SFS instance
+            did not already have training indices, this must be specified. If it did have indices,
+            then this will override those.
+        full_scores : bool, optional
+            Whether to store the full set of submodel scores at each step. Produces additional
+            attributes .full_scores_train_ and .full_scores_test_
         progress : bool, optional
             Whether to show a progress bar, by default False
         """
+        if train_idx is None and self.train is None:
+            raise ValueError('train_idx cannot be None if model used to create SFS did not have '
+                             'any training indices')
+        if train_idx is not None:
+            self.train = np.isin(self.trlabels, train_idx).flatten()
+            self.test = ~self.train
         n_features = len(self.features)
         maskdf = pd.DataFrame(index=self.model.clu_ids, columns=self.features, dtype=bool)
         maskdf.loc[:, :] = False
         seqdf = pd.DataFrame(index=self.model.clu_ids, columns=range(self.n_features_to_select))
-        scoredf = pd.DataFrame(index=self.model.clu_ids, columns=range(self.n_features_to_select))
+        trainscoredf = pd.DataFrame(index=self.model.clu_ids,
+                                    columns=range(self.n_features_to_select))
+        testscoredf = pd.DataFrame(index=self.model.clu_ids,
+                                   columns=range(self.n_features_to_select))
 
         if not 0 < self.n_features_to_select <= n_features:
             raise ValueError('n_features_to_select is not a valid number in the context'
                              ' of the model.')
 
-        n_iterations = (
-            self.n_features_to_select if self.direction == 'forward'
-            else n_features - self.n_features_to_select
-        )
+        n_iterations = (self.n_features_to_select if self.direction == 'forward' else n_features -
+                        self.n_features_to_select)
+        if full_scores:
+            fullindex = pd.MultiIndex.from_product([self.model.clu_ids, np.arange(n_iterations)],
+                                                   names=['clu_id', 'feature_iter'])
+            fulltrain = pd.DataFrame(index=fullindex, columns=range(len(self.design.covar)))
+            fulltest = pd.DataFrame(index=fullindex, columns=range(len(self.design.covar)))
+
         for i in tqdm(range(n_iterations), desc='step', leave=False, disable=not progress):
             masks_set = maskdf.groupby(self.features.tolist()).groups
             for current_mask in tqdm(masks_set, desc='feature subset', leave=False):
                 cells = masks_set[current_mask]
-                new_feature_idx, nf_score = self._get_best_new_feature(current_mask, cells)
+                outputs = self._get_best_new_feature(current_mask, cells, full_scores)
+                if full_scores:
+                    new_feature_idx, nf_train, nf_test, nf_fulltrain, nf_fulltest = outputs
+                else:
+                    new_feature_idx, nf_train, nf_test = outputs
                 for cell in cells:
                     maskdf.at[cell, self.features[new_feature_idx.loc[cell]]] = True
                     seqdf.loc[cell, i] = self.features[new_feature_idx.loc[cell]]
-                    scoredf.loc[cell, i] = nf_score.loc[cell]
+                    trainscoredf.loc[cell, i] = nf_train.loc[cell]
+                    testscoredf.loc[cell, i] = nf_test.loc[cell]
+                    if full_scores:
+                        fulltest.loc[cell, i] = nf_fulltest.loc[cell]
+                        fulltrain.loc[cell, i] = nf_fulltrain.loc[cell]
         self.support_ = maskdf
         self.sequences_ = seqdf
-        self.scores_ = scoredf
+        self.scores_test_ = testscoredf
+        self.scores_train_ = trainscoredf
+        if full_scores:
+            self.full_scores_train_ = fulltrain
+            self.full_scores_test_ = fulltest
 
-    def _get_best_new_feature(self, mask, cells):
+    def _get_best_new_feature(self, mask, cells, full_scores=False):
+        """
+        Returns
+        -------
+        maxind, trainmax, testmax, trainscores, testscores
+        """
         mask = np.array(mask)
         candidate_features = np.flatnonzero(~mask)
         cell_idxs = np.argwhere(np.isin(self.model.clu_ids, cells)).flatten()
         my = self.model.binnedspikes[np.ix_(self.train, cell_idxs)]
-        scores = pd.DataFrame(index=cells, columns=candidate_features, dtype=float)
+        my_test = self.model.binnedspikes[np.ix_(self.test, cell_idxs)]
+        trainscores = pd.DataFrame(index=cells, columns=candidate_features, dtype=float)
+        testscores = pd.DataFrame(index=cells, columns=candidate_features, dtype=float)
         for feature_idx in candidate_features:
             candidate_mask = mask.copy()
             candidate_mask[feature_idx] = True
@@ -121,9 +167,27 @@ def _get_best_new_feature(self, mask, cells):
             fitfeatures = self.features[candidate_mask]
             feat_idx = np.hstack([self.design.covar[feat]['dmcol_idx'] for feat in fitfeatures])
             mdm = self.design[np.ix_(self.train, feat_idx)]
+            mdm_test = self.design[np.ix_(self.test, feat_idx)]
+
             coefs, intercepts = self.model._fit(mdm, my, cells=cells)
             for i, cell in enumerate(cells):
-                scores.at[cell, feature_idx] = self.model._scorer(coefs.loc[cell],
-                                                                  intercepts.loc[cell],
-                                                                  mdm, my[:, i])
-        return scores.idxmax(axis=1), scores.max(axis=1)
+                trainscores.at[cell,
+                               feature_idx] = self.model._scorer(coefs.loc[cell],
+                                                                 intercepts.loc[cell], mdm,
+                                                                 my[:, i])
+                testscores.at[cell,
+                              feature_idx] = self.model._scorer(coefs.loc[cell],
+                                                                intercepts.loc[cell], mdm_test,
+                                                                my_test[:, i])
+
+        maxind = trainscores.idxmax(axis=1)
+        trainmax = trainscores.max(axis=1)
+        # Ugly kludge to compensate for DataFrame.lookup being deprecated
+        midx, cols = pd.factorize(maxind)
+        testmax = pd.Series(testscores.reindex(cols, axis=1).to_numpy()[np.arange(len(testscores)),
+                                                                        midx],
+                            index=testscores.index)
+        if full_scores:
+            return maxind, trainmax, testmax, trainscores, testscores
+        else:
+            return maxind, trainmax, testmax
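
For context, a minimal usage sketch of the keyword arguments added in this commit. It is illustrative only: `model` stands in for a fitted neural GLM object with the `.design`, `.clu_ids`, and `.binnedspikes` attributes this class assumes, and the `train_trials`/`test_trials` arrays are hypothetical.

    import numpy as np

    # Hypothetical trial split; `model` is a stand-in for a neural GLM instance.
    trials = np.arange(400)
    train_trials, test_trials = trials[:300], trials[300:]

    sfs = SequentialSelector(model, n_features_to_select=4, direction='forward',
                             train=train_trials, test=test_trials)
    sfs.fit(full_scores=True, progress=True)

    sfs.sequences_         # order in which features were added, per cell
    sfs.scores_train_      # training-set score after each added feature
    sfs.scores_test_       # held-out score after each added feature
    sfs.full_scores_test_  # scores of every candidate feature at every step (full_scores=True)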
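
The "ugly kludge" in `_get_best_new_feature` replaces the deprecated `DataFrame.lookup` with `pd.factorize`, a column `reindex`, and integer indexing to fetch each row's test score at the column that maximised its training score. A small self-contained sketch of that idiom, using made-up toy frames:

    import numpy as np
    import pandas as pd

    # Toy stand-ins for trainscores/testscores: rows are cells, columns are candidate features.
    train = pd.DataFrame([[0.1, 0.4], [0.7, 0.2]], index=['cell0', 'cell1'], columns=[3, 5])
    test = pd.DataFrame([[0.0, 0.3], [0.5, 0.1]], index=['cell0', 'cell1'], columns=[3, 5])

    best = train.idxmax(axis=1)       # per-row label of the best training-set column
    codes, cols = pd.factorize(best)  # integer codes into the unique labels `cols`
    # Reorder test columns to match `cols`, then pick element (row i, code i):
    picked = pd.Series(test.reindex(cols, axis=1).to_numpy()[np.arange(len(test)), codes],
                       index=test.index)
    # picked['cell0'] == 0.3 (column 5), picked['cell1'] == 0.5 (column 3), i.e. each
    # cell's test score at the feature that maximised its training score.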