@@ -38,7 +38,9 @@ def neglog(weights, x, y):
 
 
 class SequentialSelector:
-    def __init__(self, model, n_features_to_select=None, direction='forward', scoring=None):
+    def __init__(self, model, n_features_to_select=None,
+                 direction='forward', scoring=None,
+                 train=None, test=None):
         """
         Sequential feature selection for neural models
 
@@ -67,52 +69,96 @@ def __init__(self, model, n_features_to_select=None, direction='forward', scorin
         self.scoring = scoring
         self.delta_scores = pd.DataFrame(index=self.model.clu_ids)
         self.trlabels = self.design.trlabels
-        self.train = np.isin(self.trlabels, self.model.traininds).flatten()
-        self.test = ~self.train
+        if train is None:
+            self.train = np.isin(self.trlabels, self.model.traininds).flatten()
+        else:
+            self.train = np.isin(self.trlabels, train).flatten()
+        if test is None:
+            self.test = ~self.train
+        else:
+            self.test = np.isin(self.trlabels, test).flatten()
         self.features = np.array(list(self.design.covar.keys()))
 
-    def fit(self, progress=False):
+    def fit(self, train_idx=None, full_scores=False, progress=False):
         """
         Fit the sequential feature selection
-
         Parameters
         ----------
+        train_idx : array-like
+            indices of trials to use in the training set. If the model passed to the SFS instance
+            did not already have training indices, this must be specified. If it did have indices,
+            then this will override those.
+        full_scores : bool, optional
+            Whether to store the full set of submodel scores at each step. Produces additional
+            attributes .full_scores_train_ and .full_scores_test_
         progress : bool, optional
             Whether to show a progress bar, by default False
         """
+        if train_idx is None and self.train is None:
+            raise ValueError('train_idx cannot be None if model used to create SFS did not have '
+                             'any training indices')
+        if train_idx is not None:
+            self.train = np.isin(self.trlabels, train_idx).flatten()
+            self.test = ~self.train
         n_features = len(self.features)
         maskdf = pd.DataFrame(index=self.model.clu_ids, columns=self.features, dtype=bool)
         maskdf.loc[:, :] = False
         seqdf = pd.DataFrame(index=self.model.clu_ids, columns=range(self.n_features_to_select))
-        scoredf = pd.DataFrame(index=self.model.clu_ids, columns=range(self.n_features_to_select))
+        trainscoredf = pd.DataFrame(index=self.model.clu_ids,
+                                    columns=range(self.n_features_to_select))
+        testscoredf = pd.DataFrame(index=self.model.clu_ids,
+                                   columns=range(self.n_features_to_select))
 
         if not 0 < self.n_features_to_select <= n_features:
             raise ValueError('n_features_to_select is not a valid number in the context'
                              ' of the model.')
 
-        n_iterations = (
-            self.n_features_to_select if self.direction == 'forward'
-            else n_features - self.n_features_to_select
-        )
+        n_iterations = (self.n_features_to_select if self.direction == 'forward' else n_features -
+                        self.n_features_to_select)
+        if full_scores:
+            fullindex = pd.MultiIndex.from_product([self.model.clu_ids, np.arange(n_iterations)],
+                                                   names=['clu_id', 'feature_iter'])
+            fulltrain = pd.DataFrame(index=fullindex, columns=range(len(self.design.covar)))
+            fulltest = pd.DataFrame(index=fullindex, columns=range(len(self.design.covar)))
+
         for i in tqdm(range(n_iterations), desc='step', leave=False, disable=not progress):
             masks_set = maskdf.groupby(self.features.tolist()).groups
             for current_mask in tqdm(masks_set, desc='feature subset', leave=False):
                 cells = masks_set[current_mask]
-                new_feature_idx, nf_score = self._get_best_new_feature(current_mask, cells)
+                outputs = self._get_best_new_feature(current_mask, cells, full_scores)
+                if full_scores:
+                    new_feature_idx, nf_train, nf_test, nf_fulltrain, nf_fulltest = outputs
+                else:
+                    new_feature_idx, nf_train, nf_test = outputs
                 for cell in cells:
                     maskdf.at[cell, self.features[new_feature_idx.loc[cell]]] = True
                     seqdf.loc[cell, i] = self.features[new_feature_idx.loc[cell]]
-                    scoredf.loc[cell, i] = nf_score.loc[cell]
+                    trainscoredf.loc[cell, i] = nf_train.loc[cell]
+                    testscoredf.loc[cell, i] = nf_test.loc[cell]
+                    if full_scores:
+                        fulltest.loc[cell, i] = nf_fulltest.loc[cell]
+                        fulltrain.loc[cell, i] = nf_fulltrain.loc[cell]
         self.support_ = maskdf
         self.sequences_ = seqdf
-        self.scores_ = scoredf
+        self.scores_test_ = testscoredf
+        self.scores_train_ = trainscoredf
+        if full_scores:
+            self.full_scores_train_ = fulltrain
+            self.full_scores_test_ = fulltest
 
-    def _get_best_new_feature(self, mask, cells):
+    def _get_best_new_feature(self, mask, cells, full_scores=False):
+        """
+        Returns
+        -------
+        maxind, trainmax, testmax, trainscores, testscores
+        """
         mask = np.array(mask)
         candidate_features = np.flatnonzero(~mask)
         cell_idxs = np.argwhere(np.isin(self.model.clu_ids, cells)).flatten()
         my = self.model.binnedspikes[np.ix_(self.train, cell_idxs)]
-        scores = pd.DataFrame(index=cells, columns=candidate_features, dtype=float)
+        my_test = self.model.binnedspikes[np.ix_(self.test, cell_idxs)]
+        trainscores = pd.DataFrame(index=cells, columns=candidate_features, dtype=float)
+        testscores = pd.DataFrame(index=cells, columns=candidate_features, dtype=float)
         for feature_idx in candidate_features:
             candidate_mask = mask.copy()
             candidate_mask[feature_idx] = True
@@ -121,9 +167,27 @@ def _get_best_new_feature(self, mask, cells):
             fitfeatures = self.features[candidate_mask]
             feat_idx = np.hstack([self.design.covar[feat]['dmcol_idx'] for feat in fitfeatures])
             mdm = self.design[np.ix_(self.train, feat_idx)]
+            mdm_test = self.design[np.ix_(self.test, feat_idx)]
+
             coefs, intercepts = self.model._fit(mdm, my, cells=cells)
             for i, cell in enumerate(cells):
-                scores.at[cell, feature_idx] = self.model._scorer(coefs.loc[cell],
-                                                                  intercepts.loc[cell],
-                                                                  mdm, my[:, i])
-        return scores.idxmax(axis=1), scores.max(axis=1)
+                trainscores.at[cell,
+                               feature_idx] = self.model._scorer(coefs.loc[cell],
+                                                                 intercepts.loc[cell], mdm,
+                                                                 my[:, i])
+                testscores.at[cell,
+                              feature_idx] = self.model._scorer(coefs.loc[cell],
+                                                                intercepts.loc[cell], mdm_test,
+                                                                my_test[:, i])
+
+        maxind = trainscores.idxmax(axis=1)
+        trainmax = trainscores.max(axis=1)
+        # Ugly kludge to compensate for DataFrame.lookup being deprecated
+        midx, cols = pd.factorize(maxind)
+        testmax = pd.Series(testscores.reindex(cols, axis=1).to_numpy()[np.arange(len(testscores)),
+                                                                        midx],
+                            index=testscores.index)
+        if full_scores:
+            return maxind, trainmax, testmax, trainscores, testscores
+        else:
+            return maxind, trainmax, testmax
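
For context, a minimal usage sketch of the keyword arguments added in this commit. It is illustrative only: `model` stands in for a fitted neural GLM object with the `.design`, `.clu_ids`, and `.binnedspikes` attributes this class assumes, and the `train_trials`/`test_trials` arrays are hypothetical.

    import numpy as np

    # Hypothetical trial split; `model` is a stand-in for a neural GLM instance.
    trials = np.arange(400)
    train_trials, test_trials = trials[:300], trials[300:]

    sfs = SequentialSelector(model, n_features_to_select=4, direction='forward',
                             train=train_trials, test=test_trials)
    sfs.fit(full_scores=True, progress=True)

    sfs.sequences_         # order in which features were added, per cell
    sfs.scores_train_      # training-set score after each added feature
    sfs.scores_test_       # held-out score after each added feature
    sfs.full_scores_test_  # scores of every candidate feature at every step (full_scores=True)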
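
The "ugly kludge" in `_get_best_new_feature` replaces the deprecated `DataFrame.lookup` with `pd.factorize`, a column `reindex`, and integer indexing to fetch each row's test score at the column that maximised its training score. A small self-contained sketch of that idiom, using made-up toy frames:

    import numpy as np
    import pandas as pd

    # Toy stand-ins for trainscores/testscores: rows are cells, columns are candidate features.
    train = pd.DataFrame([[0.1, 0.4], [0.7, 0.2]], index=['cell0', 'cell1'], columns=[3, 5])
    test = pd.DataFrame([[0.0, 0.3], [0.5, 0.1]], index=['cell0', 'cell1'], columns=[3, 5])

    best = train.idxmax(axis=1)       # per-row label of the best training-set column
    codes, cols = pd.factorize(best)  # integer codes into the unique labels `cols`
    # Reorder test columns to match `cols`, then pick element (row i, code i):
    picked = pd.Series(test.reindex(cols, axis=1).to_numpy()[np.arange(len(test)), codes],
                       index=test.index)
    # picked['cell0'] == 0.3 (column 5), picked['cell1'] == 0.5 (column 3), i.e. each
    # cell's test score at the feature that maximised its training score.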