
Commit d61ee27

Merge pull request #425 from int-brain-lab/glm_improvements
Changes to GLM code structure, added tests for design matrices
2 parents 24d33e8 + 087e6f3 commit d61ee27

7 files changed: +213 additions, −138 deletions


brainbox/modeling/design_matrix.py

Lines changed: 64 additions & 19 deletions
@@ -11,7 +11,7 @@ class DesignMatrix:
     and allow the generation of a design matrix with specified regressors
     """
 
-    def __init__(self, trialsdf, vartypes=None, binwidth=0.02):
+    def __init__(self, trialsdf, vartypes, binwidth=0.02):
         """
         Class for generating design matrices to model neural data. Provides handy routines for
         describing neural spiking activity using basis functions and other primitives.
@@ -31,7 +31,7 @@ def __init__(self, trialsdf, vartypes=None, binwidth=0.02):
 
             Obligatory columns for the dataframe are "trial_start" and "trial_end", which tell the
             constructor which time points to associate with that trial.
-        vartypes : dict, optional
+        vartypes : dict
             Dictionary of types for each of the columns in trialsdf. Columns must be of the types:
             -- timing: timing events, in which the column values are times since the start of the
                session of an event within that trial, e.g. stimulus onset.
@@ -41,46 +41,44 @@ def __init__(self, trialsdf, vartypes=None, binwidth=0.02):
                changes within the trial. e.g. pupil diameter.
             Dictionary keys should be columns in trialsdf, values should be strings that are equal
             to one of the above.
-
-            If vartypes is not passed, the constructor will assume you know what you are doing. Be
-            warned that this can result in the class failing in spectacular and vindictive ways.
-            by default None
         binwidth : float, optional
             Length of time bins which will be used for design matrix, by default 0.02
         """
         # Data checks #
-        if vartypes is not None:
-            validtypes = ('timing', 'continuous', 'value')
-            if not all([name in vartypes for name in trialsdf.columns]):
-                raise KeyError("Some columns were not described in vartypes")
-            if not all([value in validtypes for value in vartypes.values()]):
-                raise ValueError("Invalid values were passed in vartypes")
+        validtypes = ('timing', 'continuous', 'value')
+        if not all([name in vartypes for name in trialsdf.columns]):
+            raise KeyError("Some columns were not described in vartypes")
+        if not all([value in validtypes for value in vartypes.values()]):
+            raise ValueError("Invalid values were passed in vartypes")
 
         # Filter out cells which don't meet the criteria for minimum spiking, while doing trial
         # assignment
-        self.vartypes = vartypes
-        if vartypes is not None:
-            self.vartypes['duration'] = 'value'
+        vartypes['duration'] = 'value'
         base_df = trialsdf.copy()
         trialsdf = trialsdf.copy()  # Make sure we don't modify the original dataframe
         trbounds = trialsdf[['trial_start', 'trial_end']]  # Get the start/end of trials
         # Empty trial duration value to use later
         trialsdf['duration'] = np.nan
+        # Figure out which columns are timing variables if vartypes was passed
         timingvars = [col for col in trialsdf.columns if vartypes[col] == 'timing']
+
         for i, (start, end) in trbounds.iterrows():
             if any(np.isnan((start, end))):
                 warn(f"NaN values found in trial start or end at trial number {i}. "
                      "Discarding trial.")
                 trialsdf.drop(i, inplace=True)
                 continue
             for col in timingvars:
+                # Round values for the timing variables to the 5th decimal place and subtract
+                # trial start time.
                 trialsdf.at[i, col] = np.round(trialsdf.at[i, col] - start, decimals=5)
             trialsdf.at[i, 'duration'] = end - start
 
         # Set model parameters to begin with
         self.binwidth = binwidth
         self.covar = {}
         self.trialsdf = trialsdf
+        self.vartypes = vartypes
         self.base_df = base_df
         self.compiled = False
         return
@@ -155,7 +153,7 @@ def add_covariate_timing(self, covlabel, eventname, bases,
         else:
             raise TypeError('deltaval must be None, pandas series, or string reference'
                             f' to trialsdf column. {type(deltaval)} was passed instead.')
-        if eventname in self.vartypes and self.vartypes[eventname] != 'timing':
+        if self.vartypes[eventname] != 'timing':
             raise TypeError(f'Column {eventname} in trialsdf is not registered as a timing')
 
         vecsizes = self.trialsdf['duration'].apply(self.binf)
@@ -174,6 +172,33 @@ def add_covariate_timing(self, covlabel, eventname, bases,
 
     def add_covariate_boxcar(self, covlabel, boxstart, boxend,
                              cond=None, height=None, desc=''):
+        """
+        Convenience wrapper on add_covariate to add a boxcar covariate on the given start and end
+        variables, such that the covariate is a step function with non-zero value between those
+        values.
+
+        Note: This has not been tested yet and is not guaranteed to work, or work correctly.
+
+        Parameters
+        ----------
+        covlabel : str
+            Name of the covariate for accessing later. Can be accessed via dot syntax of the
+            instance usually.
+        boxstart : str
+            Column name in trialsdf which will be used to define the start of the boxcar
+        boxend : str
+            Column name in trialsdf which defines the end of the boxcar variable
+        cond : None, list, or func, optional
+            Condition in which to apply this covariate. Can either be a list of trial indices, or
+            a function which takes in a row of the trialsdf and returns a boolean on inclusion,
+            by default None
+        height : None, str, or pandas series, optional
+            Values for the height of the boxcar during the period defined per trial. Can be a
+            reference to a column in trialsdf or a separate series, by default None
+        desc : str, optional
+            Additional information about the covariate to store as a string, by default ''
+
+        """
         if covlabel in self.covar:
             raise AttributeError(f'Covariate {covlabel} already exists in model.')
         self._compile_check()
@@ -210,6 +235,27 @@ def add_covariate_boxcar(self, covlabel, boxstart, boxend,
 
     def add_covariate_raw(self, covlabel, raw,
                           cond=None, desc=''):
+        """
+        Convenience wrapper to add a 'raw' covariate, that is to say a covariate which is a
+        continuous value that changes with time during the course of a trial.
+
+        Note: This has not been tested and is not guaranteed to work or to work correctly.
+
+        Parameters
+        ----------
+        covlabel : str
+            String used to reference covariate, can usually be accessed by instance's dot syntax
+        raw : str, func, or pandas series
+            The covariate to add to the design matrix. Can be a str reference to a column in
+            trialsdf, a function which takes in rows of trialsdf and produces a vector for each
+            row of the appropriate size given binwidth and trial duration, or a pandas series
+            of vectors of said appropriate type.
+        cond : None, list, or func, optional
+            Trials in which to apply the given covariate. Can be a list of trial numbers,
+            or a function which accepts rows of the trialsdf and returns a boolean, by default None
+        desc : str, optional
+            Additional information about the covariate for access later, by default ''
+        """
         stimlens = self.trialsdf.duration.apply(self.binf)
         if isinstance(raw, str):
             if raw not in self.trialsdf.columns:
@@ -354,7 +400,6 @@ def compile_design_matrix(self, dense=True):
         assert self.binnedspikes.shape[0] == dm.shape[0], "Oh shit. Indexing error."
         self.dm = dm
         self.trlabels = trlabels
-        # self.dm = np.roll(dm, -1, axis=0)  # Fix weird +1 offset bug in design matrix
         self.compiled = True
         return
 
@@ -384,7 +429,7 @@ def denseconv(X, bases):
         A = np.zeros((T + TB - 1, int(np.sum(indices[kCov, :]))))
         for i, j in enumerate(np.argwhere(indices[kCov, :]).flat):
             A[:, i] = np.convolve(X[:, kCov], bases[:, j])
-        BX[:, k: sI[kCov]] = A[: T, :]
+        BX[:, k: sI[kCov]] = A[:T, :]
         k = sI[kCov]
     return BX
 
@@ -400,5 +445,5 @@ def convbasis(stim, bases, offset=0):
     if offset < 0:
         X = X[-offset:, :]
     elif offset > 0:
-        X = X[: -(1 + offset), :]
+        X = X[:-offset, :]
     return X
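
For orientation, here is a minimal sketch (not part of the diff) of how the updated DesignMatrix constructor behaves now that vartypes is required; the column names and values below are hypothetical placeholders:

    import pandas as pd
    from brainbox.modeling.design_matrix import DesignMatrix

    # Hypothetical trials table; times are in seconds from session start
    trialsdf = pd.DataFrame({'trial_start': [0.0, 2.0, 4.0],
                             'trial_end': [1.5, 3.5, 5.5],
                             'stim_on': [0.3, 2.4, 4.2]})

    # vartypes is now a required argument and must label every column in trialsdf
    # with one of 'timing', 'continuous', or 'value'
    vartypes = {'trial_start': 'timing', 'trial_end': 'timing', 'stim_on': 'timing'}
    design = DesignMatrix(trialsdf, vartypes, binwidth=0.02)

    # Leaving a column out of vartypes now fails immediately instead of deferring
    # the error until fit time
    try:
        DesignMatrix(trialsdf, {'trial_start': 'timing', 'trial_end': 'timing'})
    except KeyError as err:
        print(err)  # "Some columns were not described in vartypes"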

brainbox/modeling/linear.py

Lines changed: 4 additions & 25 deletions
@@ -17,7 +17,7 @@
 class LinearGLM(NeuralModel):
     def __init__(self, design_matrix, spk_times, spk_clu,
                  binwidth=0.02, metric='rsq', estimator=None,
-                 train=0.8, blocktrain=False, mintrials=100):
+                 mintrials=100):
         """
         Fit a linear model using a DesignMatrix object and spike data. Can use ridge regression
         or pure linear regression
@@ -48,13 +48,15 @@ def __init__(self, design_matrix, spk_times, spk_clu,
             fitting, by default 100
         """
         super().__init__(design_matrix, spk_times, spk_clu,
-                         binwidth, train, blocktrain, mintrials)
+                         binwidth, mintrials)
         if estimator is None:
             estimator = LinearRegression()
         if not isinstance(estimator, BaseEstimator):
             raise ValueError('Estimator must be a scikit-learn estimator, e.g. LinearRegression')
         self.metric = metric
         self.estimator = estimator
+        self.link = lambda x: x
+        self.invlink = self.link
 
     def _fit(self, dm, binned, cells=None):
         """
@@ -94,26 +96,3 @@ def _fit(self, dm, binned, cells=None):
             coefs.at[cell] = weight[cell_idx, :]
             intercepts.at[cell] = intercept[cell_idx]
         return coefs, intercepts
-
-    def score(self):
-        """
-        Score model using chosen metric
-
-        Returns
-        -------
-        pandas.Series
-            Score using chosen metric (defined at instantiation) for each unit fit by the model.
-        """
-        if not hasattr(self, 'coefs'):
-            raise AttributeError('Model has not been fit yet.')
-        testmask = np.isin(self.design.trlabels, self.testinds).flatten()
-        dm, binned = self.design[testmask, :], self.binnedspikes[testmask]
-
-        scores = pd.Series(index=self.coefs.index, name='scores')
-        for cell in self.coefs.index:
-            cell_idx = np.argwhere(self.clu_ids == cell)[0, 0]
-            wt = self.coefs.loc[cell].reshape(-1, 1)
-            bias = self.intercepts.loc[cell]
-            y = binned[:, cell_idx]
-            scores.at[cell] = self._scorer(wt, bias, dm, y)
-        return scores
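
With the train and blocktrain arguments removed and score() moved up to the base class, constructing a LinearGLM reduces to the sketch below (not part of the diff; design is assumed to be a compiled DesignMatrix such as the one above, and the spike arrays are placeholders):

    import numpy as np
    from sklearn.linear_model import Ridge
    from brainbox.modeling.linear import LinearGLM

    # Placeholder spike data: spike times in seconds and matching cluster IDs
    spk_times = np.array([0.11, 0.35, 2.41, 4.22])
    spk_clu = np.array([0, 1, 0, 1])

    # Train/test splitting is no longer configured here; any scikit-learn
    # estimator (e.g. Ridge) can be supplied in place of plain LinearRegression
    model = LinearGLM(design, spk_times, spk_clu, binwidth=0.02,
                      metric='rsq', estimator=Ridge(alpha=1.0), mintrials=1)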

brainbox/modeling/neural_model.py

Lines changed: 49 additions & 31 deletions
@@ -23,7 +23,7 @@ class NeuralModel:
     """
 
     def __init__(self, design_matrix, spk_times, spk_clu,
-                 binwidth=0.02, train=0.8, blocktrain=False, mintrials=100, stepwise=False):
+                 binwidth=0.02, mintrials=100, stepwise=False):
         """
         Construct GLM object using information about all trials, and the relevant spike times.
         Only ingests data, and further object methods must be called to describe kernels, gain
@@ -38,10 +38,8 @@ def __init__(self, design_matrix, spk_times, spk_clu,
         spk_clu: numpy.array of integers
             1-D array of same shape as spk_times, with integer cluster IDs identifying which
             cluster a spike time belonged to.
-        train: float
-            Float in (0, 1] indicating proportion of data to use for training GLM vs testing
-            (using the NeuralGLM.score method). Trials to keep will be randomly sampled, by default
-            0.8
+        binwidth : float
+            Size of bins to put spikes into, in seconds.
         mintrials: int
             Minimum number of trials in which neurons fired a spike in order to be fit. Defaults
             to 100 trials.
@@ -54,10 +52,6 @@ def __init__(self, design_matrix, spk_times, spk_clu,
         # Data checks #
         if not len(spk_times) == len(spk_clu):
             raise IndexError("Spike times and cluster IDs are not same length")
-        if not isinstance(train, float) and not train == 1:
-            raise TypeError('train must be a float between 0 and 1')
-        if not ((train > 0) & (train <= 1)):
-            raise ValueError('train must be between 0 and 1')
         if not design_matrix.compiled:
             raise AttributeError('Design matrix object must be compiled before passing to fit')
 
@@ -83,29 +77,11 @@ def __init__(self, design_matrix, spk_times, spk_clu,
             spks[i] = spk_times[st_startind:st_endind] - start
             clu[i] = spk_clu[st_startind:st_endind]
 
-        # Break the data into test and train sections for cross-validation
-        if train == 1:
-            print('Training fraction set to 1. Training on all data.')
-            traininds = base_df.index
-            testinds = base_df.index
-        else:
-            trainlen = int(np.floor(len(base_df) * train))
-            if blocktrain:
-                testlen, midpoint = len(base_df) - trainlen, len(base_df) // 2
-                starttest, endtest = midpoint - (testlen // 2), midpoint + (testlen // 2)
-                testinds = base_df.index[starttest:endtest]
-                traininds = base_df.index[~np.isin(base_df.index, testinds)]
-            else:
-                traininds = sorted(np.random.choice(base_df.index, trainlen, replace=False))
-                testinds = base_df.index[~base_df.index.isin(traininds)]
-
         # Set model parameters to begin with
         self.design = design_matrix
         self.spikes = spks
         self.clu = clu
         self.clu_ids = np.argwhere(np.sum(trialspiking, axis=0) > mintrials).flatten()
-        self.traininds = traininds
-        self.testinds = testinds
         self.stepwise = stepwise
         self.binwidth = binwidth
 
@@ -168,7 +144,7 @@ def _scorer(self, wt, bias, dm, y):
         """
         Score a single target y
         """
-        pred = (dm @ wt + bias).flatten()
+        pred = self.link(dm @ wt + bias).flatten()
         if self.metric == 'dsq':
             null_pred = np.ones_like(pred) * np.mean(y)
             null_deviance = 2 * np.sum(xlogy(y, y / null_pred.flat) - y + null_pred.flat)
@@ -186,7 +162,7 @@ def _scorer(self, wt, bias, dm, y):
         else:
             raise AttributeError('No valid metric exists in the instance for use by _scorer()')
 
-    def fit(self, printcond=True):
+    def fit(self, train_idx=None, printcond=True):
         """
         Fit the current set of binned spikes as a function of the current design matrix. Requires
         NeuralGLM.bin_spike_trains and NeuralGLM.compile_design_matrix to be run first. Will store
@@ -195,6 +171,9 @@ def fit(self, printcond=True):
 
         Parameters
         ----------
+        train_idx : array-like of trial indices, optional
+            List of which trials to use to train the model. Defaults to None, which indicates all
+            indices in the trialsdf will be used (100% train)
         printcond : bool
             Whether or not to print the condition number of the design matrix. Defaults to True
 
@@ -204,10 +183,24 @@ def fit(self, printcond=True):
             List of coefficients fit. Not recommended to use these for interpretation. Use
             the .combine_weights() method instead.
         intercepts : list
-            List of intercepts (bias terms) fit. Not recommended to use these for interpretation.
+            List of intercepts (bias terms) fit.
         """
+        # Input checks
+        if train_idx is None:
+            train_idx = self.design.trialsdf.index
+        if not np.all(np.isin(train_idx, self.design.trialsdf.index)):
+            raise IndexError('Not all train indices in the trials of design matrix')
+
+        # Store training and test indices for self so that .score() method will know what to
+        # operate on. If all data indices are in train indices, train and test are the same set.
+        self.traininds = train_idx
+        if not np.all(np.isin(self.design.trialsdf.index, train_idx)):
+            self.testinds = self.design.trialsdf.index[~self.trialsdf.index.isin(train_idx)]
+        else:
+            self.testinds = train_idx
+
         # Mask for training data
-        trainmask = np.isin(self.design.trlabels, self.traininds).flatten()
+        trainmask = np.isin(self.design.trlabels, train_idx).flatten()
         trainbinned = self.binnedspikes[trainmask]
         if printcond:
             print(f'Condition of design matrix is {np.linalg.cond(self.design[trainmask])}')
@@ -217,6 +210,31 @@ def fit(self, printcond=True):
         self.coefs, self.intercepts = coefs, intercepts
         return
 
+    def score(self, testinds=None):
+        """
+        Score model using chosen metric
+
+        Returns
+        -------
+        pandas.Series
+            Score using chosen metric (defined at instantiation) for each unit fit by the model.
+        """
+        if not hasattr(self, 'coefs'):
+            raise AttributeError('Model has not been fit yet.')
+        if testinds is None:
+            testinds = self.testinds
+        testmask = np.isin(self.design.trlabels, testinds).flatten()
+        dm, binned = self.design[testmask, :], self.binnedspikes[testmask]
+
+        scores = pd.Series(index=self.coefs.index, name='scores')
+        for cell in self.coefs.index:
+            cell_idx = np.argwhere(self.clu_ids == cell)[0, 0]
+            wt = self.coefs.loc[cell].reshape(-1, 1)
+            bias = self.intercepts.loc[cell]
+            y = binned[:, cell_idx]
+            scores.at[cell] = self._scorer(wt, bias, dm, y)
+        return scores
+
     def binf(self, t):
         """
         Bin function for a given timestep. Returns the number of bins after trial start a given t
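
Taken together, the new fit/score flow looks roughly like the sketch below (illustrative only, continuing the placeholder model from the earlier sketches): the caller now picks the training trials explicitly, and the held-out trials are inferred for scoring.

    import numpy as np

    # Choose an 80% training split by trial index; the remaining trials become
    # the test set that score() will evaluate on
    all_trials = model.design.trialsdf.index
    train_idx = np.random.choice(all_trials, size=int(0.8 * len(all_trials)),
                                 replace=False)

    model.fit(train_idx=train_idx, printcond=False)
    test_scores = model.score()  # one score per fit unit, on held-out trials

    # Calling fit() with no train_idx trains on every trial; score() then reports
    # in-sample scores, since train and test coincide in that case
    model.fit(printcond=False)
    insample_scores = model.score()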
