KrishnaswamyLab
diff --git a/‎LICENSE‎
Lines changed: 674 additions & 0 deletions b/‎LICENSE‎
Lines changed: 674 additions & 0 deletions
diff --git a/‎python/scprep/filter.py‎
Lines changed: 137 additions & 24 deletions b/‎python/scprep/filter.py‎
Lines changed: 137 additions & 24 deletions
diff --git a/‎python/scprep/measure.py‎
Lines changed: 9 additions & 2 deletions b/‎python/scprep/measure.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎python/scprep/plot.py‎
Lines changed: 7 additions & 3 deletions b/‎python/scprep/plot.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎python/scprep/sanitize.py‎
Lines changed: 6 additions & 3 deletions b/‎python/scprep/sanitize.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎python/scprep/utils.py‎
Lines changed: 37 additions & 4 deletions b/‎python/scprep/utils.py‎
Lines changed: 37 additions & 4 deletions
diff --git a/‎python/scprep/version.py‎
Lines changed: 1 addition & 1 deletion b/‎python/scprep/version.py‎
Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,7 @@
 # (C) 2018 Krishnaswamy Lab GPLv2
 
 import numpy as np
+import pandas as pd
 
 from . import utils, measure
 
@@ -51,13 +52,17 @@ def remove_rare_genes(data, cutoff=0, min_cells=5):
     return data
 
 
-def remove_empty_cells(data):
+def remove_empty_cells(data, sample_labels=None):
     """Remove all cells with zero library size
 
     Parameters
     ----------
     data : array-like, shape=[n_samples, n_features]
         Input data
+    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
+        Labels associated with the rows of `data`. If provided, these
+        will be filtered such that they retain a one-to-one mapping
+        with the rows of the output data.
 
     Returns
     -------
@@ -67,37 +72,141 @@ def remove_empty_cells(data):
     cell_sums = measure.library_size(data)
     keep_cells_idx = cell_sums > 0
     data = utils.select_rows(data, keep_cells_idx)
+    if sample_labels is not None:
+        sample_labels = sample_labels[keep_cells_idx]
+        data = data, sample_labels
     return data
 
 
-def filter_library_size(data, cutoff=2000):
-    """Remove all cells with library size below a certain value
+def _get_filter_idx(data, values,
+                    cutoff, percentile,
+                    keep_cells):
+    cutoff = measure._get_percentile_cutoff(
+        values, cutoff, percentile, required=True)
+    if keep_cells == 'above':
+        keep_cells_idx = values > cutoff
+    elif keep_cells == 'below':
+        keep_cells_idx = values < cutoff
+    else:
+        raise ValueError("Expected `keep_cells` in ['above', 'below']. "
+                         "Got {}".format(keep_cells))
+    return keep_cells_idx
 
-    It is recommended to use :func:`~scprep.plot.plot_library_size` to
+
+def filter_values(data, values,
+                  cutoff=None, percentile=None,
+                  keep_cells='above', sample_labels=None,
+                  filter_per_sample=False):
+    """Remove all cells with `values` above or below a certain threshold
+
+    It is recommended to use :func:`~scprep.plot.histogram` to
     choose a cutoff prior to filtering.
 
     Parameters
     ----------
     data : array-like, shape=[n_samples, n_features]
         Input data
-    cutoff : float, optional (default: 2000)
-        Minimum library size required to retain a cell
+    values : list-like, shape=[n_samples]
+        Value upon which to filter
+    cutoff : float, optional (default: None)
+        Value above or below which to remove cells. Only one of `cutoff`
+        and `percentile` should be specified.
+    percentile : int, optional (Default: None)
+        Percentile above or below which to remove cells.
+        Must be an integer between 0 and 100. Only one of `cutoff`
+        and `percentile` should be specified.
+    keep_cells : {'above', 'below'}, optional (default: 'above')
+        Keep cells above or below the cutoff
+    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
+        Labels associated with the rows of `data`. If provided, these
+        will be filtered such that they retain a one-to-one mapping
+        with the rows of the output data.
+    filter_per_sample : bool, optional (default: False)
+        If True, filters separately for each unique sample label. Only used
+        if `sample_labels` is not `None` and `percentile` is given.
 
     Returns
     -------
     data : array-like, shape=[m_samples, n_features]
         Filtered output data, where m_samples <= n_samples
+    sample_labels : list-like, shape=[m_samples]
+        Filtered sample labels, if provided
     """
-    cell_sums = measure.library_size(data)
-    keep_cells_idx = cell_sums > cutoff
+    if filter_per_sample and percentile is not None and \
+            sample_labels is not None:
+        # filter separately and combine
+        sample_labels_array = utils.toarray(sample_labels).flatten()
+        keep_cells_idx = np.full_like(
+            sample_labels_array, True,
+            dtype=bool)
+        for label in np.unique(sample_labels_array):
+            sample_idx = sample_labels_array == label
+            keep_cells_idx[sample_idx] = _get_filter_idx(
+                utils.select_rows(data, sample_idx),
+                values[sample_idx],
+                cutoff, percentile, keep_cells)
+            keep_cells_idx = keep_cells_idx.flatten()
+    else:
+        keep_cells_idx = _get_filter_idx(data, values,
+                                         cutoff, percentile,
+                                         keep_cells)
     data = utils.select_rows(data, keep_cells_idx)
+    if sample_labels is not None:
+        sample_labels = sample_labels[keep_cells_idx]
+        data = data, sample_labels
     return data
 
 
+def filter_library_size(data, cutoff=None, percentile=None,
+                        keep_cells='above', sample_labels=None,
+                        filter_per_sample=False):
+    """Remove all cells with library size above or below a certain threshold
+
+    It is recommended to use :func:`~scprep.plot.plot_library_size` to
+    choose a cutoff prior to filtering.
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data
+    cutoff : float, optional (default: None)
+        Library size above or below which to remove cells. Only one of `cutoff`
+        and `percentile` should be specified.
+    percentile : int, optional (Default: None)
+        Percentile above or below which to remove cells.
+        Must be an integer between 0 and 100. Only one of `cutoff`
+        and `percentile` should be specified.
+    keep_cells : {'above', 'below'}, optional (default: 'above')
+        Keep cells above or below the cutoff
+    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
+        Labels associated with the rows of `data`. If provided, these
+        will be filtered such that they retain a one-to-one mapping
+        with the rows of the output data.
+    filter_per_sample : bool, optional (default: False)
+        If True, filters separately for each unique sample label.
+
+    Returns
+    -------
+    data : array-like, shape=[m_samples, n_features]
+        Filtered output data, where m_samples <= n_samples
+    sample_labels : list-like, shape=[m_samples]
+        Filtered sample labels, if provided
+    """
+    cell_sums = measure.library_size(data)
+    return filter_values(data, cell_sums,
+                         cutoff=cutoff, percentile=percentile,
+                         keep_cells=keep_cells,
+                         sample_labels=sample_labels,
+                         filter_per_sample=filter_per_sample)
+
+
 def filter_gene_set_expression(data, genes,
                                cutoff=None, percentile=None,
-                               keep_cells='below'):
-    """Remove cells with total expression of a gene set below a certain value
+                               library_size_normalize=True,
+                               keep_cells='below',
+                               sample_labels=None,
+                               filter_per_sample=False):
+    """Remove cells with total expression of a gene set above or below a certain threshold
 
     It is recommended to use :func:`~scprep.plot.plot_gene_set_expression` to
     choose a cutoff prior to filtering.
@@ -111,22 +220,26 @@ def filter_gene_set_expression(data, genes,
     cutoff : float, optional (default: None)
         Value above or below which to remove cells. Only one of `cutoff`
         and `percentile` should be specified.
-    percentile : int (Default: None)
+    percentile : int, optional (Default: None)
         Percentile above or below which to remove cells.
         Must be an integer between 0 and 100. Only one of `cutoff`
         and `percentile` should be specified.
-    keep_cells : {'above', 'below'}
+    library_size_normalize : bool, optional (default: True)
+        Divide gene set expression by library size
+    keep_cells : {'above', 'below'}, optional (default: 'below')
         Keep cells above or below the cutoff
+    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
+        Labels associated with the rows of `data`. If provided, these
+        will be filtered such that they retain a one-to-one mapping
+        with the rows of the output data.
+    filter_per_sample : bool, optional (default: False)
+        If True, filters separately for each unique sample label.
     """
-    cell_sums = measure.gene_set_expression(data, genes)
-    cutoff = measure._get_percentile_cutoff(
-        cell_sums, cutoff, percentile, required=True)
-    if keep_cells == 'above':
-        keep_cells_idx = cell_sums > cutoff
-    elif keep_cells == 'below':
-        keep_cells_idx = cell_sums < cutoff
-    else:
-        raise ValueError("Expected `keep_cells` in ['above', 'below']."
-                         "Got {}".format(keep_cells))
-    data = utils.select_rows(data, keep_cells_idx)
-    return data
+    cell_sums = measure.gene_set_expression(
+        data, genes,
+        library_size_normalize=library_size_normalize)
+    return filter_values(data, cell_sums,
+                         cutoff=cutoff, percentile=percentile,
+                         keep_cells=keep_cells,
+                         sample_labels=sample_labels,
+                         filter_per_sample=filter_per_sample)
@@ -30,7 +30,7 @@ def library_size(data):
     return library_size
 
 
-def gene_set_expression(data, genes):
+def gene_set_expression(data, genes, library_size_normalize=True):
     """Measure the expression of a set of genes in each cell.
 
     Parameters
@@ -39,14 +39,21 @@ def gene_set_expression(data, genes):
         Input data
     genes : list-like, shape<=[n_features]
         Integer column indices or string gene names included in gene set
+    library_size_normalize : bool, optional (default: True)
+        Divide gene set expression by library size
 
     Returns
     -------
     gene_set_expression : list-like, shape=[n_samples]
         Sum over genes for each cell
     """
     gene_data = select_cols(data, genes)
-    return library_size(gene_data)
+    gene_set_expression = library_size(gene_data)
+    if library_size_normalize:
+        libsize = library_size(data)
+        libsize[libsize == 0] = 1
+        gene_set_expression /= libsize * np.median(np.array(libsize))
+    return gene_set_expression
 
 
 def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False):
 
@@ -104,6 +104,7 @@ def plot_library_size(data,
 def plot_gene_set_expression(data, genes,
                              bins=100, log=False,
                              cutoff=None, percentile=None,
+                             library_size_normalize=True,
                              ax=None, figsize=None):
     """Plot the histogram of the expression of a gene set.
 
@@ -125,11 +126,14 @@ def plot_gene_set_expression(data, genes,
     percentile : float or `None`, optional (default: `None`)
         Percentile between 0 and 100 at which to draw a vertical line.
         Only one of `cutoff` and `percentile` may be given.
+    library_size_normalize : bool, optional (default: True)
+        Divide gene set expression by library size
     ax : `matplotlib.Axes` or None, optional (default: None)
         Axis to plot on. If None, a new axis will be created.
     figsize : tuple or None, optional (default: None)
         If not None, sets the figure size (width, height)
     """
-    histogram(measure.gene_set_expression(data, genes),
-              cutoff=cutoff, percentile=percentile,
-              bins=bins, log=log, ax=ax, figsize=figsize)
+    histogram(measure.gene_set_expression(
+        data, genes, library_size_normalize=library_size_normalize),
+        cutoff=cutoff, percentile=percentile,
+        bins=bins, log=log, ax=ax, figsize=figsize)
@@ -5,7 +5,7 @@
 import pandas as pd
 
 
-def check_numeric(data, dtype='float', copy=False):
+def check_numeric(data, dtype='float', copy=None):
     """Check a matrix contains only numeric data
 
     Parameters
@@ -14,8 +14,9 @@ def check_numeric(data, dtype='float', copy=False):
         Input data
     dtype : str or `np.dtype`, optional (default: 'float')
         Data type to which to coerce the data
-    copy : bool, optional (default: False)
-        Copy the data before coercion.
+    copy : bool or None, optional (default: None)
+        Copy the data before coercion. If None, default to
+        False for all datatypes except pandas.SparseDataFrame
 
     Returns
     -------
@@ -26,6 +27,8 @@ def check_numeric(data, dtype='float', copy=False):
     ------
     TypeError : if `data` cannot be coerced to `dtype`
     """
+    if copy is None:
+        copy = isinstance(data, pd.SparseDataFrame)
     try:
         return data.astype(dtype, copy=copy)
     except TypeError as e:
 
@@ -6,6 +6,34 @@
 import re
 
 
+def toarray(x):
+    """Convert an array-like to a np.ndarray
+
+    Parameters
+    ----------
+    x : array-like
+        Array-like to be converted
+
+    Returns
+    -------
+    x : np.ndarray
+    """
+    if isinstance(x, pd.SparseDataFrame):
+        x = x.to_coo().toarray()
+    elif isinstance(x, pd.DataFrame):
+        x = x.values
+    elif isinstance(x, sparse.spmatrix):
+        x = x.toarray()
+    elif isinstance(x, np.matrix):
+        x = np.array(x)
+    elif isinstance(x, np.ndarray):
+        pass
+    else:
+        raise TypeError("Expected pandas DataFrame, scipy sparse matrix or "
+                        "numpy matrix. Got {}".format(type(x)))
+    return x
+
+
 def matrix_any(condition):
     """Check if a condition is true anywhere in a data matrix
 
@@ -83,6 +111,11 @@ def select_rows(data, idx):
     ------
     UserWarning : if no rows are selected
     """
+    if isinstance(idx, pd.DataFrame):
+        if idx.shape[1] > 1:
+            raise ValueError(
+                "Expected idx to be 1D. Got shape {}".format(idx.shape))
+        idx = idx.iloc[:, 0]
     if isinstance(data, pd.DataFrame):
         try:
             data = data.loc[idx]
@@ -213,7 +246,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=False):
     data : data matrix, shape=[n_samples, n_features]
         Number of samples is the sum of numbers of samples of all batches.
         Number of features is the same as each of the batches.
-    sample_idx : list-like, shape=[n_samples]
+    sample_labels : list-like, shape=[n_samples]
         Batch labels corresponding to each sample
     """
     if not len(data) == len(batch_labels):
@@ -243,8 +276,8 @@ def combine_batches(data, batch_labels, append_to_cell_names=False):
         warnings.warn("append_to_cell_names only valid for pd.DataFrame input."
                       " Got {}".format(matrix_type.__name__), UserWarning)
 
-    sample_idx = np.concatenate([np.repeat(batch_labels[i], d.shape[0])
-                                 for i, d in enumerate(data)])
+    sample_labels = np.concatenate([np.repeat(batch_labels[i], d.shape[0])
+                                    for i, d in enumerate(data)])
     if issubclass(matrix_type, pd.DataFrame):
         if append_to_cell_names:
             index = np.concatenate(
@@ -259,4 +292,4 @@ def combine_batches(data, batch_labels, append_to_cell_names=False):
     elif issubclass(matrix_type, np.ndarray):
         data = np.vstack(data)
 
-    return data, sample_idx
+    return data, sample_labels
@@ -1,4 +1,4 @@
 # author: Scott Gigante <scott.gigante@yale.edu>
 # (C) 2018 Krishnaswamy Lab GPLv2
 
-__version__ = "0.4.0"
+__version__ = "0.5.0"