Skip to content

Commit 3a60166

Browse files
authored
Merge pull request #4 from KrishnaswamyLab/dev
scprep v0.5.0
2 parents 33047ff + 2b408d9 commit 3a60166

File tree

11 files changed

+1032
-50
lines changed

11 files changed

+1032
-50
lines changed

LICENSE

Lines changed: 674 additions & 0 deletions
Large diffs are not rendered by default.

python/scprep/filter.py

Lines changed: 137 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# (C) 2018 Krishnaswamy Lab GPLv2
33

44
import numpy as np
5+
import pandas as pd
56

67
from . import utils, measure
78

@@ -51,13 +52,17 @@ def remove_rare_genes(data, cutoff=0, min_cells=5):
5152
return data
5253

5354

54-
def remove_empty_cells(data):
55+
def remove_empty_cells(data, sample_labels=None):
5556
"""Remove all cells with zero library size
5657
5758
Parameters
5859
----------
5960
data : array-like, shape=[n_samples, n_features]
6061
Input data
62+
sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
63+
Labels associated with the rows of `data`. If provided, these
64+
will be filtered such that they retain a one-to-one mapping
65+
with the rows of the output data.
6166
6267
Returns
6368
-------
@@ -67,37 +72,141 @@ def remove_empty_cells(data):
6772
cell_sums = measure.library_size(data)
6873
keep_cells_idx = cell_sums > 0
6974
data = utils.select_rows(data, keep_cells_idx)
75+
if sample_labels is not None:
76+
sample_labels = sample_labels[keep_cells_idx]
77+
data = data, sample_labels
7078
return data
7179

7280

73-
def filter_library_size(data, cutoff=2000):
74-
"""Remove all cells with library size below a certain value
81+
def _get_filter_idx(data, values,
                    cutoff, percentile,
                    keep_cells):
    """Build the mask of cells to keep for a single threshold

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data. Not read here; accepted so the signature mirrors
        the public filter functions.
    values : list-like, shape=[n_samples]
        Value upon which to filter
    cutoff : float or None
        Forwarded to `measure._get_percentile_cutoff`
    percentile : int or None
        Forwarded to `measure._get_percentile_cutoff`
    keep_cells : {'above', 'below'}
        Keep cells strictly above or strictly below the threshold

    Returns
    -------
    keep_cells_idx : result of comparing `values` against the threshold

    Raises
    ------
    ValueError : if `keep_cells` is neither 'above' nor 'below'
    """
    # NOTE(review): presumably this resolves `cutoff` / `percentile` into a
    # single threshold; the helper's body is not visible here -- confirm.
    threshold = measure._get_percentile_cutoff(
        values, cutoff, percentile, required=True)
    if keep_cells == 'above':
        return values > threshold
    if keep_cells == 'below':
        return values < threshold
    raise ValueError("Expected `keep_cells` in ['above', 'below']. "
                     "Got {}".format(keep_cells))
7594

76-
It is recommended to use :func:`~scprep.plot.plot_library_size` to
95+
96+
def filter_values(data, values,
                  cutoff=None, percentile=None,
                  keep_cells='above', sample_labels=None,
                  filter_per_sample=False):
    """Remove all cells with `values` above or below a certain threshold

    It is recommended to use :func:`~scprep.plot.histogram` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    values : list-like, shape=[n_samples]
        Value upon which to filter. Plain lists and tuples are converted
        to `np.ndarray`; other array-likes are used as given.
    cutoff : float, optional (default: None)
        Value above or below which to remove cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int, optional (Default: None)
        Percentile above or below which to remove cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below'}, optional (default: 'above')
        Keep cells above or below the cutoff
    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
        Labels associated with the rows of `data`. If provided, these
        will be filtered such that they retain a one-to-one mapping
        with the rows of the output data. Plain lists and tuples are
        converted to `np.ndarray` so that they can be masked.
    filter_per_sample : bool, optional (default: False)
        If True, filters separately for each unique sample label. Only used
        if `sample_labels` is not `None` and `percentile` is given.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    sample_labels : list-like, shape=[m_samples]
        Filtered sample labels, if provided. In that case the return
        value is the tuple `(data, sample_labels)`.

    Raises
    ------
    ValueError : if `keep_cells` is not one of 'above' or 'below'
    """
    if isinstance(values, (list, tuple)):
        # plain sequences support neither elementwise comparison nor
        # boolean-mask indexing, both of which are needed below
        values = np.asarray(values)
    if isinstance(sample_labels, (list, tuple)):
        sample_labels = np.asarray(sample_labels)
    if filter_per_sample and percentile is not None and \
            sample_labels is not None:
        # filter separately and combine
        if isinstance(sample_labels, pd.Series):
            # convert directly so that a Series of labels does not depend
            # on which input types `utils.toarray` accepts
            sample_labels_array = np.asarray(sample_labels)
        else:
            sample_labels_array = utils.toarray(sample_labels)
        sample_labels_array = sample_labels_array.flatten()
        keep_cells_idx = np.full_like(
            sample_labels_array, True,
            dtype=bool)
        for label in np.unique(sample_labels_array):
            sample_idx = sample_labels_array == label
            # the percentile is evaluated within this sample only
            keep_cells_idx[sample_idx] = _get_filter_idx(
                utils.select_rows(data, sample_idx),
                values[sample_idx],
                cutoff, percentile, keep_cells)
        keep_cells_idx = keep_cells_idx.flatten()
    else:
        keep_cells_idx = _get_filter_idx(data, values,
                                         cutoff, percentile,
                                         keep_cells)
    data = utils.select_rows(data, keep_cells_idx)
    if sample_labels is not None:
        sample_labels = sample_labels[keep_cells_idx]
        data = data, sample_labels
    return data
95158

96159

160+
def filter_library_size(data, cutoff=None, percentile=None,
                        keep_cells='above', sample_labels=None,
                        filter_per_sample=False):
    """Remove all cells with library size above or below a certain threshold

    Thin wrapper: the per-cell library size is obtained from
    `measure.library_size` and the filtering itself is delegated to
    :func:`filter_values`.

    It is recommended to use :func:`~scprep.plot.plot_library_size` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    cutoff : float, optional (default: None)
        Minimum library size required to retain a cell. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int, optional (Default: None)
        Percentile above or below which to remove cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below'}, optional (default: 'above')
        Keep cells above or below the cutoff
    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
        Labels associated with the rows of `data`. If provided, these
        will be filtered such that they retain a one-to-one mapping
        with the rows of the output data.
    filter_per_sample : bool, optional (default: False)
        If True, filters separately for each unique sample label.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    sample_labels : list-like, shape=[m_samples]
        Filtered sample labels, if provided
    """
    return filter_values(
        data, measure.library_size(data),
        cutoff=cutoff, percentile=percentile,
        keep_cells=keep_cells,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample)
201+
202+
97203
def filter_gene_set_expression(data, genes,
98204
cutoff=None, percentile=None,
99-
keep_cells='below'):
100-
"""Remove cells with total expression of a gene set below a certain value
205+
library_size_normalize=True,
206+
keep_cells='below',
207+
sample_labels=None,
208+
filter_per_sample=False):
209+
"""Remove cells with total expression of a gene set above or below a certain threshold
101210
102211
It is recommended to use :func:`~scprep.plot.plot_gene_set_expression` to
103212
choose a cutoff prior to filtering.
@@ -111,22 +220,26 @@ def filter_gene_set_expression(data, genes,
111220
cutoff : float, optional (default: None)
112221
Value above or below which to remove cells. Only one of `cutoff`
113222
and `percentile` should be specified.
114-
percentile : int (Default: None)
223+
percentile : int, optional (Default: None)
115224
Percentile above or below which to remove cells.
116225
Must be an integer between 0 and 100. Only one of `cutoff`
117226
and `percentile` should be specified.
118-
keep_cells : {'above', 'below'}
227+
library_size_normalize : bool, optional (default: True)
228+
Divide gene set expression by library size
229+
keep_cells : {'above', 'below'}, optional (default: 'below')
119230
Keep cells above or below the cutoff
231+
sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
232+
Labels associated with the rows of `data`. If provided, these
233+
will be filtered such that they retain a one-to-one mapping
234+
with the rows of the output data.
235+
filter_per_sample : bool, optional (default: False)
236+
If True, filters separately for each unique sample label.
120237
"""
121-
cell_sums = measure.gene_set_expression(data, genes)
122-
cutoff = measure._get_percentile_cutoff(
123-
cell_sums, cutoff, percentile, required=True)
124-
if keep_cells == 'above':
125-
keep_cells_idx = cell_sums > cutoff
126-
elif keep_cells == 'below':
127-
keep_cells_idx = cell_sums < cutoff
128-
else:
129-
raise ValueError("Expected `keep_cells` in ['above', 'below']."
130-
"Got {}".format(keep_cells))
131-
data = utils.select_rows(data, keep_cells_idx)
132-
return data
238+
cell_sums = measure.gene_set_expression(
239+
data, genes,
240+
library_size_normalize=library_size_normalize)
241+
return filter_values(data, cell_sums,
242+
cutoff=cutoff, percentile=percentile,
243+
keep_cells=keep_cells,
244+
sample_labels=sample_labels,
245+
filter_per_sample=filter_per_sample)

python/scprep/measure.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def library_size(data):
3030
return library_size
3131

3232

33-
def gene_set_expression(data, genes):
33+
def gene_set_expression(data, genes, library_size_normalize=True):
3434
"""Measure the expression of a set of genes in each cell.
3535
3636
Parameters
@@ -39,14 +39,21 @@ def gene_set_expression(data, genes):
3939
Input data
4040
genes : list-like, shape<=[n_features]
4141
Integer column indices or string gene names included in gene set
42+
library_size_normalize : bool, optional (default: True)
43+
Divide gene set expression by library size
4244
4345
Returns
4446
-------
4547
gene_set_expression : list-like, shape=[n_samples]
4648
Sum over genes for each cell
4749
"""
4850
gene_data = select_cols(data, genes)
49-
return library_size(gene_data)
51+
gene_set_expression = library_size(gene_data)
52+
if library_size_normalize:
53+
libsize = library_size(data)
54+
libsize[libsize == 0] = 1
55+
gene_set_expression /= libsize * np.median(np.array(libsize))
56+
return gene_set_expression
5057

5158

5259
def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False):

python/scprep/plot.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def plot_library_size(data,
104104
def plot_gene_set_expression(data, genes,
105105
bins=100, log=False,
106106
cutoff=None, percentile=None,
107+
library_size_normalize=True,
107108
ax=None, figsize=None):
108109
"""Plot the histogram of the expression of a gene set.
109110
@@ -125,11 +126,14 @@ def plot_gene_set_expression(data, genes,
125126
percentile : float or `None`, optional (default: `None`)
126127
Percentile between 0 and 100 at which to draw a vertical line.
127128
Only one of `cutoff` and `percentile` may be given.
129+
library_size_normalize : bool, optional (default: True)
130+
Divide gene set expression by library size
128131
ax : `matplotlib.Axes` or None, optional (default: None)
129132
Axis to plot on. If None, a new axis will be created.
130133
figsize : tuple or None, optional (default: None)
131134
If not None, sets the figure size (width, height)
132135
"""
133-
histogram(measure.gene_set_expression(data, genes),
134-
cutoff=cutoff, percentile=percentile,
135-
bins=bins, log=log, ax=ax, figsize=figsize)
136+
histogram(measure.gene_set_expression(
137+
data, genes, library_size_normalize=library_size_normalize),
138+
cutoff=cutoff, percentile=percentile,
139+
bins=bins, log=log, ax=ax, figsize=figsize)

python/scprep/sanitize.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pandas as pd
66

77

8-
def check_numeric(data, dtype='float', copy=False):
8+
def check_numeric(data, dtype='float', copy=None):
99
"""Check a matrix contains only numeric data
1010
1111
Parameters
@@ -14,8 +14,9 @@ def check_numeric(data, dtype='float', copy=False):
1414
Input data
1515
dtype : str or `np.dtype`, optional (default: 'float')
1616
Data type to which to coerce the data
17-
copy : bool, optional (default: False)
18-
Copy the data before coercion.
17+
copy : bool or None, optional (default: None)
18+
Copy the data before coercion. If None, default to
19+
False for all datatypes except pandas.SparseDataFrame
1920
2021
Returns
2122
-------
@@ -26,6 +27,8 @@ def check_numeric(data, dtype='float', copy=False):
2627
------
2728
TypeError : if `data` cannot be coerced to `dtype`
2829
"""
30+
if copy is None:
31+
copy = isinstance(data, pd.SparseDataFrame)
2932
try:
3033
return data.astype(dtype, copy=copy)
3134
except TypeError as e:

python/scprep/utils.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,34 @@
66
import re
77

88

9+
def toarray(x):
    """Convert an array-like to a np.ndarray

    Parameters
    ----------
    x : array-like
        Array-like to be converted. Accepted types are pandas DataFrame
        (including the legacy SparseDataFrame, where available), pandas
        Series, scipy sparse matrix, numpy matrix, numpy array, list
        and tuple.

    Returns
    -------
    x : np.ndarray
        Dense array. An input that is already a plain `np.ndarray` is
        returned as is, without copying.

    Raises
    ------
    TypeError : if `x` is not one of the accepted types
    """
    # SparseDataFrame no longer exists in recent pandas releases; look it up
    # defensively so the remaining branches stay reachable there.
    sparse_frame_type = getattr(pd, 'SparseDataFrame', None)
    if sparse_frame_type is not None and isinstance(x, sparse_frame_type):
        return x.to_coo().toarray()
    if isinstance(x, pd.DataFrame):
        return x.values
    if isinstance(x, pd.Series):
        return np.asarray(x)
    if sparse.issparse(x):
        return x.toarray()
    # np.matrix subclasses np.ndarray, so it must be tested first
    if isinstance(x, np.matrix):
        return np.array(x)
    if isinstance(x, np.ndarray):
        return x
    if isinstance(x, (list, tuple)):
        return np.array(x)
    raise TypeError("Expected pandas DataFrame or Series, scipy sparse "
                    "matrix, numpy array or list. Got {}".format(type(x)))
35+
36+
937
def matrix_any(condition):
1038
"""Check if a condition is true anywhere in a data matrix
1139
@@ -83,6 +111,11 @@ def select_rows(data, idx):
83111
------
84112
UserWarning : if no rows are selected
85113
"""
114+
if isinstance(idx, pd.DataFrame):
115+
if idx.shape[1] > 1:
116+
raise ValueError(
117+
"Expected idx to be 1D. Got shape {}".format(idx.shape))
118+
idx = idx.iloc[:, 0]
86119
if isinstance(data, pd.DataFrame):
87120
try:
88121
data = data.loc[idx]
@@ -213,7 +246,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=False):
213246
data : data matrix, shape=[n_samples, n_features]
214247
Number of samples is the sum of numbers of samples of all batches.
215248
Number of features is the same as each of the batches.
216-
sample_idx : list-like, shape=[n_samples]
249+
sample_labels : list-like, shape=[n_samples]
217250
Batch labels corresponding to each sample
218251
"""
219252
if not len(data) == len(batch_labels):
@@ -243,8 +276,8 @@ def combine_batches(data, batch_labels, append_to_cell_names=False):
243276
warnings.warn("append_to_cell_names only valid for pd.DataFrame input."
244277
" Got {}".format(matrix_type.__name__), UserWarning)
245278

246-
sample_idx = np.concatenate([np.repeat(batch_labels[i], d.shape[0])
247-
for i, d in enumerate(data)])
279+
sample_labels = np.concatenate([np.repeat(batch_labels[i], d.shape[0])
280+
for i, d in enumerate(data)])
248281
if issubclass(matrix_type, pd.DataFrame):
249282
if append_to_cell_names:
250283
index = np.concatenate(
@@ -259,4 +292,4 @@ def combine_batches(data, batch_labels, append_to_cell_names=False):
259292
elif issubclass(matrix_type, np.ndarray):
260293
data = np.vstack(data)
261294

262-
return data, sample_idx
295+
return data, sample_labels

python/scprep/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# author: Scott Gigante <scott.gigante@yale.edu>
22
# (C) 2018 Krishnaswamy Lab GPLv2
33

4-
__version__ = "0.4.0"
4+
__version__ = "0.5.0"

0 commit comments

Comments
 (0)