22# (C) 2018 Krishnaswamy Lab GPLv2
33
44import numpy as np
5+ import pandas as pd
56
67from . import utils , measure
78
@@ -51,13 +52,17 @@ def remove_rare_genes(data, cutoff=0, min_cells=5):
5152 return data
5253
5354
def remove_empty_cells(data, sample_labels=None):
    """Remove all cells with zero library size

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
        Labels associated with the rows of `data`. If provided, these
        will be filtered such that they retain a one-to-one mapping
        with the rows of the output data.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    sample_labels : list-like, shape=[m_samples]
        Filtered sample labels. Only returned (as the second element of a
        ``(data, sample_labels)`` tuple) if `sample_labels` was provided.
        Labels given as a plain list or tuple are returned as a NumPy array.
    """
    cell_sums = measure.library_size(data)
    keep_cells_idx = cell_sums > 0
    data = utils.select_rows(data, keep_cells_idx)
    if sample_labels is not None:
        # Index the labels positionally with a plain NumPy mask: a pandas
        # mask would be aligned on its index rather than on row position.
        label_mask = np.asarray(keep_cells_idx, dtype=bool).ravel()
        # Built-in sequences do not support boolean-mask indexing.
        if isinstance(sample_labels, (list, tuple)):
            sample_labels = np.asarray(sample_labels)
        sample_labels = sample_labels[label_mask]
        data = data, sample_labels
    return data
7179
7280
73- def filter_library_size (data , cutoff = 2000 ):
74- """Remove all cells with library size below a certain value
def _get_filter_idx(data, values, cutoff, percentile, keep_cells):
    """Compare `values` against a threshold to decide which cells to keep

    Parameters
    ----------
    data : array-like
        Not read by this function; present only as part of the signature.
    values : list-like, shape=[n_samples]
        Per-cell values to compare against the threshold
    cutoff : float or None
        Absolute threshold, forwarded to `measure._get_percentile_cutoff`
    percentile : int or None
        Percentile threshold, forwarded to `measure._get_percentile_cutoff`
    keep_cells : {'above', 'below'}
        Keep cells strictly above or strictly below the threshold

    Returns
    -------
    keep_cells_idx : result of the elementwise comparison of `values`
        with the resolved threshold

    Raises
    ------
    ValueError
        If `keep_cells` is neither 'above' nor 'below'
    """
    # The threshold is resolved before `keep_cells` is validated, so any
    # error raised by the cutoff helper takes precedence.
    threshold = measure._get_percentile_cutoff(
        values, cutoff, percentile, required=True)
    if keep_cells == 'above':
        return values > threshold
    if keep_cells == 'below':
        return values < threshold
    message = "Expected `keep_cells` in ['above', 'below']. Got {}"
    raise ValueError(message.format(keep_cells))
7594
76- It is recommended to use :func:`~scprep.plot.plot_library_size` to
95+
def filter_values(data, values,
                  cutoff=None, percentile=None,
                  keep_cells='above', sample_labels=None,
                  filter_per_sample=False):
    """Remove all cells with `values` above or below a certain threshold

    It is recommended to use :func:`~scprep.plot.histogram` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    values : list-like, shape=[n_samples]
        Value upon which to filter
    cutoff : float, optional (default: None)
        Value above or below which to remove cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int, optional (Default: None)
        Percentile above or below which to remove cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below'}, optional (default: 'above')
        Keep cells above or below the cutoff
    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
        Labels associated with the rows of `data`. If provided, these
        will be filtered such that they retain a one-to-one mapping
        with the rows of the output data.
    filter_per_sample : bool, optional (default: False)
        If True, filters separately for each unique sample label. Only used
        if `sample_labels` is not `None` and `percentile` is given.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    sample_labels : list-like, shape=[m_samples]
        Filtered sample labels. Only returned (as the second element of a
        ``(data, sample_labels)`` tuple) if `sample_labels` was provided.
        Labels given as a plain list or tuple are returned as a NumPy array.
    """
    # Built-in sequences support neither elementwise comparison nor
    # boolean-mask indexing, so promote them to arrays up front.
    if isinstance(values, (list, tuple)):
        values = np.asarray(values)
    if filter_per_sample and percentile is not None and \
            sample_labels is not None:
        # filter separately and combine
        sample_labels_array = utils.toarray(sample_labels).flatten()
        keep_cells_idx = np.ones(sample_labels_array.shape, dtype=bool)
        for label in np.unique(sample_labels_array):
            sample_idx = sample_labels_array == label
            # `_get_filter_idx` never reads its `data` argument, so the
            # per-label row subset of `data` is not computed.
            keep_cells_idx[sample_idx] = _get_filter_idx(
                None, values[sample_idx],
                cutoff, percentile, keep_cells)
    else:
        keep_cells_idx = _get_filter_idx(data, values,
                                         cutoff, percentile,
                                         keep_cells)
    data = utils.select_rows(data, keep_cells_idx)
    if sample_labels is not None:
        # Index the labels positionally with a plain NumPy mask: a pandas
        # mask would be aligned on its index rather than on row position.
        label_mask = np.asarray(keep_cells_idx, dtype=bool).ravel()
        if isinstance(sample_labels, (list, tuple)):
            sample_labels = np.asarray(sample_labels)
        sample_labels = sample_labels[label_mask]
        data = data, sample_labels
    return data
95158
96159
def filter_library_size(data, cutoff=None, percentile=None,
                        keep_cells='above', sample_labels=None,
                        filter_per_sample=False):
    """Filter cells by library size

    The library size of every cell is computed with
    `measure.library_size` and the filtering itself is delegated to
    `filter_values`.

    It is recommended to use :func:`~scprep.plot.plot_library_size` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    cutoff : float, optional (default: None)
        Library size above or below which to remove cells. Only one of
        `cutoff` and `percentile` should be specified.
    percentile : int, optional (Default: None)
        Percentile above or below which to remove cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below'}, optional (default: 'above')
        Keep cells above or below the cutoff
    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
        Labels associated with the rows of `data`. If provided, these
        will be filtered such that they retain a one-to-one mapping
        with the rows of the output data.
    filter_per_sample : bool, optional (default: False)
        If True, filters separately for each unique sample label.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    sample_labels : list-like, shape=[m_samples]
        Filtered sample labels, if provided
    """
    library_sizes = measure.library_size(data)
    filter_options = dict(
        cutoff=cutoff,
        percentile=percentile,
        keep_cells=keep_cells,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample,
    )
    return filter_values(data, library_sizes, **filter_options)
201+
202+
def filter_gene_set_expression(data, genes,
                               cutoff=None, percentile=None,
                               library_size_normalize=True,
                               keep_cells='below',
                               sample_labels=None,
                               filter_per_sample=False):
    """Filter cells by total expression of a gene set

    The per-cell gene set expression is computed with
    `measure.gene_set_expression` and the filtering itself is delegated
    to `filter_values`.

    It is recommended to use :func:`~scprep.plot.plot_gene_set_expression` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    genes : list-like
        Genes making up the gene set; forwarded unchanged to
        `measure.gene_set_expression`
    cutoff : float, optional (default: None)
        Value above or below which to remove cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int, optional (Default: None)
        Percentile above or below which to remove cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    library_size_normalize : bool, optional (default: True)
        Divide gene set expression by library size
    keep_cells : {'above', 'below'}, optional (default: 'below')
        Keep cells above or below the cutoff
    sample_labels : list-like or None, optional, shape=[n_samples] (default: None)
        Labels associated with the rows of `data`. If provided, these
        will be filtered such that they retain a one-to-one mapping
        with the rows of the output data.
    filter_per_sample : bool, optional (default: False)
        If True, filters separately for each unique sample label.

    Returns
    -------
    Whatever `filter_values` returns for the computed gene set expression
    """
    gene_set_totals = measure.gene_set_expression(
        data, genes, library_size_normalize=library_size_normalize)
    filter_options = dict(
        cutoff=cutoff,
        percentile=percentile,
        keep_cells=keep_cells,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample,
    )
    return filter_values(data, gene_set_totals, **filter_options)
0 commit comments