@@ -23,64 +23,51 @@ class BoostClassifier:
2323 """Classifier for doublets in single-cell RNA-seq data.
2424
2525 Parameters:
26- boost_rate (float, optional): Proportion of cell population size to
27- produce as synthetic doublets.
28- n_components (int, optional): Number of principal components used for
29- clustering.
30- n_top_var_genes (int, optional): Number of highest variance genes to
31- use; other genes discarded. Will use all genes when zero.
32- replace (bool, optional): If False, a cell will be selected as a
33- synthetic doublet's parent no more than once.
34- self.clustering_algorithm (str, optional): One of `["louvain", "leiden",
35- "phenograph"]`. `"louvain"` and `leiden` refer to the scanpy implementations.
36- clustering_kwargs (dict, optional): Keyword args to pass directly
37- to clusering algorithm. Note that we change the PhenoGraph 'prune' default to
38- True. We also set `directed=False` and `resolution=4` for Louvain
39- and Leiden clustering. You must specifically include these params here
40- to change them. `random_state` and `key_added` should not be overriden
41- when clustering algorithm is Louvain or Leiden.
42- n_iters (int, optional): Number of fit operations from which to collect
43- p-values. Defualt value is 25.
44- normalizer ((sp_sparse) -> ndarray): Method to normalize raw_counts.
45- Defaults to normalize_counts, included in this package. Note: To use
46- normalize_counts with its pseudocount parameter changed from the
47- default pseudocount value to some positive float `new_var`, use:
48- normalizer=lambda counts: doubletdetection.normalize_counts(counts,
49- pseudocount=new_var)
50- pseudocount (int, optional): Pseudocount used in normalize_counts.
51- If `1` is used, and `standard_scaling=False`, the classifier is
52- much more memory efficient; however, this may result in fewer doublets
53- detected.
54- random_state (int, optional): If provided, passed to PCA and used to
55- seedrandom seed numpy's RNG. NOTE: PhenoGraph does not currently
56- admit a random seed, and so this will not guarantee identical
57- results across runs.
58- verbose (bool, optional): Set to False to silence all normal operation
59- informational messages. Defaults to True.
60- standard_scaling (bool, optional): Set to True to enable standard scaling
61- of normalized count matrix prior to PCA. Recommended when not using
62- Phenograph. Defaults to False.
63- n_jobs (int, optional): Number of jobs to use. Speeds up neighbor computation.
26+ boost_rate: Proportion of cell population size to produce as synthetic doublets.
27+ n_components: Number of principal components used for clustering.
28+ n_top_var_genes: Number of highest variance genes to use. Other genes are
29+ discarded. Will use all genes when zero.
30+ replace: If False, a cell will be selected as a synthetic doublet's parent
31+ no more than once.
32+ clustering_algorithm: One of "louvain", "leiden", or "phenograph". "louvain"
33+ and "leiden" refer to the scanpy implementations.
34+ clustering_kwargs: Keyword args to pass directly to clustering algorithm.
35+ Note that PhenoGraph 'prune' default is changed to True. For Louvain and
36+ Leiden clustering, we set `directed=False` and `resolution=4`. Include
37+ these params explicitly to change them. Do not override `random_state`
38+ and `key_added` for Louvain/Leiden.
39+ n_iters: Number of fit operations from which to collect p-values. Default is 25.
40+ normalizer: Method to normalize raw_counts. Defaults to normalize_counts from
41+ this package. To use normalize_counts with a different pseudocount value,
42+ use: `lambda counts: doubletdetection.normalize_counts(counts,
43+ pseudocount=new_value)`
44+ pseudocount: Pseudocount used in normalize_counts. Using 1 with
45+ standard_scaling=False makes the classifier more memory efficient but may
46+ detect fewer doublets.
47+ random_state: Passed to PCA and doublet parent creation. Note: PhenoGraph does not
48+ support random seeds, so identical results aren't guaranteed across runs.
49+ verbose: Set to False to silence informational messages. Defaults to True.
50+ standard_scaling: Enable standard scaling of normalized count matrix prior to
51+ PCA. Recommended when not using PhenoGraph. Defaults to False.
52+ n_jobs: Number of jobs to use. Speeds up neighbor computation.
6453
6554 Attributes:
66- all_log_p_values_ (ndarray) : Hypergeometric test natural log p-value per
67- cell for cluster enrichment of synthetic doublets. Use for tresholding .
55+ all_log_p_values_: Hypergeometric test natural log p-value per cell for
56+ cluster enrichment of synthetic doublets. Use for thresholding.
6857 Shape (n_iters, num_cells).
69- all_scores_ (ndarray): The fraction of a cell's cluster that is
70- synthetic doublets. Shape (n_iters, num_cells).
71- communities_ (ndarray): Cluster ID for corresponding cell. Shape
72- (n_iters, num_cells).
73- labels_ (ndarray, ndims=1): 0 for singlet, 1 for detected doublet.
74- parents_ (list of sequences of int): Parent cells' indexes for each
75- synthetic doublet. A list wrapping the results from each run.
76- suggested_score_cutoff_ (float): Cutoff used to classify cells when
77- n_iters == 1 (scores >= cutoff). Not produced when n_iters > 1.
78- synth_communities_ (sequence of ints): Cluster ID for corresponding
79- synthetic doublet. Shape (n_iters, num_cells * boost_rate).
80- top_var_genes_ (ndarray): Indices of the n_top_var_genes used. Not
81- generated if n_top_var_genes <= 0.
82- voting_average_ (ndarray): Fraction of iterations each cell is called a
83- doublet.
58+ all_scores_: The fraction of a cell's cluster that is synthetic doublets.
59+ Shape (n_iters, num_cells).
60+ communities_: Cluster ID for corresponding cell. Shape (n_iters, num_cells).
61+ labels_: 0 for singlet, 1 for detected doublet.
62+ parents_: Parent cells' indexes for each synthetic doublet. A list wrapping
63+ the results from each run.
64+ suggested_score_cutoff_: Cutoff used to classify cells when n_iters == 1
65+ (scores >= cutoff). Not produced when n_iters > 1.
66+ synth_communities_: Cluster ID for corresponding synthetic doublet.
67+ Shape (n_iters, num_cells * boost_rate).
68+ top_var_genes_: Indices of the n_top_var_genes used. Not generated if
69+ n_top_var_genes <= 0.
70+ voting_average_: Fraction of iterations each cell is called a doublet.
8471 """
8572
8673 def __init__ (
@@ -148,7 +135,7 @@ def fit(self, raw_counts: NDArray | sp_sparse.csr_matrix) -> "BoostClassifier":
148135 """Fits the classifier on raw_counts.
149136
150137 Args:
151- raw_counts (array-like) : Count matrix, oriented cells by genes.
138+ raw_counts: Count matrix, oriented cells by genes.
152139
153140 Sets:
154141 all_scores_, all_log_p_values_, communities_,
@@ -229,9 +216,9 @@ def predict(self, p_thresh: float = 1e-7, voter_thresh: float = 0.9) -> NDArray:
229216 """Produce doublet calls from fitted classifier.
230217
231218 Args:
232- p_thresh (float, optional) : hypergeometric test p-value threshold
219+ p_thresh: Hypergeometric test p-value threshold
233220 that determines per-iteration doublet calls.
234- voter_thresh (float, optional) : fraction of iterations a cell must
221+ voter_thresh: Fraction of iterations in which a cell must
235222 be called a doublet.
236223
237224 Sets:
0 commit comments