Skip to content

Commit 1b852c3

Browse files
authored
Merge pull request #151 from JonathanShor/dev_4.1
Dev 4.1
2 parents d25393d + 1417437 commit 1b852c3

File tree

6 files changed

+98
-54
lines changed

6 files changed

+98
-54
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ubuntu-latest
1616
strategy:
1717
matrix:
18-
python-version: [3.6, 3.7]
18+
python-version: [3.6, 3.7, 3.8, 3.9]
1919

2020
steps:
2121
- uses: actions/checkout@v2

.pre-commit-config.yaml

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
repos:
2-
- repo: https://gitlab.com/pycqa/flake8
3-
rev: 3.7.7
4-
hooks:
5-
- id: flake8
6-
- repo: https://github.com/ambv/black
7-
rev: stable
8-
hooks:
9-
- id: black
10-
language_version: python3.7
2+
- repo: https://github.com/psf/black
3+
rev: 22.1.0
4+
hooks:
5+
- id: black
6+
- repo: https://github.com/PyCQA/flake8
7+
rev: 4.0.1
8+
hooks:
9+
- id: flake8
10+
- repo: https://github.com/pycqa/isort
11+
rev: 5.10.1
12+
hooks:
13+
- id: isort
14+
name: isort (python)
15+
additional_dependencies: [toml]

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ In `v2.5` we have added a new experimental clustering method (`scanpy`'s Louvain
5656

5757
## Tutorial
5858

59-
See our [jupyter notebook](https://nbviewer.jupyter.org/github/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_10k_vignette.ipynb) for an example on 10k PBMCs from 10x Genomics.
59+
See our [tutorial](https://doubletdetection.readthedocs.io/en/latest/tutorial.html) for an example on 10k PBMCs from 10x Genomics.
6060

6161
## Obtaining data
6262

doubletdetection/doubletdetection.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,13 @@ class BoostClassifier:
4242
normalizer ((sp_sparse) -> ndarray): Method to normalize raw_counts.
4343
Defaults to normalize_counts, included in this package. Note: To use
4444
normalize_counts with its pseudocount parameter changed from the
45-
default 0.1 value to some positive float `new_var`, use:
45+
default pseudocount value to some positive float `new_var`, use:
4646
normalizer=lambda counts: doubletdetection.normalize_counts(counts,
4747
pseudocount=new_var)
48+
pseudocount (int, optional): Pseudocount used in normalize_counts.
49+
If `1` is used, and `standard_scaling=False`, the classifier is
50+
much more memory efficient; however, this may result in fewer doublets
51+
detected.
4852
random_state (int, optional): If provided, passed to PCA and used to
4953
seed numpy's RNG. NOTE: PhenoGraph does not currently
5054
admit a random seed, and so this will not guarantee identical
@@ -87,6 +91,7 @@ def __init__(
8791
clustering_kwargs=None,
8892
n_iters=10,
8993
normalizer=None,
94+
pseudocount=0.1,
9095
random_state=0,
9196
verbose=False,
9297
standard_scaling=False,
@@ -101,6 +106,7 @@ def __init__(
101106
self.verbose = verbose
102107
self.standard_scaling = standard_scaling
103108
self.n_jobs = n_jobs
109+
self.pseudocount = pseudocount
104110

105111
if self.clustering_algorithm not in ["louvain", "phenograph", "leiden"]:
106112
raise ValueError(
@@ -297,7 +303,12 @@ def _one_fit(self):
297303
normed_synths = self._raw_synthetics.copy()
298304
inplace_csr_row_normalize_l1(normed_synths)
299305
aug_counts = sp_sparse.vstack((self._normed_raw_counts, normed_synths))
300-
aug_counts = np.log((aug_counts * np.median(aug_lib_size)).A + 0.1)
306+
scaled_aug_counts = aug_counts * np.median(aug_lib_size)
307+
if self.pseudocount != 1:
308+
aug_counts = np.log(scaled_aug_counts.A + 0.1)
309+
else:
310+
aug_counts = np.log1p(scaled_aug_counts)
311+
del scaled_aug_counts
301312

302313
aug_counts = anndata.AnnData(aug_counts)
303314
aug_counts.obs["n_counts"] = aug_lib_size
@@ -306,7 +317,14 @@ def _one_fit(self):
306317

307318
if self.verbose:
308319
print("Running PCA...")
309-
sc.tl.pca(aug_counts, n_comps=self.n_components, random_state=self.random_state)
320+
# "auto" solver faster for dense matrices
321+
solver = "arpack" if sp_sparse.issparse(aug_counts.X) else "auto"
322+
sc.tl.pca(
323+
aug_counts,
324+
n_comps=self.n_components,
325+
random_state=self.random_state,
326+
svd_solver=solver,
327+
)
310328
if self.verbose:
311329
print("Clustering augmented data set...\n")
312330
if self.clustering_algorithm == "phenograph":

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ packages = [
2020
{include = "doubletdetection"},
2121
]
2222
readme = "README.md"
23-
version = "4.0"
23+
version = "4.1"
2424

2525
[tool.poetry.dependencies]
2626
anndata = ">=0.6"

tests/notebooks/PBMC_10k_vignette.ipynb

Lines changed: 60 additions & 39 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)