11"""Doublet detection in single-cell RNA-seq data."""
22
33import collections
4+ import io
45import warnings
6+ from contextlib import redirect_stdout
57
68import anndata
79import numpy as np
810import phenograph
911import scipy .sparse as sp_sparse
10- import tables
1112import scanpy as sc
12- from scipy .io import mmread
1313from scipy .sparse import csr_matrix
1414from scipy .stats import hypergeom
1515from sklearn .utils import check_array
1616from sklearn .utils .sparsefuncs_fast import inplace_csr_row_normalize_l1
1717from tqdm .auto import tqdm
1818
1919
def load_10x_h5(file, genome):
    """Load a count matrix stored in 10x Genomics H5 format.

    Adapted from:
    https://support.10xgenomics.com/single-cell-gene-expression/software/
    pipelines/latest/advanced/h5_matrices

    Args:
        file (str): Path to H5 file
        genome (str): genome, top level h5 group

    Returns:
        ndarray: Raw count matrix.
        ndarray: Barcodes
        ndarray: Gene names
    """
    with tables.open_file(file, "r") as h5:
        try:
            node = h5.get_node(h5.root, genome)
        except tables.NoSuchNodeError:
            # NOTE(review): on a missing genome this returns a bare None
            # instead of a 3-tuple; callers must check before unpacking.
            print("That genome does not exist in this file.")
            return None
        gene_names = getattr(node, "gene_names").read()
        barcodes = getattr(node, "barcodes").read()
        # Reassemble the CSC matrix from its three stored component arrays.
        data, indices, indptr = (
            getattr(node, part).read() for part in ("data", "indices", "indptr")
        )
        shape = getattr(node, "shape").read()
        counts = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)

    return counts, barcodes, gene_names
52-
53-
def load_mtx(file):
    """Load a count matrix stored in Matrix Market (mtx) format.

    The on-disk matrix is transposed so that rows become cells and
    columns become genes, then converted to CSC sparse format.

    Args:
        file (str): Path to mtx file

    Returns:
        ndarray: Raw count matrix.
    """
    matrix = mmread(file)
    transposed = np.transpose(matrix)
    return transposed.tocsc()
66-
67-
6820class BoostClassifier :
6921 """Classifier for doublets in single-cell RNA-seq data.
7022
@@ -162,8 +114,6 @@ def __init__(
162114 if use_phenograph is True :
163115 if "prune" not in phenograph_parameters :
164116 phenograph_parameters ["prune" ] = True
165- if ("verbosity" not in phenograph_parameters ) and (not self .verbose ):
166- phenograph_parameters ["verbosity" ] = 1
167117 self .phenograph_parameters = phenograph_parameters
168118 if (self .n_iters == 1 ) and (phenograph_parameters .get ("prune" ) is True ):
169119 warn_msg = (
@@ -238,9 +188,7 @@ def fit(self, raw_counts):
238188 self .all_log_p_values_ = np .zeros ((self .n_iters , self ._num_cells ))
239189 all_communities = np .zeros ((self .n_iters , self ._num_cells ))
240190 all_parents = []
241- all_synth_communities = np .zeros (
242- (self .n_iters , int (self .boost_rate * self ._num_cells ))
243- )
191+ all_synth_communities = np .zeros ((self .n_iters , int (self .boost_rate * self ._num_cells )))
244192
245193 for i in tqdm (range (self .n_iters )):
246194 if self .verbose :
@@ -284,9 +232,7 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):
284232 """
285233 log_p_thresh = np .log (p_thresh )
286234 if self .n_iters > 1 :
287- with np .errstate (
288- invalid = "ignore"
289- ): # Silence numpy warning about NaN comparison
235+ with np .errstate (invalid = "ignore" ): # Silence numpy warning about NaN comparison
290236 self .voting_average_ = np .mean (
291237 np .ma .masked_invalid (self .all_log_p_values_ ) <= log_p_thresh , axis = 0
292238 )
@@ -298,20 +244,34 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):
298244 # Find a cutoff score
299245 potential_cutoffs = np .unique (self .all_scores_ [~ np .isnan (self .all_scores_ )])
300246 if len (potential_cutoffs ) > 1 :
301- max_dropoff = (
302- np .argmax (potential_cutoffs [1 :] - potential_cutoffs [:- 1 ]) + 1
303- )
247+ max_dropoff = np .argmax (potential_cutoffs [1 :] - potential_cutoffs [:- 1 ]) + 1
304248 else : # Most likely pathological dataset, only one (or no) clusters
305249 max_dropoff = 0
306250 self .suggested_score_cutoff_ = potential_cutoffs [max_dropoff ]
307- with np .errstate (
308- invalid = "ignore"
309- ): # Silence numpy warning about NaN comparison
251+ with np .errstate (invalid = "ignore" ): # Silence numpy warning about NaN comparison
310252 self .labels_ = self .all_scores_ [0 , :] >= self .suggested_score_cutoff_
311253 self .labels_ [np .isnan (self .all_scores_ )[0 , :]] = np .nan
312254
313255 return self .labels_
314256
257+ def doublet_score (self ):
258+ """Produce doublet scores
259+
260+ The doublet score is the average negative log p-value of doublet enrichment
261+ averaged over the iterations. Higher means more likely to be doublet.
262+
263+ Returns:
264+ scores (ndarray, ndims=1): Average negative log p-value over iterations
265+ """
266+
267+ if self .n_iters > 1 :
268+ with np .errstate (invalid = "ignore" ): # Silence numpy warning about NaN comparison
269+ avg_log_p = np .mean (np .ma .masked_invalid (self .all_log_p_values_ ), axis = 0 )
270+ else :
271+ avg_log_p = self .all_log_p_values_ [0 ]
272+
273+ return - avg_log_p
274+
315275 def _one_fit (self ):
316276 if self .verbose :
317277 print ("\n Creating synthetic doublets..." )
@@ -347,19 +307,22 @@ def _one_fit(self):
347307 if self .verbose :
348308 print ("Clustering augmented data set...\n " )
349309 if self .use_phenograph :
350- fullcommunities , _ , _ = phenograph .cluster (
351- aug_counts .obsm ["X_pca" ], ** self .phenograph_parameters
352- )
310+ f = io .StringIO ()
311+ with redirect_stdout (f ):
312+ fullcommunities , _ , _ = phenograph .cluster (
313+ aug_counts .obsm ["X_pca" ], ** self .phenograph_parameters
314+ )
315+ out = f .getvalue ()
316+ if self .verbose :
317+ print (out )
353318 else :
354319 sc .pp .neighbors (
355320 aug_counts ,
356321 random_state = self .random_state ,
357322 method = "umap" ,
358323 n_neighbors = 10 ,
359324 )
360- sc .tl .louvain (
361- aug_counts , random_state = self .random_state , resolution = 4 , directed = False
362- )
325+ sc .tl .louvain (aug_counts , random_state = self .random_state , resolution = 4 , directed = False )
363326 fullcommunities = np .array (aug_counts .obs ["louvain" ], dtype = int )
364327 min_ID = min (fullcommunities )
365328 self .communities_ = fullcommunities [: self ._num_cells ]
@@ -380,8 +343,7 @@ def _one_fit(self):
380343 orig_cells_per_comm = collections .Counter (self .communities_ )
381344 community_IDs = orig_cells_per_comm .keys ()
382345 community_scores = {
383- i : float (synth_cells_per_comm [i ])
384- / (synth_cells_per_comm [i ] + orig_cells_per_comm [i ])
346+ i : float (synth_cells_per_comm [i ]) / (synth_cells_per_comm [i ] + orig_cells_per_comm [i ])
385347 for i in community_IDs
386348 }
387349 scores = np .array ([community_scores [i ] for i in self .communities_ ])
@@ -412,9 +374,7 @@ def _createDoublets(self):
412374 num_synths = int (self .boost_rate * self ._num_cells )
413375
414376 # Parent indices
415- choices = np .random .choice (
416- self ._num_cells , size = (num_synths , 2 ), replace = self .replace
417- )
377+ choices = np .random .choice (self ._num_cells , size = (num_synths , 2 ), replace = self .replace )
418378 parents = [list (p ) for p in choices ]
419379
420380 parent0 = self ._raw_counts [choices [:, 0 ], :]