6 changes: 5 additions & 1 deletion README.md
@@ -10,10 +10,14 @@ Installing
----------
For usage and installation instructions, see the [install guide](docs/install.rst).

Examples
--------
Example usage of this package can be found in the [examples](examples) directory.

Contributing
------------
We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!

Level of Support
----------------
We plan to update this tool occasionally, with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
115 changes: 115 additions & 0 deletions examples/clustering_example.py
@@ -0,0 +1,115 @@
import sys
import pickle

import scanpy as sc
import pandas as pd
import numpy as np

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

import transcriptomic_clustering
from transcriptomic_clustering.iterative_clustering import (
    build_cluster_dict, iter_clust, OnestepKwargs
)

# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
adata = sc.read('path/to/your/data.h5ad')

# Normalize the data. Skip if adata.X is already normalized. This is the same as using sc.normalize_total(adata, target_sum=1e6) and sc.pp.log1p(adata)
adata = transcriptomic_clustering.normalize(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['X_scVI'] = np.asarray(scvi)

# Set up the clustering parameters
def setup_transcriptomic_clustering():
means_vars_kwargs = {
'low_thresh': 0.6931472,
'min_cells': 4
}
highly_variable_kwargs = {
'max_genes': 4000
}
pca_kwargs = { # not used if using a scvi latent space
'cell_select': 30000,
'n_comps': 50,
'svd_solver': 'randomized'
}
filter_pcs_kwargs = { # not used if using a scvi latent space
'known_components': None,
'similarity_threshold': 0.7,
'method': 'zscore',
'zth': 2,
'max_pcs': None,
}
filter_known_modes_kwargs = {
# 'known_modes': known_modes_df,
'similarity_threshold': 0.7
}
latent_kwargs = {
'latent_component': "X_scVI"
}
cluster_louvain_kwargs = {
'k': 15,
'nn_measure': 'euclidean',
'knn_method': 'annoy',
'louvain_method': 'taynaud',
'weighting_method': 'jaccard',
'n_jobs': 30,
'resolution': 1.0,
}
merge_clusters_kwargs = {
'thresholds': {
'q1_thresh': 0.5,
'q2_thresh': None,
'cluster_size_thresh': 10,
'qdiff_thresh': 0.7,
'padj_thresh': 0.05,
'lfc_thresh': 1,
'score_thresh': 100,
'low_thresh': 0.6931472,
'min_genes': 5
},
'k': 4,
'de_method': 'ebayes'
}
onestep_kwargs = OnestepKwargs(
means_vars_kwargs = means_vars_kwargs,
highly_variable_kwargs = highly_variable_kwargs,
pca_kwargs = pca_kwargs,
filter_pcs_kwargs = filter_pcs_kwargs,
filter_known_modes_kwargs = filter_known_modes_kwargs,
latent_kwargs = latent_kwargs,
cluster_louvain_kwargs = cluster_louvain_kwargs,
merge_clusters_kwargs = merge_clusters_kwargs
)
return onestep_kwargs

onestep_kwargs = setup_transcriptomic_clustering()

# Run the iterative clustering. A tmp folder is needed to store intermediate results.
clusters, markers = iter_clust(
adata,
min_samples=10,
onestep_kwargs=onestep_kwargs,
random_seed=123,
tmp_dir="/path/to/your/tmp"
)

# save the results in .pkl format, which will be used for final merging
with open('clustering_results.pkl', 'wb') as f:
pickle.dump(clusters, f)

with open('markers.pkl', 'wb') as f:
pickle.dump(markers, f)

# (Optional) save the clustering results in a data frame.
cluster_dict = build_cluster_dict(clusters)

adata.obs["scrattch_py_cluster"] = ""
for cluster in cluster_dict.keys():
adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster

adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')

res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
res.to_csv('/path/to/your/clustering_results.csv')
119 changes: 119 additions & 0 deletions examples/final_merging_example.py
@@ -0,0 +1,119 @@
import sys
import os
import pickle

import numpy as np
import pandas as pd
import scanpy as sc

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs

# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
adata = sc.read('path/to/your/adata.h5ad')

# Normalize the data. Skip if adata.X is already normalized.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['scVI'] = np.asarray(scvi)

# Loading clustering results
cl_pth = "/path/to/clustering_results"
with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
clusters = pickle.load(f)

# marker genes are only needed for computing PCA
with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
markers = pickle.load(f)

# The first four kwarg groups are only used when computing a PCA; modify latent_kwargs if using a pre-computed latent space
def setup_merging():
    pca_kwargs = {
        # 'cell_select': 30000,  # do not set this for final merging; cells must be sampled from each cluster when computing PCA
'n_comps': 50,
'svd_solver': 'randomized'
}
filter_pcs_kwargs = {
'known_components': None,
'similarity_threshold': 0.7,
'method': 'zscore',
'zth': 2,
'max_pcs': None}
filter_known_modes_kwargs = {
'known_modes': 'log2ngene',
'similarity_threshold': 0.7}
project_kwargs = {}
merge_clusters_kwargs = {
'thresholds': {
'q1_thresh': 0.5,
'q2_thresh': None,
'cluster_size_thresh': 10,
'qdiff_thresh': 0.7,
'padj_thresh': 0.05,
'lfc_thresh': 1,
'score_thresh': 100,
'low_thresh': 0.6931472,
'min_genes': 5
},
'k': 4,
'de_method': 'ebayes',
        'n_markers': None,  # None bypasses the marker-calculation step, which is the time-consuming part
}
latent_kwargs = {
'latent_component': "scVI" # None or a obsm in adata. if None: default is running pca, else use the latent_component in adata.obsm
}

merge_kwargs = FinalMergeKwargs(
pca_kwargs = pca_kwargs,
filter_pcs_kwargs = filter_pcs_kwargs,
filter_known_modes_kwargs = filter_known_modes_kwargs,
project_kwargs = project_kwargs,
merge_clusters_kwargs = merge_clusters_kwargs,
latent_kwargs = latent_kwargs
)
return merge_kwargs

merge_kwargs = setup_merging()

# Run the final merging
clusters_after_merging, markers_after_merging = final_merge(
adata,
clusters,
markers, # required for PCA, but optional if using a pre-computed latent space
n_samples_per_clust=20,
random_seed=2024,
    n_jobs=30,  # number of worker processes to use
    return_markers_df=False,  # if True, return the pair-wise DE results for each cluster pair; if False (default), return only a set of markers (top 20 up- and down-regulated genes per pair comparison)
final_merge_kwargs=merge_kwargs
)

out_dir = "/path/to/output"

with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
pickle.dump(clusters_after_merging, f)

# determine datatype for markers_after_merging and save
if markers_after_merging is None:
print("Skipped calculating markers. Did not save markers.")
elif isinstance(markers_after_merging, pd.DataFrame):
markers_after_merging.to_csv(os.path.join(out_dir,'markers_after_merging.csv'))
else:
with open(os.path.join(out_dir,'markers_after_merging.pkl'), 'wb') as f:
pickle.dump(markers_after_merging, f)

# Convert the clustering results to a .csv file (cluster ids are 1-based; cells not assigned to any cluster stay 'unknown')
cl = ['unknown'] * adata.n_obs
for i, cells in enumerate(clusters_after_merging, start=1):
    for j in cells:
        cl[j] = i
res = pd.DataFrame({'cl': cl}, index=adata.obs_names)
res.to_csv(os.path.join(out_dir,'clustering_results_after_merging.csv'))
3 changes: 2 additions & 1 deletion transcriptomic_clustering/__init__.py
@@ -29,5 +29,6 @@
from .cluster_means import get_cluster_means
from .merging import merge_clusters
from .diff_expression import de_pairs_chisq, vec_chisq_test
from .de_ebayes import de_pairs_ebayes
from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel
from .merging import merge_clusters
Copilot AI commented on Jul 16, 2025:

Duplicate import of merge_clusters; remove the redundant line to keep the module __init__.py clean.

Suggested change
from .merging import merge_clusters

from .final_merging import final_merge
90 changes: 90 additions & 0 deletions transcriptomic_clustering/de_ebayes.py
@@ -15,6 +15,9 @@

from .diff_expression import get_qdiff, filter_gene_stats, calc_de_score

import multiprocessing as mp
from functools import partial

logger = logging.getLogger(__name__)

"""
@@ -232,3 +235,90 @@ def de_pairs_ebayes(

de_pairs = pd.DataFrame(de_pairs).T
return de_pairs

def process_pair(cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds, pair):
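    """Compute eBayes-moderated DE statistics and scores for a single cluster pair (helper for de_pairs_ebayes_parallel)."""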
cluster_a, cluster_b = pair
# t-test with ebayes adjusted variances
means_diff = cl_means.loc[cluster_a] - cl_means.loc[cluster_b]
means_diff = means_diff.to_frame()
stdev_unscaled_comb = np.sqrt(np.sum(stdev_unscaled.loc[[cluster_a, cluster_b]] ** 2))[0]

df_total = df + df_prior
df_pooled = np.sum(df)
df_total = min(df_total, df_pooled)

t_vals = means_diff / np.sqrt(sigma_sq_post) / stdev_unscaled_comb

p_adj = np.ones((len(t_vals),))
p_vals = 2 * stats.t.sf(np.abs(t_vals[0]), df_total)
    reject, p_adj, alphacSidak, alphacBonf = multipletests(p_vals, alpha=de_thresholds['padj_thresh'], method='holm')
lfc = means_diff

# Get DE score
de_pair_stats = pd.DataFrame(index=cl_means.columns)
de_pair_stats['p_value'] = p_vals
de_pair_stats['p_adj'] = p_adj
de_pair_stats['lfc'] = lfc
de_pair_stats["meanA"] = cl_means.loc[cluster_a]
de_pair_stats["meanB"] = cl_means.loc[cluster_b]
de_pair_stats["q1"] = cl_present.loc[cluster_a]
de_pair_stats["q2"] = cl_present.loc[cluster_b]
de_pair_stats["qdiff"] = get_qdiff(cl_present.loc[cluster_a], cl_present.loc[cluster_b])

de_pair_up = filter_gene_stats(
de_stats=de_pair_stats,
gene_type='up-regulated',
cl1_size=cl_size[cluster_a],
cl2_size=cl_size[cluster_b],
**de_thresholds
)
up_score = calc_de_score(de_pair_up['p_adj'].values)

de_pair_down = filter_gene_stats(
de_stats=de_pair_stats,
gene_type='down-regulated',
cl1_size=cl_size[cluster_a],
cl2_size=cl_size[cluster_b],
**de_thresholds
)
down_score = calc_de_score(de_pair_down['p_adj'].values)

return (pair, {
'score': up_score + down_score,
'up_score': up_score,
'down_score': down_score,
'up_genes': de_pair_up.index.to_list(),
'down_genes': de_pair_down.index.to_list(),
'up_num': len(de_pair_up.index),
'down_num': len(de_pair_down.index),
'num': len(de_pair_up.index) + len(de_pair_down.index)
})

def de_pairs_ebayes_parallel(
pairs: List[Tuple[Any, Any]],
cl_means: pd.DataFrame,
cl_vars: pd.DataFrame,
cl_present: pd.DataFrame,
cl_size: Dict[Any, int],
de_thresholds: Dict[str, Any],
n_jobs: int = 1
) -> pd.DataFrame:
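    """Parallel version of de_pairs_ebayes: computes pair-wise DE scores for all cluster pairs using a multiprocessing pool with n_jobs workers."""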

logger.info('Fitting Variances')
sigma_sq, df, stdev_unscaled = get_linear_fit_vals(cl_vars, cl_size)
logger.info('Moderating Variances')
sigma_sq_post, var_prior, df_prior = moderate_variances(sigma_sq, df)

logger.info(f'Comparing {len(pairs)} pairs')

partial_process = partial(process_pair, cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds)

# Use multiprocessing to parallelize the process
with mp.Pool(processes=n_jobs) as pool:
results = pool.map(partial_process, pairs)

de_pairs = {pair: result for pair, result in results}
de_pairs = pd.DataFrame(de_pairs).T
return de_pairs
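
For reference, a minimal sketch of how the new parallel entry point might be called, assuming the cluster summary tables (cl_means, cl_vars, cl_present, cl_size) have already been built with the package's cluster-summary helpers and that the threshold keys match what filter_gene_stats accepts:

import itertools

pairs = list(itertools.combinations(cl_size.keys(), 2))  # compare every pair of clusters
de_pairs = de_pairs_ebayes_parallel(
    pairs=pairs,
    cl_means=cl_means,        # clusters x genes mean expression
    cl_vars=cl_vars,          # clusters x genes expression variance
    cl_present=cl_present,    # clusters x genes fraction of expressing cells
    cl_size=cl_size,          # cluster -> number of cells
    de_thresholds=de_thresholds,  # e.g. the 'thresholds' dict from merge_clusters_kwargs above (assumed compatible)
    n_jobs=8,                 # number of worker processes
)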
2 changes: 1 addition & 1 deletion transcriptomic_clustering/dimension_reduction.py
@@ -116,7 +116,7 @@ def pca(
_, vidx = adata._normalize_indices((slice(None), gene_mask)) # handle gene mask like anndata would
vidx_bool = np.zeros((adata.n_vars,), dtype=bool)
vidx_bool[vidx] = True
n_genes = len(vidx)
n_genes = sum(vidx)
Copilot AI commented on Jul 16, 2025:

Using sum(vidx) will sum the index positions rather than count the number of genes; consider using len(vidx) or vidx_bool.sum() to get the correct gene count.

Suggested change
n_genes = sum(vidx)
n_genes = len(vidx)

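A hedged aside on this suggestion: which call is correct depends on whether _normalize_indices returns integer positions or a boolean mask for vidx; counting on the boolean mask built just above is unambiguous either way. Illustrative only:

import numpy as np

vidx_int = np.array([2, 5, 7])        # integer positions: len() == 3 genes, sum() == 14 (not a count)
vidx_bool = np.zeros(10, dtype=bool)
vidx_bool[vidx_int] = True            # boolean mask: sum() == 3 genes, len() == 10 (total vars)
n_genes = int(vidx_bool.sum())        # counts selected genes in both cases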

# select n_comps
max_comps = min(n_cells, n_genes) - 1
8 changes: 7 additions & 1 deletion transcriptomic_clustering/filter_known_modes.py
@@ -10,7 +10,7 @@

def filter_known_modes(
projected_adata: ad.AnnData,
known_modes: Union[pd.DataFrame, pd.Series],
known_modes: Optional[str] = None,
similarity_threshold: Optional[float] = 0.7):
"""
Filters out principal components which correlate strongly with the known modes
@@ -27,6 +27,12 @@ def filter_known_modes(
projected_adata: after filtering out correlated principal components

"""
    # Determine whether known_modes names a column in adata.obs
if known_modes in projected_adata.obs.columns:
known_modes = projected_adata.obs[known_modes]
else:
raise ValueError(f'{known_modes} not found in adata.obs')

if isinstance(known_modes, pd.Series):
known_modes = known_modes.to_frame()

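A minimal usage sketch of the revised string-based known_modes API, assuming the known mode has been stored as an adata.obs column (the 'log2ngene' name mirrors the final-merging example above; the import path and the way the column is computed here are assumptions):

import numpy as np
from transcriptomic_clustering.filter_known_modes import filter_known_modes

# Hypothetical known-mode column: log2 of the per-cell detected-gene count
projected_adata.obs['log2ngene'] = np.log2(projected_adata.obs['n_genes'] + 1)

projected_adata = filter_known_modes(
    projected_adata,
    known_modes='log2ngene',        # name of an obs column, per the new signature
    similarity_threshold=0.7,
)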