6 changes: 5 additions & 1 deletion README.md
@@ -10,10 +10,14 @@ Installing
----------
For usage and installation instructions, see the [install guide](docs/install.rst).

Examples
--------
Example usage of this package can be found in the [examples](examples) directory.

Contributing
------------
We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!

Level of Support
----------------
We plan to update this tool occasionally, with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
115 changes: 115 additions & 0 deletions examples/clustering_example.py
@@ -0,0 +1,115 @@
import sys
import pickle

import scanpy as sc
import pandas as pd
import numpy as np

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

import transcriptomic_clustering
from transcriptomic_clustering.iterative_clustering import (
    build_cluster_dict, iter_clust, OnestepKwargs
)

# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
adata = sc.read('path/to/your/data.h5ad')

# Normalize the data. Skip if adata.X is already normalized. This is the same as using sc.normalize_total(adata, target_sum=1e6) and sc.pp.log1p(adata)
adata = transcriptomic_clustering.normalize(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['X_scVI'] = np.asarray(scvi)

# Set up the clustering parameters
def setup_transcriptomic_clustering():
means_vars_kwargs = {
'low_thresh': 0.6931472,
'min_cells': 4
}
highly_variable_kwargs = {
'max_genes': 4000
}
pca_kwargs = { # not used if using a scvi latent space
'cell_select': 30000,
'n_comps': 50,
'svd_solver': 'randomized'
}
filter_pcs_kwargs = { # not used if using a scvi latent space
'known_components': None,
'similarity_threshold': 0.7,
'method': 'zscore',
'zth': 2,
'max_pcs': None,
}
filter_known_modes_kwargs = {
# 'known_modes': known_modes_df,
'similarity_threshold': 0.7
}
latent_kwargs = {
'latent_component': "X_scVI"
}
cluster_louvain_kwargs = {
'k': 15,
'nn_measure': 'euclidean',
'knn_method': 'annoy',
'louvain_method': 'taynaud',
'weighting_method': 'jaccard',
'n_jobs': 30,
'resolution': 1.0,
}
merge_clusters_kwargs = {
'thresholds': {
'q1_thresh': 0.5,
'q2_thresh': None,
'cluster_size_thresh': 10,
'qdiff_thresh': 0.7,
'padj_thresh': 0.05,
'lfc_thresh': 1,
'score_thresh': 100,
'low_thresh': 0.6931472,
'min_genes': 5
},
'k': 4,
'de_method': 'ebayes'
}
onestep_kwargs = OnestepKwargs(
means_vars_kwargs = means_vars_kwargs,
highly_variable_kwargs = highly_variable_kwargs,
pca_kwargs = pca_kwargs,
filter_pcs_kwargs = filter_pcs_kwargs,
filter_known_modes_kwargs = filter_known_modes_kwargs,
latent_kwargs = latent_kwargs,
cluster_louvain_kwargs = cluster_louvain_kwargs,
merge_clusters_kwargs = merge_clusters_kwargs
)
return onestep_kwargs

onestep_kwargs = setup_transcriptomic_clustering()

# Run the iterative clustering. A tmp folder is needed to store intermediate results.
clusters, markers = iter_clust(
adata,
min_samples=10,
onestep_kwargs=onestep_kwargs,
random_seed=123,
tmp_dir="/path/to/your/tmp"
)

# save the results in .pkl format, which will be used for final merging
with open('clustering_results.pkl', 'wb') as f:
pickle.dump(clusters, f)

with open('markers.pkl', 'wb') as f:
pickle.dump(markers, f)

# (Optional) save the clustering results in a data frame.
cluster_dict = build_cluster_dict(clusters)

adata.obs["scrattch_py_cluster"] = ""
for cluster in cluster_dict.keys():
adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster

adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')

res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
res.to_csv('/path/to/your/clustering_results.csv')
119 changes: 119 additions & 0 deletions examples/final_merging_example.py
@@ -0,0 +1,119 @@
import sys
import os
import pickle

import numpy as np
import pandas as pd
import scanpy as sc

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs

# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
adata = sc.read('path/to/your/adata.h5ad')

# Normalize the data. Skip if adata.X is already normalized.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['scVI'] = np.asarray(scvi)

# Loading clustering results
cl_pth = "/path/to/clustering_results"
with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
clusters = pickle.load(f)

# marker genes are only needed for computing PCA
with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
markers = pickle.load(f)

# The first four kwarg groups are only used when computing a PCA; modify latent_kwargs if using a pre-computed latent space
def setup_merging():
    pca_kwargs = {
        # 'cell_select': 30000,  # do not set this for final merging; cells must be sampled from each cluster when computing PCA
'n_comps': 50,
'svd_solver': 'randomized'
}
filter_pcs_kwargs = {
'known_components': None,
'similarity_threshold': 0.7,
'method': 'zscore',
'zth': 2,
'max_pcs': None}
filter_known_modes_kwargs = {
'known_modes': 'log2ngene',
'similarity_threshold': 0.7}
project_kwargs = {}
merge_clusters_kwargs = {
'thresholds': {
'q1_thresh': 0.5,
'q2_thresh': None,
'cluster_size_thresh': 10,
'qdiff_thresh': 0.7,
'padj_thresh': 0.05,
'lfc_thresh': 1,
'score_thresh': 100,
'low_thresh': 0.6931472,
'min_genes': 5
},
'k': 4,
'de_method': 'ebayes',
        'n_markers': None,  # None bypasses the marker-calculation step, which is the time-consuming part
}
latent_kwargs = {
'latent_component': "scVI" # None or a obsm in adata. if None: default is running pca, else use the latent_component in adata.obsm
}

merge_kwargs = FinalMergeKwargs(
pca_kwargs = pca_kwargs,
filter_pcs_kwargs = filter_pcs_kwargs,
filter_known_modes_kwargs = filter_known_modes_kwargs,
project_kwargs = project_kwargs,
merge_clusters_kwargs = merge_clusters_kwargs,
latent_kwargs = latent_kwargs
)
return merge_kwargs

merge_kwargs = setup_merging()

# Run the final merging
clusters_after_merging, markers_after_merging = final_merge(
adata,
clusters,
markers, # required for PCA, but optional if using a pre-computed latent space
n_samples_per_clust=20,
random_seed=2024,
    n_jobs=30,  # number of worker processes to use
    return_markers_df=False,  # if True, return the pair-wise DE results for each cluster pair; if False (default), return only a set of markers (top 20 up- and down-regulated genes per pair comparison)
final_merge_kwargs=merge_kwargs
)

out_dir = "/path/to/output"

with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
pickle.dump(clusters_after_merging, f)

# determine datatype for markers_after_merging and save
if markers_after_merging is None:
print("Skipped calculating markers. Did not save markers.")
elif isinstance(markers_after_merging, pd.DataFrame):
markers_after_merging.to_csv(os.path.join(out_dir,'markers_after_merging.csv'))
else:
with open(os.path.join(out_dir,'markers_after_merging.pkl'), 'wb') as f:
pickle.dump(markers_after_merging, f)

# Convert the clustering results to a .csv file (cluster ids are 1-based; cells not assigned to any cluster stay 'unknown')
cl = ['unknown'] * adata.n_obs
for i, cells in enumerate(clusters_after_merging, start=1):
    for j in cells:
        cl[j] = i
res = pd.DataFrame({'cl': cl}, index=adata.obs_names)
res.to_csv(os.path.join(out_dir,'clustering_results_after_merging.csv'))
3 changes: 2 additions & 1 deletion transcriptomic_clustering/__init__.py
@@ -29,5 +29,6 @@
from .cluster_means import get_cluster_means
from .merging import merge_clusters
from .diff_expression import de_pairs_chisq, vec_chisq_test
from .de_ebayes import de_pairs_ebayes
from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel
from .merging import merge_clusters
Copilot AI commented on Jul 16, 2025:

Duplicate import of merge_clusters; remove the redundant line to keep the module __init__.py clean.

Suggested change
from .merging import merge_clusters

from .final_merging import final_merge
90 changes: 90 additions & 0 deletions transcriptomic_clustering/de_ebayes.py
@@ -15,6 +15,9 @@

from .diff_expression import get_qdiff, filter_gene_stats, calc_de_score

import multiprocessing as mp
from functools import partial

logger = logging.getLogger(__name__)

"""
@@ -232,3 +235,90 @@ def de_pairs_ebayes(

de_pairs = pd.DataFrame(de_pairs).T
return de_pairs

def process_pair(cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds, pair):
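    """Compute eBayes-moderated DE statistics and scores for a single cluster pair (helper for de_pairs_ebayes_parallel)."""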
cluster_a, cluster_b = pair
# t-test with ebayes adjusted variances
means_diff = cl_means.loc[cluster_a] - cl_means.loc[cluster_b]
means_diff = means_diff.to_frame()
stdev_unscaled_comb = np.sqrt(np.sum(stdev_unscaled.loc[[cluster_a, cluster_b]] ** 2))[0]

df_total = df + df_prior
df_pooled = np.sum(df)
df_total = min(df_total, df_pooled)

t_vals = means_diff / np.sqrt(sigma_sq_post) / stdev_unscaled_comb

p_adj = np.ones((len(t_vals),))
p_vals = 2 * stats.t.sf(np.abs(t_vals[0]), df_total)
    reject, p_adj, alphacSidak, alphacBonf = multipletests(p_vals, alpha=de_thresholds['padj_thresh'], method='holm')
lfc = means_diff

# Get DE score
de_pair_stats = pd.DataFrame(index=cl_means.columns)
de_pair_stats['p_value'] = p_vals
de_pair_stats['p_adj'] = p_adj
de_pair_stats['lfc'] = lfc
de_pair_stats["meanA"] = cl_means.loc[cluster_a]
de_pair_stats["meanB"] = cl_means.loc[cluster_b]
de_pair_stats["q1"] = cl_present.loc[cluster_a]
de_pair_stats["q2"] = cl_present.loc[cluster_b]
de_pair_stats["qdiff"] = get_qdiff(cl_present.loc[cluster_a], cl_present.loc[cluster_b])

de_pair_up = filter_gene_stats(
de_stats=de_pair_stats,
gene_type='up-regulated',
cl1_size=cl_size[cluster_a],
cl2_size=cl_size[cluster_b],
**de_thresholds
)
up_score = calc_de_score(de_pair_up['p_adj'].values)

de_pair_down = filter_gene_stats(
de_stats=de_pair_stats,
gene_type='down-regulated',
cl1_size=cl_size[cluster_a],
cl2_size=cl_size[cluster_b],
**de_thresholds
)
down_score = calc_de_score(de_pair_down['p_adj'].values)

return (pair, {
'score': up_score + down_score,
'up_score': up_score,
'down_score': down_score,
'up_genes': de_pair_up.index.to_list(),
'down_genes': de_pair_down.index.to_list(),
'up_num': len(de_pair_up.index),
'down_num': len(de_pair_down.index),
'num': len(de_pair_up.index) + len(de_pair_down.index)
})

def de_pairs_ebayes_parallel(
pairs: List[Tuple[Any, Any]],
cl_means: pd.DataFrame,
cl_vars: pd.DataFrame,
cl_present: pd.DataFrame,
cl_size: Dict[Any, int],
de_thresholds: Dict[str, Any],
n_jobs: int = 1
) -> pd.DataFrame:
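    """Parallel version of de_pairs_ebayes: computes pair-wise DE scores for all cluster pairs using a multiprocessing pool with n_jobs workers."""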

logger.info('Fitting Variances')
sigma_sq, df, stdev_unscaled = get_linear_fit_vals(cl_vars, cl_size)
logger.info('Moderating Variances')
sigma_sq_post, var_prior, df_prior = moderate_variances(sigma_sq, df)

logger.info(f'Comparing {len(pairs)} pairs')

partial_process = partial(process_pair, cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds)

# Use multiprocessing to parallelize the process
with mp.Pool(processes=n_jobs) as pool:
results = pool.map(partial_process, pairs)

de_pairs = {pair: result for pair, result in results}
de_pairs = pd.DataFrame(de_pairs).T
return de_pairs
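
For reference, a minimal sketch of how the new parallel entry point might be called, assuming the cluster summary tables (cl_means, cl_vars, cl_present, cl_size) have already been built with the package's cluster-summary helpers and that the threshold keys match what filter_gene_stats accepts:

import itertools

pairs = list(itertools.combinations(cl_size.keys(), 2))  # compare every pair of clusters
de_pairs = de_pairs_ebayes_parallel(
    pairs=pairs,
    cl_means=cl_means,        # clusters x genes mean expression
    cl_vars=cl_vars,          # clusters x genes expression variance
    cl_present=cl_present,    # clusters x genes fraction of expressing cells
    cl_size=cl_size,          # cluster -> number of cells
    de_thresholds=de_thresholds,  # e.g. the 'thresholds' dict from merge_clusters_kwargs above (assumed compatible)
    n_jobs=8,                 # number of worker processes
)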
2 changes: 1 addition & 1 deletion transcriptomic_clustering/dimension_reduction.py
@@ -116,7 +116,7 @@ def pca(
_, vidx = adata._normalize_indices((slice(None), gene_mask)) # handle gene mask like anndata would
vidx_bool = np.zeros((adata.n_vars,), dtype=bool)
vidx_bool[vidx] = True
n_genes = len(vidx)
n_genes = sum(vidx)
Copilot AI commented on Jul 16, 2025:

Using sum(vidx) will sum the index positions rather than count the number of genes; consider using len(vidx) or vidx_bool.sum() to get the correct gene count.

Suggested change
n_genes = sum(vidx)
n_genes = len(vidx)

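A hedged aside on this suggestion: which call is correct depends on whether _normalize_indices returns integer positions or a boolean mask for vidx; counting on the boolean mask built just above is unambiguous either way. Illustrative only:

import numpy as np

vidx_int = np.array([2, 5, 7])        # integer positions: len() == 3 genes, sum() == 14 (not a count)
vidx_bool = np.zeros(10, dtype=bool)
vidx_bool[vidx_int] = True            # boolean mask: sum() == 3 genes, len() == 10 (total vars)
n_genes = int(vidx_bool.sum())        # counts selected genes in both cases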

# select n_comps
max_comps = min(n_cells, n_genes) - 1
8 changes: 7 additions & 1 deletion transcriptomic_clustering/filter_known_modes.py
@@ -10,7 +10,7 @@

def filter_known_modes(
projected_adata: ad.AnnData,
known_modes: Union[pd.DataFrame, pd.Series],
known_modes: Optional[str] = None,
similarity_threshold: Optional[float] = 0.7):
"""
Filters out principal components which correlate strongly with the known modes
@@ -27,6 +27,12 @@ def filter_known_modes(
projected_adata: after filtering out correlated principal components

"""
    # Determine whether known_modes names a column in adata.obs
if known_modes in projected_adata.obs.columns:
known_modes = projected_adata.obs[known_modes]
else:
raise ValueError(f'{known_modes} not found in adata.obs')

if isinstance(known_modes, pd.Series):
known_modes = known_modes.to_frame()

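A minimal usage sketch of the revised string-based known_modes API, assuming the known mode has been stored as an adata.obs column (the 'log2ngene' name mirrors the final-merging example above; the import path and the way the column is computed here are assumptions):

import numpy as np
from transcriptomic_clustering.filter_known_modes import filter_known_modes

# Hypothetical known-mode column: log2 of the per-cell detected-gene count
projected_adata.obs['log2ngene'] = np.log2(projected_adata.obs['n_genes'] + 1)

projected_adata = filter_known_modes(
    projected_adata,
    known_modes='log2ngene',        # name of an obs column, per the new signature
    similarity_threshold=0.7,
)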