diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b6a02ad --- /dev/null +++ b/.gitignore @@ -0,0 +1,205 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml + +tests/data diff --git a/modiscolite/__init__.py b/fastermodiscolite/__init__.py similarity index 100% rename from modiscolite/__init__.py rename to fastermodiscolite/__init__.py diff --git a/modiscolite/affinitymat.py b/fastermodiscolite/affinitymat.py similarity index 83% rename from modiscolite/affinitymat.py rename to fastermodiscolite/affinitymat.py index 0310d48..87f1e9f 100644 --- a/modiscolite/affinitymat.py +++ b/fastermodiscolite/affinitymat.py @@ -10,8 +10,11 @@ import scipy from scipy.sparse import coo_matrix +import numba as nb from numba import njit from numba import prange +from numba_progress import ProgressBar, ProgressBarType + from . import util from . 
import gapped_kmer @@ -44,7 +47,7 @@ def _sparse_vv_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, i, return dot @njit(parallel=True) -def _sparse_mm_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, k): +def _sparse_mm_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, k, stranded, progress_proxy): n_rows = len(Y_indptr) - 1 neighbors = np.empty((n_rows, k), dtype='int32') @@ -55,18 +58,23 @@ def _sparse_mm_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, k): for j in range(n_rows): xdot = _sparse_vv_dot(X_data, X_indices, X_indptr, X_data, X_indices, X_indptr, i, j) - ydot = _sparse_vv_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, i, j) - dot[j] = max(xdot, ydot) + if stranded: + dot[j] = xdot + else: + ydot = _sparse_vv_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, i, j) + dot[j] = max(xdot, ydot) dot_argsort = np.argsort(-dot, kind='mergesort')[:k] neighbors[i] = dot_argsort sims[i] = dot[dot_argsort] + progress_proxy.update(1) + return sims, neighbors def cosine_similarity_from_seqlets(seqlets, n_neighbors, sign, topn=20, - min_k=4, max_k=6, max_gap=15, max_len=15, max_entries=500, - alphabet_size=4): + min_k=4, max_k=6, max_gap=15, max_len=15, max_entries=500, + alphabet_size=4, stranded=False, verbose=False): X_fwd = gapped_kmer._seqlet_to_gkmers(seqlets, topn, min_k, max_k, max_gap, max_len, max_entries, True, sign) @@ -79,11 +87,14 @@ def cosine_similarity_from_seqlets(seqlets, n_neighbors, sign, topn=20, n, d = X.shape k = min(n_neighbors+1, n) - return _sparse_mm_dot(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, k) + total = (len(Y.indptr) - 1) + with ProgressBar(total=total, disable=not verbose) as progress: + return _sparse_mm_dot(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, k, stranded, progress) -def jaccard_from_seqlets(seqlets, min_overlap, filter_seqlets=None, - seqlet_neighbors=None): + +def jaccard_from_seqlets(seqlets, min_overlap, filter_seqlets=None, seqlet_neighbors=None, + stranded=False, verbose=False): all_fwd_data, all_rev_data = util.get_2d_data_from_patterns(seqlets) @@ -102,19 +113,23 @@ def jaccard_from_seqlets(seqlets, min_overlap, filter_seqlets=None, affmat_fwd = jaccard(seqlet_neighbors=seqlet_neighbors, X=filters_all_fwd_data, Y=all_fwd_data, min_overlap=min_overlap, func=int, - return_sparse=True) + return_sparse=True, verbose=verbose) - affmat_rev = jaccard(seqlet_neighbors=seqlet_neighbors, - X=filters_all_rev_data, Y=all_fwd_data, - min_overlap=min_overlap, func=int, - return_sparse=True) + if stranded: + affmat = affmat_fwd + else: + affmat_rev = jaccard(seqlet_neighbors=seqlet_neighbors, + X=filters_all_rev_data, Y=all_fwd_data, + min_overlap=min_overlap, func=int, + return_sparse=True, verbose=verbose) + + affmat = np.maximum(affmat_fwd, affmat_rev) - affmat = np.maximum(affmat_fwd, affmat_rev) return affmat def jaccard(X, Y, min_overlap=None, seqlet_neighbors=None, func=np.ceil, - return_sparse=False): + return_sparse=False, verbose=False): if seqlet_neighbors is None: seqlet_neighbors = np.tile(np.arange(X.shape[0]), (Y.shape[0], 1)) @@ -133,7 +148,10 @@ def jaccard(X, Y, min_overlap=None, seqlet_neighbors=None, func=np.ceil, seqlet_neighbors = seqlet_neighbors.astype('int32') scores = np.zeros((Y.shape[0], seqlet_neighbors.shape[1], len_output), dtype='float32') - _jaccard(X, Y, seqlet_neighbors, scores) + total = Y.shape[0] + + with ProgressBar(total=total, disable=not verbose) as progress: + _jaccard(X, Y, seqlet_neighbors, scores, 
progress) if return_sparse == True: return scores.max(axis=-1) @@ -141,6 +159,7 @@ def jaccard(X, Y, min_overlap=None, seqlet_neighbors=None, func=np.ceil, argmaxs = np.argmax(scores, axis=-1) idxs = np.arange(seqlet_neighbors.shape[1]) results = np.zeros((Y.shape[0], seqlet_neighbors.shape[1], 2)) + for i in range(Y.shape[0]): results[i, :, 0] = scores[i][idxs, argmaxs[i]] results[i, :, 1] = argmaxs[i] - n_pad @@ -183,8 +202,8 @@ def pairwise_jaccard(X, k): return jaccards, neighbors -@njit('void(float32[:, :, :], float32[:, :, :], int32[:, :], float32[:, :, :])', parallel=True) -def _jaccard(X, Y, neighbors, scores): +@njit(nb.void(nb.float32[:, :, :], nb.float32[:, :, :], nb.int32[:, :], nb.float32[:, :, :], ProgressBarType), parallel=True) +def _jaccard(X, Y, neighbors, scores, progress_proxy): nx, d, m = X.shape ny = Y.shape[0] len_output = scores.shape[-1] @@ -214,6 +233,7 @@ def _jaccard(X, Y, neighbors, scores): scores[l, i, idx] = min_sum / max_sum + progress_proxy.update(1) def pearson_correlation(X, Y, min_overlap=None, func=np.ceil): diff --git a/modiscolite/aggregator.py b/fastermodiscolite/aggregator.py similarity index 95% rename from modiscolite/aggregator.py rename to fastermodiscolite/aggregator.py index 876fc3e..dbab06a 100644 --- a/modiscolite/aggregator.py +++ b/fastermodiscolite/aggregator.py @@ -9,6 +9,7 @@ from . import util from collections import OrderedDict +from tqdm import tqdm from sklearn.metrics import roc_auc_score @@ -68,9 +69,9 @@ def _expand_seqlets_to_fill_pattern(pattern, track_set, left_flank_to_add, def _align_patterns(parent_pattern, child_pattern, metric, min_overlap, - transformer, include_hypothetical): + transformer, include_hypothetical, stranded=False): - fwd_data_parent, rev_data_parent = util.get_2d_data_from_patterns( + fwd_data_parent, _ = util.get_2d_data_from_patterns( [parent_pattern], transformer=transformer, include_hypothetical=include_hypothetical) @@ -84,7 +85,7 @@ def _align_patterns(parent_pattern, child_pattern, metric, min_overlap, best_crossmetric_rev, best_crossmetric_argmax_rev = metric(rev_data_child, fwd_data_parent, min_overlap).squeeze() - if best_crossmetric_rev > best_crossmetric: + if (best_crossmetric_rev > best_crossmetric) and (not stranded): return int(best_crossmetric_argmax_rev), True, best_crossmetric_rev else: return int(best_crossmetric_argmax), False, best_crossmetric @@ -92,13 +93,13 @@ def _align_patterns(parent_pattern, child_pattern, metric, min_overlap, def merge_in_seqlets_filledges(parent_pattern, seqlets_to_merge, track_set, metric, min_overlap, transformer='l1', - include_hypothetical=True): + include_hypothetical=True, stranded=False): parent_pattern = parent_pattern.copy() for seqlet in seqlets_to_merge: alnmt, revcomp_match, alnmt_score = _align_patterns(parent_pattern, - seqlet, metric, min_overlap, transformer, include_hypothetical) + seqlet, metric, min_overlap, transformer, include_hypothetical, stranded) if revcomp_match: seqlet = seqlet.revcomp() @@ -173,10 +174,10 @@ def _detect_spurious_merging(patterns, track_set, perplexity, min_in_subcluster, min_overlap, prob_and_pertrack_sim_merge_thresholds, prob_and_pertrack_sim_dealbreaker_thresholds, min_frac, min_num, flank_to_add, window_size, bg_freq, - n_seeds, max_seqlets_subsample=1000): + n_seeds, max_seqlets_subsample=1000, stranded=False, verbose=False): to_return = [] - for i, pattern in enumerate(patterns): + for i, pattern in enumerate(tqdm(patterns, desc="Detecting spurious merging of patterns:", disable=not verbose)): if 
len(pattern.seqlets) > min_in_subcluster: pattern.compute_subpatterns(perplexity=perplexity, n_seeds=n_seeds) @@ -186,7 +187,7 @@ def _detect_spurious_merging(patterns, track_set, perplexity, prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, min_frac=min_frac, min_num=min_num, flank_to_add=flank_to_add, window_size=window_size, - bg_freq=bg_freq, max_seqlets_subsample=1000) + bg_freq=bg_freq, max_seqlets_subsample=max_seqlets_subsample, stranded=stranded) to_return.extend(refined_subpatterns[0]) else: @@ -197,13 +198,13 @@ def _detect_spurious_merging(patterns, track_set, perplexity, prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, min_frac=min_frac, min_num=min_num, flank_to_add=flank_to_add, window_size=window_size, - bg_freq=bg_freq, max_seqlets_subsample=1000) + bg_freq=bg_freq, max_seqlets_subsample=max_seqlets_subsample, stranded=stranded) def SimilarPatternsCollapser(patterns, track_set, min_overlap, prob_and_pertrack_sim_merge_thresholds, prob_and_pertrack_sim_dealbreaker_thresholds, min_frac, min_num, flank_to_add, window_size, bg_freq, - max_seqlets_subsample=1000): + max_seqlets_subsample=1000, stranded=False): patterns = [x.copy() for x in patterns] merge_hierarchy_levels = [] @@ -256,7 +257,8 @@ def SimilarPatternsCollapser(patterns, track_set, metric=affinitymat.pearson_correlation, min_overlap=min_overlap, include_hypothetical=False, - transformer='magnitude') + transformer='magnitude', + stranded=stranded) pairwise_sims[i, j] = aligner_sim @@ -384,7 +386,8 @@ def SimilarPatternsCollapser(patterns, track_set, metric=affinitymat.pearson_correlation, min_overlap=min_overlap, transformer='magnitude', - track_set=track_set) + track_set=track_set, + stranded=stranded) new_pattern = polish_pattern(new_pattern, min_frac=min_frac, min_num=min_num, track_set=track_set, flank=flank_to_add, @@ -501,5 +504,4 @@ def SimilarPatternsCollapser(patterns, track_set, current_level_nodes = next_level_nodes - return patterns, PatternMergeHierarchy(root_nodes=current_level_nodes) - \ No newline at end of file + return patterns, PatternMergeHierarchy(root_nodes=current_level_nodes) \ No newline at end of file diff --git a/modiscolite/bed_writer.py b/fastermodiscolite/bed_writer.py similarity index 100% rename from modiscolite/bed_writer.py rename to fastermodiscolite/bed_writer.py diff --git a/fastermodiscolite/cli.py b/fastermodiscolite/cli.py new file mode 100644 index 0000000..3f550df --- /dev/null +++ b/fastermodiscolite/cli.py @@ -0,0 +1,497 @@ +# tf-modisco command-line tool +# Author: Jacob Schreiber , Ivy Raine + +from pathlib import Path +from typing import List, Literal, Union, Optional + +import click +import h5py +import numpy as np + +import fastermodiscolite +from fastermodiscolite.util import calculate_window_offsets, MemeDataType + + +def _split_chroms(chroms: str) -> Union[List[str], Literal["*"]]: + """Convert comma-delimited ``chroms`` argument to a list or '*' literal.""" + return "*" if chroms == "*" else chroms.split(",") + + +@click.group( + help="""TF-MoDISco is a motif detection algorithm that takes in nucleotide +sequence and their neural-network attribution scores, then extracts motifs that +are repeatedly enriched for attribution signal across the dataset. 
Use the +sub-commands below to run motif discovery, generate reports, or convert result +files between formats.""" +) +def cli() -> None: + pass + + +@cli.command(help="Run TF-MoDISco and extract the motifs.") +@click.option( + "-s", + "--sequences", + 'seq_path', + type=click.Path(exists=True), + help="A .npy or .npz file containing the one-hot encoded sequences.", +) +@click.option( + "-a", + "--attributions", + 'attr_path', + type=click.Path(exists=True), + help="A .npy or .npz file containing the hypothetical attributions, i.e., the attributions for all nucleotides at all positions." +) +@click.option( + "-i", + "--h5py", + 'h5_path', + type=click.Path(exists=True), + help="Legacy HDF5 file that stores both sequences and attribution scores.", +) +@click.option( + "-n", + "--max-seqlets", + type=int, + required=True, + help="The maximum number of seqlets per metacluster." +) +@click.option( + "-l", + "--n-leiden", + type=int, + default=50, + show_default=True, + help="The number of Leiden clusterings to perform with different random seeds." +) +@click.option( + "-w", + "--window", + type=int, + default=400, + show_default=True, + help="The window surrounding the peak center that will be considered for motif discovery." +) +@click.option( + "-z", + "--size", + "sliding", + type=int, + default=20, + show_default=True, + help="The size of the seqlet cores, corresponding to `sliding_window_size`." +) +@click.option( + "-t", + "--trim-size", + type=int, + default=30, + show_default=True, + help="The size to trim to, corresponding to `trim_to_window_size`." +) +@click.option( + "-f", + "--seqlet-flank-size", + type=int, + default=5, + show_default=True, + help="The size of the flanks to add to each seqlet, corresponding to `flank_size`." +) +@click.option( + "-g", + "--initial-flank-to-add", + type=int, + default=10, + show_default=True, + help="The size of the flanks to add to each pattern initially, corresponding to `initial_flank_to_add`." +) +@click.option( + "-j", + "--final-flank-to-add", + type=int, + default=0, + show_default=True, + help="The size of the flanks to add to each pattern at the end, corresponding to `final_flank_to_add`." +) +@click.option( + "-o", + "--output", + type=click.Path(), + default="modisco_results.h5", + show_default=True, + help="Path to the output HDF5 file.", +) +@click.option( + "-@", + "--num-cores", + type=int, + default=-1, + show_default=True, + help="Number of CPU cores to use (-1 for all available).", +) +@click.option( + "-r", + "--stranded", + is_flag=True, + help="Treat input as stranded so do not add reverse-complement.", +) +@click.option( + "-p", + "--pattern-type", + type=click.Choice(["both", "pos", "neg"]), + default="both", + show_default=True, + help="Which pattern signs to compute: `both`, `pos`, or `neg`. 
Default is `both`.", +) +@click.option("-v", "--verbose", is_flag=True) +def motifs( + seq_path: str, + attr_path: str, + h5_path: str, + max_seqlets: int, + n_leiden: int, + window: int, + sliding: int, + trim_size: int, + seqlet_flank_size: int, + initial_flank_to_add: int, + final_flank_to_add: int, + output: str, + num_cores: int, + pattern_type: str, + stranded: bool, + verbose: bool, +): + """Run TF-MoDISco and extract the motifs.""" + if h5_path: + f = h5py.File(h5_path, "r") + try: + center = f["hyp_scores"].shape[1] // 2 + start, end = calculate_window_offsets(center, window) + attributions = f["hyp_scores"][..., :][..., start:end, :] + sequences = f["input_seqs"][..., :][..., start:end, :] + except KeyError: + center = f["shap"]["seq"].shape[2] // 2 + start, end = calculate_window_offsets(center, window) + attributions = f["shap"]["seq"][..., :, start:end].transpose(0, 2, 1) + sequences = f["raw"]["seq"][..., :, start:end].transpose(0, 2, 1) + f.close() + else: + seq_path = Path(seq_path) + attr_path = Path(attr_path) + sequences = ( + np.load(seq_path)["arr_0"] + if seq_path.suffix == ".npz" + else np.load(seq_path) + ) + attributions = ( + np.load(attr_path)["arr_0"] + if attr_path.suffix == ".npz" + else np.load(attr_path) + ) + center = sequences.shape[2] // 2 + start, end = calculate_window_offsets(center, window) + sequences = sequences[:, :, start:end].transpose(0, 2, 1) + attributions = attributions[:, :, start:end].transpose(0, 2, 1) + + if sequences.shape[1] < window: + raise ValueError( + f"Window ({window}) cannot be longer than the sequence length ({sequences.shape[1]})." + ) + + pos_patterns, neg_patterns = fastermodiscolite.tfmodisco.TFMoDISco( + one_hot=sequences.astype("float32"), + hypothetical_contribs=attributions.astype("float32"), + max_seqlets_per_metacluster=max_seqlets, + sliding_window_size=sliding, + flank_size=seqlet_flank_size, + trim_to_window_size=trim_size, + initial_flank_to_add=initial_flank_to_add, + final_flank_to_add=final_flank_to_add, + target_seqlet_fdr=0.05, + n_leiden_runs=n_leiden, + num_cores=num_cores, + pattern_type=pattern_type, + stranded=stranded, + verbose=verbose, + ) + fastermodiscolite.io.save_hdf5(output, pos_patterns, neg_patterns, window) + + +@cli.command( + help="Create an HTML report (logos + optional TOMTOM tables) from an HDF5 results file." +) +@click.option( + "-i", + "--h5py", + "h5_path", + type=click.Path(exists=True), + required=True, + help="HDF5 file containing the output from modiscolite.", +) +@click.option( + "-o", + "--output", + type=click.Path(), + required=True, + help="Directory to write the HTML report and associated assets.", +) +@click.option( + "-t", + "--write-tomtom", + is_flag=True, + help="Write the TOMTOM results to the output directory if flag is given.", +) +@click.option( + "-s", + "--suffix", + default="./", + show_default=True, + help="The suffix to add to the beginning of images. Should be equal to the output if using a Jupyter notebook.", +) +@click.option( + "-m", + "--meme-db", + type=click.Path(exists=True), + default=None, + help="A MEME file containing motifs.", +) +@click.option( + "-n", + "--n-matches", + default=3, + type=int, + show_default=True, + help="The number of top TOMTOM matches to include in the report.", +) +@click.option( + "-l", + "--lite", + is_flag=True, + help="Whether to use tomtom-lite when mapping patterns to motifs. 
Note that this also changes the distance function from correlation to Euclidean distance, and so the best motif may differ when there are many similar versions.", +) +@click.option( + "-@", + "--num-cores", + type=int, + default=-1, + show_default=True, + help="Number of CPU cores to use (-1 for all available).", +) +@click.option("-v", "--verbose", is_flag=True) +def report( + h5_path, output, write_tomtom, suffix, meme_db, n_matches, lite, num_cores, verbose +): + """Generate an interactive HTML motif report.""" + fastermodiscolite.report.report_motifs( + h5_path, + output, + img_path_suffix=suffix, + meme_motif_db=meme_db, + is_writing_tomtom_matrix=write_tomtom, + top_n_matches=n_matches, + ttl=lite, + num_cores=num_cores, + verbose=verbose + ) + + +@cli.command(help="Convert an old HDF5 file to the new format.") +@click.option( + "-i", + "--h5py", + "h5_path", + type=click.Path(exists=True), + required=True, + help="An HDF5 file formatted in the old way.", +) +@click.option( + "-o", + "--output", + type=click.Path(), + required=True, + help="An HDF5 file formatted in the new way.", +) +def convert(h5_path, output): + """Convert old HDF5 to new format.""" + fastermodiscolite.io.convert(h5_path, output) + + +@cli.command(help="Convert a new HDF5 file back to the legacy format.") +@click.option( + "-i", + "--h5py", + "h5_path", + type=click.Path(exists=True), + required=True, + help="An HDF5 file formatted in the new way.", +) +@click.option( + "-o", + "--output", + type=click.Path(), + required=True, + help="An HDF5 file formatted in the old way.", +) +def convert_backward(h5_path, output): + """Convert new HDF5 to original legacy format.""" + fastermodiscolite.io.convert_new_to_old(h5_path, output) + + +@cli.command(help="Write a MEME file from a results HDF5.") +@click.option( + "-i", + "--h5py", + "h5_path", + type=click.Path(exists=True), + help="An HDF5 file containing the output from modiscolite.", +) +@click.option( + "-t", + "--datatype", + type=MemeDataType, + required=True, + help="""A case-sensitive string specifying the desired data of the output file., +The options are as follows: +- 'PFM': The position-frequency matrix. +- 'CWM': The contribution-weight matrix. +- 'hCWM': The hypothetical contribution-weight matrix; hypothetical + contribution scores are the contributions of nucleotides not encoded + by the one-hot encoding sequence. +- 'CWM-PFM': The softmax of the contribution-weight matrix. +- 'hCWM-PFM': The softmax of the hypothetical contribution-weight matrix.""" +) +@click.option( + "-o", + "--output", + type=click.Path(), + default=None, + help="The path to the output file.", +) +@click.option("-q", "--quiet", is_flag=True, help="Suppress output to stdout.") +def meme(h5_path, datatype, output, quiet): + fastermodiscolite.io.write_meme_from_h5(h5_path, datatype, output, quiet) + + +@cli.command(help="Output a BED file of seqlets from a modisco results file to stdout (default) and/or to a file (if specified).") +@click.option( + "-i", + "--h5py", + "h5_path", + type=click.Path(exists=True), + required=True, + help="An HDF5 file containing the output from modiscolite.", +) +@click.option( + "-o", + "--output", + type=click.Path(), + default=None, + help="The path to the output file.", +) +@click.option( + "-p", + "--peaksfile", + type=click.Path(exists=True), + required=True, + help="The path to the peaks file. 
This is to compute the absolute start and\
+end positions of the seqlets within a reference genome, as well as the chroms.",
+)
+@click.option(
+    "-c",
+    "--chroms",
+    callback=lambda _, __, val: _split_chroms(val),
+    required=True,
+    help="""A comma-delimited list of chromosomes, or '*', denoting which
+chromosomes to process. Should be the same set of chromosomes used during
+interpretation. '*' will use every chr in the provided peaks file.
+Examples: 'chr1,chr2,chrX' || '*' || '1,2,X'.""",
+)
+@click.option("-q", "--quiet", is_flag=True, help="Suppress output to stdout.")
+@click.option(
+    "-w",
+    "--windowsize",
+    type=int,
+    default=None,
+    help="""Optional. This is for backwards compatibility for older modisco h5
+files that don't contain the window size as an attribute. This should be set
+to the size of the window around the peak center that was used.""",
+)
+def seqlet_bed(
+    h5_path: str,
+    output: Optional[str],
+    peaksfile: str,
+    chroms: Union[List[str], Literal["*"]],
+    quiet: bool,
+    windowsize: Optional[int],
+) -> None:
+    """Output a BED file of seqlets from a modisco results file."""
+    fastermodiscolite.io.write_bed_from_h5(
+        h5_path, peaksfile, output, chroms, windowsize, quiet
+    )
+
+
+@cli.command(help="Output a FASTA file of seqlets from a modisco results file to stdout (default) and/or to a file (if specified).")
+@click.option(
+    "-i",
+    "--h5py",
+    "h5_path",
+    type=click.Path(exists=True),
+    required=True,
+    help="An HDF5 file containing the output from modiscolite.",
+)
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(),
+    default=None,
+    help="The path to the output file.",
+)
+@click.option(
+    "-p",
+    "--peaksfile",
+    type=click.Path(exists=True),
+    required=True,
+    help="The path to the peaks file. This is to compute the absolute start and\
+end positions of the seqlets within a reference genome, as well as the chroms.",
+)
+@click.option(
+    "-s",
+    "--sequences",
+    type=click.Path(exists=True),
+    required=True,
+    help="A .npy or .npz file containing the one-hot encoded sequences.",
+)
+@click.option(
+    "-c",
+    "--chroms",
+    callback=lambda _, __, val: _split_chroms(val),
+    required=True,
+    help="""A comma-delimited list of chromosomes, or '*', denoting which
+chromosomes to process. Should be the same set of chromosomes used during
+interpretation. '*' will use every chr in the provided peaks file.
+Examples: 'chr1,chr2,chrX' || '*' || '1,2,X'.""",
+)
+@click.option("-q", "--quiet", is_flag=True, help="Suppress output to stdout.")
+@click.option(
+    "-w",
+    "--windowsize",
+    type=int,
+    default=None,
+    help="""Optional. This is for backwards compatibility for older modisco h5
+files that don't contain the window size as an attribute.
This should be set +the to size of the window around the peak center that was used for.""", +) +def seqlet_fasta( + h5_path: str, + output: Optional[str], + peaksfile: str, + sequences: str, + chroms: Union[List[str], Literal["*"]], + quiet: bool, + windowsize: Optional[int], +) -> None: + fastermodiscolite.io.write_fasta_from_h5( + h5_path, peaksfile, sequences, output, chroms, windowsize, quiet + ) diff --git a/fastermodiscolite/cluster.py b/fastermodiscolite/cluster.py new file mode 100644 index 0000000..49ac0ee --- /dev/null +++ b/fastermodiscolite/cluster.py @@ -0,0 +1,65 @@ +# cluster.py +# Authors: Jacob Schreiber +# adapted from code written by Avanti Shrikumar + +import logging +import joblib +import leidenalg +import numpy as np +import igraph as ig + + +def _LeidenCluster(affinity_mat, seed=100, n_leiden_iterations=-1): + n_vertices = affinity_mat.shape[0] + n_cols = affinity_mat.indptr + sources = np.concatenate( + [ + np.ones(n_cols[i + 1] - n_cols[i], dtype="int32") * i + for i in range(n_vertices) + ] + ) + + g = ig.Graph(directed=None) + g.add_vertices(n_vertices) + g.add_edges(zip(sources, affinity_mat.indices)) + + partition = leidenalg.find_partition( + graph=g, + partition_type=leidenalg.ModularityVertexPartition, + weights=affinity_mat.data, + n_iterations=n_leiden_iterations, + initial_membership=None, + seed=seed, + ) + + quality = np.array(partition.quality()) + membership = np.array(partition.membership) + + return membership, quality + + +def LeidenClusterParallel( + affinity_mat, n_seeds=2, n_leiden_iterations=-1, n_jobs=-1, verbose=False +): + parallel_leiden_results = joblib.Parallel( + n_jobs=n_jobs, verbose=100 if verbose else 0 + )( + joblib.delayed(_LeidenCluster)( + affinity_mat, seed=100 * seed, n_leiden_iterations=n_leiden_iterations + ) + for seed in range(1, n_seeds + 1) + ) + + logger = logging.getLogger("modisco-lite") + best_quality = None + best_clustering = None + + for seed, (membership, quality) in enumerate(parallel_leiden_results): + if verbose: + logger.info(f"Leiden clustering quality for seed {seed}: {quality}") + + if best_quality is None or quality > best_quality: + best_quality = quality + best_clustering = membership + + return best_clustering diff --git a/modiscolite/core.py b/fastermodiscolite/core.py similarity index 97% rename from modiscolite/core.py rename to fastermodiscolite/core.py index 639531b..66ba9e4 100644 --- a/modiscolite/core.py +++ b/fastermodiscolite/core.py @@ -150,8 +150,8 @@ def compute_subpatterns(self, perplexity, n_seeds, n_iterations=-1): sp_density_adapted_affmat /= np.sum(sp_density_adapted_affmat.data) #Do Leiden clustering - self.subclusters = cluster.LeidenCluster(sp_density_adapted_affmat, - n_seeds=n_seeds, n_leiden_iterations=n_iterations) + self.subclusters = cluster.LeidenClusterParallel(sp_density_adapted_affmat, + n_seeds=n_seeds, n_leiden_iterations=n_iterations, verbose=False) #this method assumes all the seqlets have been expanded so they # all start at 0 diff --git a/modiscolite/extract_seqlets.py b/fastermodiscolite/extract_seqlets.py similarity index 94% rename from modiscolite/extract_seqlets.py rename to fastermodiscolite/extract_seqlets.py index 3ccf82b..a470ea1 100644 --- a/modiscolite/extract_seqlets.py +++ b/fastermodiscolite/extract_seqlets.py @@ -2,6 +2,7 @@ # Authors: Jacob Schreiber # adapted from code written by Avanti Shrikumar +import logging import numpy as np from . 
import core @@ -152,17 +153,24 @@ def extract_seqlets(attribution_scores, window_size, flank, suppress, target_fdr, min_passing_windows_frac, max_passing_windows_frac, weak_threshold_for_counting_sign): + logger = logging.getLogger('modisco-lite') + logger.info(f"Extracting seqlets for {attribution_scores.shape[0]} tasks:") + + logger.info("- Smoothing and splitting tracks") pos_values, neg_values, smoothed_tracks = _smooth_and_split( attribution_scores, window_size) + logger.info("- Computing null values with Laplacian null model") pos_null_values, neg_null_values = _laplacian_null(track=smoothed_tracks, window_size=window_size, num_to_samp=10000) + logger.info("- Computing isotonic thresholds") pos_threshold = _isotonic_thresholds(pos_values, pos_null_values, increasing=True, target_fdr=target_fdr) neg_threshold = _isotonic_thresholds(neg_values, neg_null_values, increasing=False, target_fdr=target_fdr) + logger.info("- Refining thresholds") pos_threshold, neg_threshold = _refine_thresholds( vals=np.concatenate([pos_values, neg_values], axis=0), pos_threshold=pos_threshold, @@ -188,6 +196,7 @@ def extract_seqlets(attribution_scores, window_size, flank, suppress, smoothed_tracks[:, :flank] = -np.inf smoothed_tracks[:, -flank:] = -np.inf + logger.info("- Extracting seqlets") seqlets = _iterative_extract_seqlets(score_track=smoothed_tracks, window_size=window_size, flank=flank, diff --git a/modiscolite/fasta_writer.py b/fastermodiscolite/fasta_writer.py similarity index 100% rename from modiscolite/fasta_writer.py rename to fastermodiscolite/fasta_writer.py diff --git a/modiscolite/gapped_kmer.py b/fastermodiscolite/gapped_kmer.py similarity index 100% rename from modiscolite/gapped_kmer.py rename to fastermodiscolite/gapped_kmer.py diff --git a/modiscolite/io.py b/fastermodiscolite/io.py similarity index 99% rename from modiscolite/io.py rename to fastermodiscolite/io.py index 7769e9d..b83e7b0 100644 --- a/modiscolite/io.py +++ b/fastermodiscolite/io.py @@ -6,7 +6,6 @@ import textwrap import h5py -import hdf5plugin from typing import List, Literal, Union diff --git a/modiscolite/meme_writer.py b/fastermodiscolite/meme_writer.py similarity index 100% rename from modiscolite/meme_writer.py rename to fastermodiscolite/meme_writer.py diff --git a/fastermodiscolite/report.py b/fastermodiscolite/report.py new file mode 100644 index 0000000..48a5017 --- /dev/null +++ b/fastermodiscolite/report.py @@ -0,0 +1,499 @@ +import os +import h5py +import types +import pickle +import pandas +import shutil +import tempfile +import logomaker + +import matplotlib +matplotlib.use('pdf') +from matplotlib import pyplot as plt + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from pathlib import Path +from typing import List, Union + +import joblib +from memelite import tomtom +from memelite.io import read_meme + + +pd.options.display.max_colwidth = 500 + +def compute_per_position_ic(ppm, background, pseudocount): + alphabet_len = len(background) + ic = ((np.log((ppm+pseudocount)/(1 + pseudocount*alphabet_len))/np.log(2)) + *ppm - (np.log(background)*background/np.log(2))[None,:]) + return np.sum(ic,axis=1) + + +def write_meme_file(ppm, bg, fname): + f = open(fname, 'w') + f.write('MEME version 4\n\n') + f.write('ALPHABET= ACGT\n\n') + f.write('strands: + -\n\n') + f.write('Background letter frequencies (from unknown source):\n') + f.write('A %.3f C %.3f G %.3f T %.3f\n\n' % tuple(list(bg))) + f.write('MOTIF 1 TEMP\n') + f.write('letter-probability matrix: alength= 4 w= %d nsites= 1 E= 0e+0\n' 
% ppm.shape[0]) + for s in ppm: + f.write('%.5f %.5f %.5f %.5f\n' % tuple(s)) + f.write("URL\n\n") + f.close() + + +def fetch_tomtom_matches( + ppm, + cwm, + is_writing_tomtom_matrix, + output_dir, + pattern_name, + motifs_db, + background=[0.25, 0.25, 0.25, 0.25], + tomtom_exec_path="tomtom", + trim_threshold=0.3, + trim_min_length=3, +): + """Fetches top matches from a motifs database using TomTom. + Args: + ppm: position probability matrix- numpy matrix of dimension (N,4) + cwm: contribution weight matrix- numpy matrix of dimension (N,4) + is_writing_tomtom_matrix: if True, write the tomtom matrix to a file + output_dir: directory for writing the TOMTOM file + pattern_name: the name of the pattern, to be used for writing to file + background: list with ACGT background probabilities + tomtom_exec_path: path to TomTom executable + motifs_db: path to motifs database in meme format + n: number of top matches to return, ordered by p-value + temp_dir: directory for storing temp files + trim_threshold: the ppm is trimmed from left till first position for which + probability for any base pair >= trim_threshold. Similarly from right. + Returns: + list: a list of up to n results returned by tomtom, each entry is a + dictionary with keys 'Target ID', 'p-value', 'E-value', 'q-value' + """ + + _, fname = tempfile.mkstemp() + _, tomtom_fname = tempfile.mkstemp() + + score = np.sum(np.abs(cwm), axis=1) + trim_thresh = ( + np.max(score) * trim_threshold + ) # Cut off anything less than 30% of max score + pass_inds = np.where(score >= trim_thresh)[0] + trimmed = ppm[np.min(pass_inds) : np.max(pass_inds) + 1] + + # can be None of no base has prob>t + if trimmed is None: + return [] + + # trim and prepare meme file + write_meme_file(trimmed, background, fname) + + if not shutil.which(tomtom_exec_path): + raise ValueError( + "`tomtom` executable could not be called globally or locally." + " Please install it and try again. You may install it using conda with " + "`conda install -c bioconda meme`" + ) + # run tomtom + os.system( + f"{tomtom_exec_path} -no-ssc -oc . 
--verbosity 1 -text -min-overlap 5 -mi 1 " + f"-dist pearson -evalue -thresh 10.0 {fname} {motifs_db} > {tomtom_fname}" + ) + + tomtom_results = pd.read_csv(tomtom_fname, sep="\t", usecols=(1, 5), comment="#") + + os.system("rm " + fname) + if is_writing_tomtom_matrix: + output_subdir = os.path.join(output_dir, "tomtom") + os.makedirs(output_subdir, exist_ok=True) + output_filepath = os.path.join(output_subdir, f"{pattern_name}.tomtom.tsv") + os.system(f'mv {tomtom_fname} {output_filepath}') + else: + os.system('rm ' + tomtom_fname) + return tomtom_results + + +def _fetch_tomtom_matches( + ppm, + cwm, + is_writing_tomtom_matrix, + output_dir, + pattern_name, + motifs_db, + background=[0.25, 0.25, 0.25, 0.25], + tomtom_exec_path="tomtom", + trim_threshold=0.3, + trim_min_length=3, + top_n_matches=3, +): + r = fetch_tomtom_matches( + ppm=ppm, + cwm=cwm, + is_writing_tomtom_matrix=is_writing_tomtom_matrix, + output_dir=output_dir, + pattern_name=pattern_name, + motifs_db=motifs_db, + background=background, + tomtom_exec_path=tomtom_exec_path, + trim_threshold=trim_threshold, + trim_min_length=trim_min_length, + ) + result = dict() + + for i, (target, qval) in r.iterrows(): + target = target.strip() + if i < top_n_matches: + result[f"match{i}"] = target + result[f"qval{i}"] = qval + else: + result[f"match{i}"] = None + result[f"qval{i}"] = None + + return result + + +def generate_tomtom_dataframe( + modisco_h5py: os.PathLike, + output_dir: os.PathLike, + meme_motif_db: Union[os.PathLike, None], + is_writing_tomtom_matrix: bool, + pattern_groups: List[str], + top_n_matches=3, + tomtom_exec: str = "tomtom", + trim_threshold=0.3, + trim_min_length=3, + num_cores=-1, + verbose=False, +): + name_seq_scores = list() + + with h5py.File(modisco_h5py, "r") as modisco_results: + for contribution_dir_name in pattern_groups: + if contribution_dir_name not in modisco_results.keys(): + continue + + metacluster = list( + sorted( + modisco_results[contribution_dir_name].items(), + key=lambda x: int(x[0].split("_")[-1]), + ) + ) + + for idx, (_, pattern) in enumerate( + tqdm(metacluster, desc=f"Reading patterns for {contribution_dir_name}", disable=not verbose) + ): + name_seq_scores.append( + ( + f"{contribution_dir_name}.pattern_{idx}", + np.array(pattern["sequence"][:]), + np.array(pattern["contrib_scores"][:]), + ) + ) + + return pd.DataFrame( + joblib.Parallel(n_jobs=num_cores, verbose=100 if verbose else 0)( + joblib.delayed(_fetch_tomtom_matches)( + ppm=seq, + cwm=scores, + is_writing_tomtom_matrix=is_writing_tomtom_matrix, + output_dir=output_dir, + pattern_name=name, + motifs_db=meme_motif_db, + tomtom_exec_path=tomtom_exec, + trim_threshold=trim_threshold, + trim_min_length=trim_min_length, + top_n_matches=top_n_matches, + ) + for name, seq, scores in name_seq_scores + ) + ) + + +def tomtomlite_dataframe( + modisco_h5py: os.PathLike, + output_dir: os.PathLike, + meme_motif_db: Union[os.PathLike, None], + pattern_groups: List[str], + top_n_matches=3, + trim_threshold=0.3, + trim_min_length=3, + verbose=False, +): + """Use tomtom-lite to match patterns to a motif database.""" + + tomtom_results = {} + + for i in range(top_n_matches): + tomtom_results[f"match{i}"] = [] + tomtom_results[f"pval{i}"] = [] + + ppms = [] + with h5py.File(modisco_h5py, "r") as modisco_results: + for contribution_dir_name in tqdm( + pattern_groups, desc="Generating tomtom results", disable=not verbose + ): + if contribution_dir_name not in modisco_results.keys(): + continue + + metacluster = sorted( + 
+                modisco_results[contribution_dir_name].items(),
+                key=lambda x: int(x[0].split("_")[-1]),
+            )
+
+            for _, pattern in metacluster:
+                ppm = np.array(pattern["sequence"][:])
+                cwm = np.array(pattern["contrib_scores"][:])
+
+                score = np.sum(np.abs(cwm), axis=1)
+                trim_thresh = (
+                    np.max(score) * trim_threshold
+                )  # Cut off anything less than 30% of max score
+                pass_inds = np.where(score >= trim_thresh)[0]
+
+                ppm = ppm[np.min(pass_inds) : np.max(pass_inds) + 1]
+                ppms.append(ppm.T)
+
+    target_db = read_meme(meme_motif_db)
+    target_names = list(target_db.keys())
+    target_pwms = list(target_db.values())
+
+    p, scores, offsets, overlaps, strands, idxs = tomtom(
+        ppms, target_pwms, n_nearest=top_n_matches
+    )
+
+    for i in tqdm(range(idxs.shape[0]), desc="Generating tomtom results", disable=not verbose):
+        for j in range(top_n_matches):
+            target_name = target_names[int(idxs[i, j])].strip()
+            pval = p[i, j]
+
+            tomtom_results[f"match{j}"].append(target_name)
+            tomtom_results[f"pval{j}"].append(pval)
+
+    return pd.DataFrame(tomtom_results)
+
+
+def path_to_image_html(path):
+    # Embed the logo image at the given path as an HTML <img> tag in the report table.
+    return '<img src="' + path + '" width="240">'
+
+
+def _plot_weights(array, path, figsize=(10, 3)):
+    """Plot weights as a sequence logo and save to file."""
+    fig = plt.figure(figsize=figsize)
+    ax = fig.add_subplot(111)
+
+    df = pd.DataFrame(array, columns=["A", "C", "G", "T"])
+    df.index.name = "pos"
+
+    crp_logo = logomaker.Logo(df, ax=ax)
+    crp_logo.style_spines(visible=False)
+    plt.ylim(min(df.sum(axis=1).min(), 0), df.sum(axis=1).max())
+
+    plt.savefig(path)
+    plt.close()
+
+
+def make_logo(match, logo_dir, motifs):
+    if match == "NA":
+        return
+
+    background = np.array([0.25, 0.25, 0.25, 0.25])
+    ppm = motifs[match]
+    ic = compute_per_position_ic(ppm, background, 0.001)
+
+    _plot_weights(ppm * ic[:, None], path="{}/{}.png".format(logo_dir, match))
+
+
+def create_modisco_logos(
+    modisco_h5py: os.PathLike,
+    modisco_logo_dir,
+    trim_threshold,
+    pattern_groups: List[str],
+    verbose=False,
+):
+    """Open a modisco results file and create and write logos to file for each pattern."""
+    modisco_results = h5py.File(modisco_h5py, "r")
+
+    tags = []
+
+    for name in pattern_groups:
+        if name not in modisco_results.keys():
+            continue
+
+        metacluster = list(
+            sorted(
+                modisco_results[name].items(), key=lambda x: int(x[0].split("_")[-1])
+            )
+        )
+
+        for pattern_name, pattern in tqdm(
+            metacluster,
+            desc=f"Creating modisco logos for {name}",
+            total=len(metacluster),
+            disable=not verbose,
+        ):
+            tag = "{}.{}".format(name, pattern_name)
+            tags.append(tag)
+
+            cwm_fwd = np.array(pattern["contrib_scores"][:])
+            cwm_rev = cwm_fwd[::-1, ::-1]
+
+            score_fwd = np.sum(np.abs(cwm_fwd), axis=1)
+            score_rev = np.sum(np.abs(cwm_rev), axis=1)
+
+            trim_thresh_fwd = np.max(score_fwd) * trim_threshold
+            trim_thresh_rev = np.max(score_rev) * trim_threshold
+
+            pass_inds_fwd = np.where(score_fwd >= trim_thresh_fwd)[0]
+            pass_inds_rev = np.where(score_rev >= trim_thresh_rev)[0]
+
+            start_fwd, end_fwd = (
+                max(np.min(pass_inds_fwd) - 4, 0),
+                min(np.max(pass_inds_fwd) + 4 + 1, len(score_fwd) + 1),
+            )
+            start_rev, end_rev = (
+                max(np.min(pass_inds_rev) - 4, 0),
+                min(np.max(pass_inds_rev) + 4 + 1, len(score_rev) + 1),
+            )
+
+            trimmed_cwm_fwd = cwm_fwd[start_fwd:end_fwd]
+            trimmed_cwm_rev = cwm_rev[start_rev:end_rev]
+
+            _plot_weights(
+                trimmed_cwm_fwd, path=f"{modisco_logo_dir}/{tag}.cwm.fwd.png"
+            )
+            _plot_weights(
+                trimmed_cwm_rev, path=f"{modisco_logo_dir}/{tag}.cwm.rev.png"
+            )
+
+    modisco_results.close()
+    return tags
+
+
+def report_motifs(
+    modisco_h5py: Path,
output_dir: os.PathLike, + img_path_suffix: os.PathLike, + meme_motif_db: Union[os.PathLike, None], + is_writing_tomtom_matrix: bool, + top_n_matches=3, + trim_threshold=0.3, + trim_min_length=3, + ttl=False, + num_cores=-1, + verbose=False, +): + if not os.path.isdir(output_dir): + os.mkdir(output_dir) + + modisco_logo_dir = os.path.join(output_dir, "trimmed_logos") + if not os.path.isdir(modisco_logo_dir): + os.mkdir(modisco_logo_dir) + + pattern_groups = ["pos_patterns", "neg_patterns"] + + create_modisco_logos(modisco_h5py, modisco_logo_dir, trim_threshold, pattern_groups, verbose=verbose) + + results = list() + + with h5py.File(modisco_h5py, "r") as modisco_results: + for name in tqdm(pattern_groups, desc="Generating patterns dataframe", disable=not verbose): + if name not in modisco_results.keys(): + continue + + metacluster = sorted( + modisco_results[name].items(), + key=lambda x: int(x[0].split("_")[-1]), + ) + for pattern_name, pattern in metacluster: + results.append( + { + "pattern": f"{name}.{pattern_name}", + "num_seqlets": pattern["seqlets"]["n_seqlets"][:][0], + "modisco_cwm_fwd": os.path.join( + img_path_suffix, + "trimmed_logos", + f"{name}.{pattern_name}.cwm.fwd.png", + ), + "modisco_cwm_rev": os.path.join( + img_path_suffix, + "trimmed_logos", + f"{name}.{pattern_name}.cwm.rev.png", + ), + } + ) + + patterns_df = pd.DataFrame(results) + reordered_columns = ["pattern", "num_seqlets", "modisco_cwm_fwd", "modisco_cwm_rev"] + + # If the optional meme_motif_db is not provided, then we won't generate TOMTOM comparison. + if meme_motif_db is not None: + motifs = read_meme(meme_motif_db) + motifs = {name: pwm.T for name, pwm in motifs.items()} + + if ttl: + tomtom_df = tomtomlite_dataframe( + modisco_h5py, + output_dir, + meme_motif_db, + top_n_matches=top_n_matches, + pattern_groups=pattern_groups, + trim_threshold=trim_threshold, + trim_min_length=trim_min_length, + ) + else: + motifs = {key.split()[0]: value for key, value in motifs.items()} + + tomtom_df = generate_tomtom_dataframe( + modisco_h5py, + output_dir, + meme_motif_db, + is_writing_tomtom_matrix, + top_n_matches=top_n_matches, + tomtom_exec="tomtom", + pattern_groups=pattern_groups, + trim_threshold=trim_threshold, + trim_min_length=trim_min_length, + num_cores=num_cores, + verbose=verbose + ) + + patterns_df = pd.concat([patterns_df, tomtom_df], axis=1) + + for i in range(top_n_matches): + name = f"match{i}" + logos = [] + + for _, row in patterns_df.iterrows(): + if name in patterns_df.columns: + if pd.isnull(row[name]): + logos.append("NA") + else: + make_logo(row[name], output_dir, motifs) + logos.append(f"{img_path_suffix}{row[name]}.png") + else: + break + + patterns_df[f"{name}_logo"] = logos + val = f"pval{i}" if ttl else f"qval{i}" + reordered_columns.extend([name, val, f"{name}_logo"]) + + patterns_df = patterns_df[reordered_columns] + patterns_df.to_html( + open(os.path.join(output_dir, "motifs.html"), "w"), + escape=False, + formatters=dict( + modisco_cwm_fwd=path_to_image_html, + modisco_cwm_rev=path_to_image_html, + match0_logo=path_to_image_html, + match1_logo=path_to_image_html, + match2_logo=path_to_image_html, + ), + index=False, + ) diff --git a/fastermodiscolite/tfmodisco.py b/fastermodiscolite/tfmodisco.py new file mode 100644 index 0000000..b73037f --- /dev/null +++ b/fastermodiscolite/tfmodisco.py @@ -0,0 +1,677 @@ +# tfmodisco.py +# Authors: Jacob Schreiber +# adapted from code written by Avanti Shrikumar + +import logging +import numpy as np + +import scipy +import scipy.sparse + +from 
collections import defaultdict +from tqdm import tqdm +import numba + +from . import affinitymat +from . import aggregator +from . import extract_seqlets +from . import core +from . import util +from . import cluster + +def _density_adaptation(affmat_nn, seqlet_neighbors, tsne_perplexity): + eps = 0.0000001 + + rows, cols, data = [], [], [] + for row in range(len(affmat_nn)): + for col, datum in zip(seqlet_neighbors[row], affmat_nn[row]): + rows.append(row) + cols.append(col) + data.append(datum) + + affmat_nn = scipy.sparse.csr_matrix( + (data, (rows, cols)), shape=(len(affmat_nn), len(affmat_nn)), dtype="float64" + ) + + affmat_nn.data = np.maximum( + np.log((1.0 / (0.5 * np.maximum(affmat_nn.data, eps))) - 1), 0 + ) + affmat_nn.eliminate_zeros() + + counts_nn = scipy.sparse.csr_matrix( + (np.ones_like(affmat_nn.data), affmat_nn.indices, affmat_nn.indptr), + shape=affmat_nn.shape, + dtype="float64", + ) + + affmat_nn += affmat_nn.T + counts_nn += counts_nn.T + affmat_nn.data /= counts_nn.data + del counts_nn + + betas = [ + util.binary_search_perplexity(tsne_perplexity, affmat_nn[i].data) + for i in range(affmat_nn.shape[0]) + ] + normfactors = np.array( + [ + np.exp(-np.array(affmat_nn[i].data) / beta).sum() + 1 + for i, beta in enumerate(betas) + ] + ) + + for i in range(affmat_nn.shape[0]): + for j_idx in range(affmat_nn.indptr[i], affmat_nn.indptr[i + 1]): + j = affmat_nn.indices[j_idx] + distance = affmat_nn.data[j_idx] + + rbf_i = np.exp(-distance / betas[i]) / normfactors[i] + rbf_j = np.exp(-distance / betas[j]) / normfactors[j] + + affmat_nn.data[j_idx] = np.sqrt(rbf_i * rbf_j) + + affmat_diags = scipy.sparse.diags(1.0 / normfactors) + affmat_nn += affmat_diags + return affmat_nn + + +def _filter_patterns( + patterns, + min_seqlet_support, + window_size, + min_ic_in_window, + background, + ppm_pseudocount, +): + passing_patterns = [] + for pattern in tqdm(patterns, desc="Filtering patterns:"): + if len(pattern.seqlets) < min_seqlet_support: + continue + + ppm = pattern.sequence + per_position_ic = util.compute_per_position_ic( + ppm=ppm, background=background, pseudocount=ppm_pseudocount + ) + + if len(per_position_ic) < window_size: + if np.sum(per_position_ic) < min_ic_in_window: + continue + else: + # do the sliding window sum rearrangement + windowed_ic = np.sum( + util.rolling_window(a=per_position_ic, window=window_size), axis=-1 + ) + + if np.max(windowed_ic) < min_ic_in_window: + continue + + passing_patterns.append(pattern) + + return passing_patterns + + +def _patterns_from_clusters( + seqlets, + track_set, + min_overlap, + min_frac, + min_num, + flank_to_add, + window_size, + bg_freq, + cluster_indices, + track_sign, + stranded=False, +): + seqlet_sort_metric = lambda x: -np.sum(np.abs(x.contrib_scores)) + num_clusters = max(cluster_indices + 1) + cluster_to_seqlets = defaultdict(list) + + for seqlet, idx in zip(seqlets, cluster_indices): + cluster_to_seqlets[idx].append(seqlet) + + patterns = [] + for i in tqdm(range(num_clusters), desc="Generating patterns from clusters:"): + sorted_seqlets = sorted(cluster_to_seqlets[i], key=seqlet_sort_metric) + pattern = core.SeqletSet([sorted_seqlets[0]]) + + if len(sorted_seqlets) > 1: + pattern = aggregator.merge_in_seqlets_filledges( + parent_pattern=pattern, + seqlets_to_merge=sorted_seqlets[1:], + track_set=track_set, + metric=affinitymat.jaccard, + min_overlap=min_overlap, + stranded=stranded, + ) + + pattern = aggregator.polish_pattern( + pattern, + min_frac=min_frac, + min_num=min_num, + track_set=track_set, + 
flank=flank_to_add, + window_size=window_size, + bg_freq=bg_freq, + ) + + if pattern is not None: + if np.sign(np.sum(pattern.contrib_scores)) == track_sign: + patterns.append(pattern) + + return patterns + + +def _filter_by_correlation( + seqlets, seqlet_neighbors, coarse_affmat_nn, fine_affmat_nn, correlation_threshold +): + correlations = [] + for fine_affmat_row, coarse_affmat_row in zip(fine_affmat_nn, coarse_affmat_nn): + to_compare_mask = np.abs(fine_affmat_row) > 0 + corr = scipy.stats.spearmanr( + fine_affmat_row[to_compare_mask], coarse_affmat_row[to_compare_mask] + ) + correlations.append(corr.correlation) + + correlations = np.array(correlations) + filtered_rows_mask = np.array(correlations) > correlation_threshold + + filtered_seqlets = [ + seqlet for seqlet, mask in zip(seqlets, filtered_rows_mask) if mask == True + ] + + # figure out a mapping from pre-filtering to the + # post-filtering indices + new_idx_mapping = np.cumsum(filtered_rows_mask) - 1 + retained_indices = set(np.where(filtered_rows_mask == True)[0]) + + filtered_neighbors = [] + filtered_affmat_nn = [] + for old_row_idx, (old_neighbors, affmat_row) in enumerate( + zip(seqlet_neighbors, fine_affmat_nn) + ): + if old_row_idx in retained_indices: + filtered_old_neighbors = [ + neighbor for neighbor in old_neighbors if neighbor in retained_indices + ] + filtered_affmat_row = [ + affmatval + for affmatval, neighbor in zip(affmat_row, old_neighbors) + if neighbor in retained_indices + ] + filtered_neighbors_row = [ + new_idx_mapping[neighbor] for neighbor in filtered_old_neighbors + ] + filtered_neighbors.append(filtered_neighbors_row) + filtered_affmat_nn.append(filtered_affmat_row) + + return filtered_seqlets, filtered_neighbors, filtered_affmat_nn + + +def seqlets_to_patterns( + seqlets, + track_set, + track_signs=None, + min_overlap_while_sliding=0.7, + nearest_neighbors_to_compute=500, + affmat_correlation_threshold=0.15, + tsne_perplexity=10.0, + n_leiden_iterations=-1, + n_leiden_runs=50, + frac_support_to_trim_to=0.2, + min_num_to_trim_to=30, + trim_to_window_size=20, + initial_flank_to_add=5, + final_flank_to_add=0, + prob_and_pertrack_sim_merge_thresholds=[(0.8, 0.8), (0.5, 0.85), (0.2, 0.9)], + prob_and_pertrack_sim_dealbreaker_thresholds=[ + (0.4, 0.75), + (0.2, 0.8), + (0.1, 0.85), + (0.0, 0.9), + ], + subcluster_perplexity=50, + merging_max_seqlets_subsample=300, + final_min_cluster_size=20, + min_ic_in_window=0.6, + min_ic_windowsize=6, + ppm_pseudocount=0.001, + num_cores=-1, + stranded=False, + verbose=False, +): + logger = logging.getLogger("modisco-lite") + + bg_freq = np.mean([seqlet.sequence for seqlet in seqlets], axis=(0, 1)) + + seqlets = sorted(seqlets, key=lambda x: -np.sum(np.abs(x.contrib_scores))) + + for round_idx in range(2): + if len(seqlets) == 0: + return None + + # Step 1: Generate coarse resolution + logger.info( + f"- Round {round_idx}: Generating coarse resolution affinity matrix for {len(seqlets)} seqlets" + ) + coarse_affmat_nn, seqlet_neighbors = affinitymat.cosine_similarity_from_seqlets( + seqlets=seqlets, n_neighbors=nearest_neighbors_to_compute, sign=track_signs, + stranded=stranded, verbose=verbose + ) + + # Step 2: Generate fine representation + logger.info( + f"- Round {round_idx}: Generating fine resolution affinity matrix for " + f"{len(seqlets)} seqlets and {len(seqlet_neighbors)} neighbors" + ) + fine_affmat_nn = affinitymat.jaccard_from_seqlets( + seqlets=seqlets, + seqlet_neighbors=seqlet_neighbors, + min_overlap=min_overlap_while_sliding, + stranded=stranded, + 
verbose=verbose, + ) + + if round_idx == 0: + logger.info(f"- Round {round_idx}: Filtering seqlets by correlation") + filtered_seqlets, seqlet_neighbors, filtered_affmat_nn = ( + _filter_by_correlation( + seqlets, + seqlet_neighbors, + coarse_affmat_nn, + fine_affmat_nn, + affmat_correlation_threshold, + ) + ) + else: + filtered_seqlets = seqlets + filtered_affmat_nn = fine_affmat_nn + + del coarse_affmat_nn + del fine_affmat_nn + del seqlets + + # Step 4: Density adaptation + logger.info(f"- Round {round_idx}: Density adaptation") + csr_density_adapted_affmat = _density_adaptation( + filtered_affmat_nn, seqlet_neighbors, tsne_perplexity + ) + + del filtered_affmat_nn + del seqlet_neighbors + + # Step 5: Clustering + logger.info(f"- Round {round_idx}: Clustering with Leiden algorithm") + cluster_indices = cluster.LeidenClusterParallel( + csr_density_adapted_affmat, + n_seeds=n_leiden_runs, + n_leiden_iterations=n_leiden_iterations, + n_jobs=num_cores, + verbose=verbose, + ) + + del csr_density_adapted_affmat + + logger.info(f"- Round {round_idx}: Generating patterns from clusters") + patterns = _patterns_from_clusters( + filtered_seqlets, + track_set=track_set, + min_overlap=min_overlap_while_sliding, + min_frac=frac_support_to_trim_to, + min_num=min_num_to_trim_to, + flank_to_add=initial_flank_to_add, + window_size=trim_to_window_size, + bg_freq=bg_freq, + cluster_indices=cluster_indices, + track_sign=track_signs, + stranded=stranded, + ) + + # obtain unique seqlets from adjusted motifs + seqlets = list( + dict([(y.string, y) for x in patterns for y in x.seqlets]).values() + ) + + del seqlets + + logger.info("Detecting spurious merging of patterns") + merged_patterns, pattern_merge_hierarchy = aggregator._detect_spurious_merging( + patterns=patterns, + track_set=track_set, + perplexity=subcluster_perplexity, + min_in_subcluster=max(final_min_cluster_size, subcluster_perplexity), + min_overlap=min_overlap_while_sliding, + prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, + prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, + min_frac=frac_support_to_trim_to, + min_num=min_num_to_trim_to, + flank_to_add=initial_flank_to_add, + window_size=trim_to_window_size, + bg_freq=bg_freq, + max_seqlets_subsample=merging_max_seqlets_subsample, + n_seeds=n_leiden_runs, + stranded=stranded, + verbose=verbose, + ) + + logger.info("Filtering and merging patterns") + merged_patterns = sorted(merged_patterns, key=lambda x: -len(x.seqlets)) + patterns = _filter_patterns( + merged_patterns, + min_seqlet_support=final_min_cluster_size, + window_size=min_ic_windowsize, + min_ic_in_window=min_ic_in_window, + background=bg_freq, + ppm_pseudocount=ppm_pseudocount, + ) + + # apply subclustering procedure on the final patterns + for patternidx, pattern in enumerate(tqdm(patterns, desc="Computing subpatterns:", disable=not verbose)): + pattern = aggregator._expand_seqlets_to_fill_pattern( + pattern, + track_set, + left_flank_to_add=final_flank_to_add, + right_flank_to_add=final_flank_to_add, + ) + + pattern.compute_subpatterns( + subcluster_perplexity, + n_seeds=n_leiden_runs, + n_iterations=n_leiden_iterations, + ) + + patterns[patternidx] = pattern + + return patterns + + +def TFMoDISco( + one_hot, + hypothetical_contribs, + sliding_window_size=21, + flank_size=10, + min_metacluster_size=100, + weak_threshold_for_counting_sign=0.8, + max_seqlets_per_metacluster=20000, + target_seqlet_fdr=0.2, + min_passing_windows_frac=0.03, + 
max_passing_windows_frac=0.2, + n_leiden_runs=50, + n_leiden_iterations=-1, + min_overlap_while_sliding=0.7, + nearest_neighbors_to_compute=500, + affmat_correlation_threshold=0.15, + tsne_perplexity=10.0, + frac_support_to_trim_to=0.2, + min_num_to_trim_to=30, + trim_to_window_size=30, + initial_flank_to_add=10, + final_flank_to_add=0, + prob_and_pertrack_sim_merge_thresholds=[(0.8, 0.8), (0.5, 0.85), (0.2, 0.9)], + prob_and_pertrack_sim_dealbreaker_thresholds=[ + (0.4, 0.75), + (0.2, 0.8), + (0.1, 0.85), + (0.0, 0.9), + ], + subcluster_perplexity=50, + merging_max_seqlets_subsample=300, + final_min_cluster_size=20, + min_ic_in_window=0.6, + min_ic_windowsize=6, + ppm_pseudocount=0.001, + stranded=False, + pattern_type="both", # "both", "pos", or "neg" + num_cores=-1, + verbose=False, +): + """ + TFMoDISco: Transcription Factor Motif Discovery from Importance Scores. + + This function implements a fast, memory-efficient re-write of the original + TF-MoDISco algorithm (Shrikumar *et al.* 2018). Given base-wise attribution + scores and the underlying one-hot encoded sequences, the method identifies + recurring sequence patterns (motifs) by: + + 1. Extracting high-attribution "seqlets" with a sliding window. + 2. Clustering seqlets using cosine, Jaccard and density-adapted similarity + measures combined with the Leiden community detection algorithm. + 3. Polishing clusters into representative patterns and greedily merging + redundant patterns. + 4. Optionally sub-clustering each final pattern to reveal sub-motifs. + + Two independent motif discovery passes are performed — one on positively + scoring seqlets and one on negatively scoring seqlets — and the discovered + motif sets are returned separately. + + Args: + one_hot (np.ndarray): + One-hot encoded input sequences with shape + ``(n_sequences, seq_len, n_channels)`` where the last dimension + enumerates the alphabet (e.g. 4 channels for DNA). + hypothetical_contribs (np.ndarray): + Hypothetical contribution scores of identical shape as ``one_hot``. + These are typically obtained from deeplift-style hypothetical + importance computations. + sliding_window_size (int, optional): + Width of the window used to scan the attribution map when selecting + candidate seqlets. Defaults to ``21``. + flank_size (int, optional): + Number of additional positions to include on each side of an + extracted seqlet. Defaults to ``10``. + min_metacluster_size (int, optional): + Minimum number of seqlets required to initialise the positive / + negative metaclusters. Defaults to ``100``. + weak_threshold_for_counting_sign (float, optional): + Fraction of the FDR-determined attribution threshold used when + deciding the sign of a seqlet. Defaults to ``0.8``. + max_seqlets_per_metacluster (int, optional): + Upper limit on the number of seqlets processed per metacluster for + memory reasons. Defaults to ``20000``. + target_seqlet_fdr (float, optional): + Desired false discovery rate when determining the seqlet + attribution threshold. Defaults to ``0.2``. + min_passing_windows_frac (float, optional): + Lower bound on the fraction of sliding windows that may pass the + seqlet threshold. Defaults to ``0.03``. + max_passing_windows_frac (float, optional): + Upper bound on the fraction of sliding windows that may pass the + seqlet threshold. Defaults to ``0.2``. + n_leiden_runs (int, optional): + Number of Leiden initialisations in each clustering stage. + Defaults to ``50``. + n_leiden_iterations (int, optional): + Maximum iterations per Leiden run. 
``-1`` lets the algorithm decide + when to stop. Defaults to ``-1``. + min_overlap_while_sliding (float, optional): + Minimum fractional overlap required when comparing two seqlets via + sliding-window similarity. Defaults to ``0.7``. + nearest_neighbors_to_compute (int, optional): + Number of nearest neighbours retained per seqlet in the affinity + matrices. Defaults to ``500``. + affmat_correlation_threshold (float, optional): + Minimum Spearman correlation between coarse and fine affinity rows + required to keep a seqlet after the first clustering round. + Defaults to ``0.15``. + tsne_perplexity (float, optional): + Perplexity parameter used during the density adaptation step. + Defaults to ``10.0``. + frac_support_to_trim_to (float, optional): + Fraction of supporting seqlets retained when trimming patterns. + Defaults to ``0.2``. + min_num_to_trim_to (int, optional): + Absolute minimum number of seqlets retained when trimming patterns. + Defaults to ``30``. + trim_to_window_size (int, optional): + Window size (in bp) retained around the core of each pattern during + trimming. Defaults to ``30``. + initial_flank_to_add (int, optional): + Flank length added to each side of a pattern during the polishing + stage. Defaults to ``10``. + final_flank_to_add (int, optional): + Additional flank length added at the very end of motif discovery. + Defaults to ``0``. + prob_and_pertrack_sim_merge_thresholds (list[tuple], optional): + Sequence-level and per-track similarity thresholds for greedy motif + merging. Defaults to ``[(0.8, 0.8), (0.5, 0.85), (0.2, 0.9)]``. + prob_and_pertrack_sim_dealbreaker_thresholds (list[tuple], optional): + Similarity thresholds that prevent merging when similarity is too + low. Defaults to + ``[(0.4, 0.75), (0.2, 0.8), (0.1, 0.85), (0.0, 0.9)]``. + subcluster_perplexity (int, optional): + Perplexity used during the optional sub-clustering step inside each + final pattern. Defaults to ``50``. + merging_max_seqlets_subsample (int, optional): + Maximum number of seqlets sampled per pattern when detecting + spurious merges. Defaults to ``300``. + final_min_cluster_size (int, optional): + Minimum number of seqlets required for a cluster to be retained + after all merging and filtering. Defaults to ``20``. + min_ic_in_window (float, optional): + Minimum information content (in bits) that must be present within + any window of length ``min_ic_windowsize`` for a pattern to be + kept. Defaults to ``0.6``. + min_ic_windowsize (int, optional): + Window length (in bp) used when computing information content. + Defaults to ``6``. + ppm_pseudocount (float, optional): + Pseudocount added to the position probability matrix when + computing information content. Defaults to ``0.001``. + pattern_type (str, optional): + Which pattern signs to compute. Must be one of ``"both"`` (compute + both positive and negative patterns), ``"pos"`` (only positive + patterns), or ``"neg"`` (only negative patterns). Defaults to + ``"both"``. + num_cores (int, optional): + Number of CPU cores to employ. ``-1`` enables all available cores + (Numba default). Defaults to ``-1``. + verbose (bool, optional): + If ``True``, configure a logger and emit detailed progress + messages. Defaults to ``False``. + + Returns: + Tuple[list[core.SeqletSet] | None, list[core.SeqletSet] | None]: + * **pos_patterns** - Motifs derived from positively scoring + seqlets, or ``None`` if too few positive seqlets were found. 
+ * **neg_patterns** - Motifs derived from negatively scoring + seqlets, or ``None`` if too few negative seqlets were found. + """ + + if num_cores > 0: + numba.set_num_threads(num_cores) + + logger = logging.getLogger("modisco-lite") + if verbose: + handler = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) + + from . import __version__ + logger.info(f"Running TFMoDISco version {__version__}") + + contrib_scores = np.multiply(one_hot, hypothetical_contribs) + + track_set = core.TrackSet( + one_hot=one_hot, + contrib_scores=contrib_scores, + hypothetical_contribs=hypothetical_contribs, + ) + + seqlet_coords, threshold = extract_seqlets.extract_seqlets( + attribution_scores=contrib_scores.sum(axis=2), + window_size=sliding_window_size, + flank=flank_size, + suppress=(int(0.5 * sliding_window_size) + flank_size), + target_fdr=target_seqlet_fdr, + min_passing_windows_frac=min_passing_windows_frac, + max_passing_windows_frac=max_passing_windows_frac, + weak_threshold_for_counting_sign=weak_threshold_for_counting_sign, + ) + + seqlets = track_set.create_seqlets(seqlet_coords) + + pos_seqlets, neg_seqlets = [], [] + for seqlet in seqlets: + flank = int(0.5 * (len(seqlet) - sliding_window_size)) + attr = np.sum(seqlet.contrib_scores[flank:-flank]) + + if (attr > threshold): + pos_seqlets.append(seqlet) + elif (attr < -threshold): + neg_seqlets.append(seqlet) + + del seqlets + + if (pattern_type in ("both", "pos")) and (len(pos_seqlets) > min_metacluster_size): + pos_seqlets = pos_seqlets[:max_seqlets_per_metacluster] + logger.info(f"- Extracting {len(pos_seqlets)} positive seqlets") + + pos_patterns = seqlets_to_patterns( + seqlets=pos_seqlets, + track_set=track_set, + track_signs=1, + min_overlap_while_sliding=min_overlap_while_sliding, + nearest_neighbors_to_compute=nearest_neighbors_to_compute, + affmat_correlation_threshold=affmat_correlation_threshold, + tsne_perplexity=tsne_perplexity, + n_leiden_iterations=n_leiden_iterations, + n_leiden_runs=n_leiden_runs, + frac_support_to_trim_to=frac_support_to_trim_to, + min_num_to_trim_to=min_num_to_trim_to, + trim_to_window_size=trim_to_window_size, + initial_flank_to_add=initial_flank_to_add, + final_flank_to_add=final_flank_to_add, + prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, + prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, + subcluster_perplexity=subcluster_perplexity, + merging_max_seqlets_subsample=merging_max_seqlets_subsample, + final_min_cluster_size=final_min_cluster_size, + min_ic_in_window=min_ic_in_window, + min_ic_windowsize=min_ic_windowsize, + ppm_pseudocount=ppm_pseudocount, + stranded=stranded, + num_cores=num_cores, + verbose=verbose, + ) + else: + pos_patterns = None + + if (pattern_type in ("both", "neg")) and (len(neg_seqlets) > min_metacluster_size): + neg_seqlets = neg_seqlets[:max_seqlets_per_metacluster] + logger.info(f"- Extracting {len(neg_seqlets)} negative seqlets") + + neg_patterns = seqlets_to_patterns( + seqlets=neg_seqlets, + track_set=track_set, + track_signs=-1, + min_overlap_while_sliding=min_overlap_while_sliding, + nearest_neighbors_to_compute=nearest_neighbors_to_compute, + affmat_correlation_threshold=affmat_correlation_threshold, + tsne_perplexity=tsne_perplexity, + n_leiden_iterations=n_leiden_iterations, + n_leiden_runs=n_leiden_runs, + 
frac_support_to_trim_to=frac_support_to_trim_to, + min_num_to_trim_to=min_num_to_trim_to, + trim_to_window_size=trim_to_window_size, + initial_flank_to_add=initial_flank_to_add, + final_flank_to_add=final_flank_to_add, + prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, + prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, + subcluster_perplexity=subcluster_perplexity, + merging_max_seqlets_subsample=merging_max_seqlets_subsample, + final_min_cluster_size=final_min_cluster_size, + min_ic_in_window=min_ic_in_window, + min_ic_windowsize=min_ic_windowsize, + ppm_pseudocount=ppm_pseudocount, + stranded=stranded, + num_cores=num_cores, + verbose=verbose, + ) + else: + neg_patterns = None + + return pos_patterns, neg_patterns diff --git a/modiscolite/util.py b/fastermodiscolite/util.py similarity index 100% rename from modiscolite/util.py rename to fastermodiscolite/util.py diff --git a/modisco b/modisco deleted file mode 100755 index 56bdf27..0000000 --- a/modisco +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env python -# tf-modisco command-line tool -# Author: Jacob Schreiber , Ivy Raine - -from typing import List, Literal, Union -import h5py -import hdf5plugin -import argparse -import modiscolite - -import numpy as np - -from modiscolite.util import calculate_window_offsets - - -desc = """TF-MoDISco is a motif detection algorithm that takes in nucleotide - sequence and the attributions from a neural network model and return motifs - that are repeatedly enriched for attriution score across the examples. - This tool will take in one-hot encoded sequence, the corresponding - attribution scores, and a few other parameters, and return the motifs.""" - -# Read in the arguments -parser = argparse.ArgumentParser(description=desc) -subparsers = parser.add_subparsers(help="Must be either 'motifs', 'report', 'convert', 'convert-backward', 'meme', 'seqlet-bed', or 'seqlet-fasta'.", required=True, dest='cmd') - -motifs_parser = subparsers.add_parser("motifs", help="Run TF-MoDISco and extract the motifs.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -motifs_parser.add_argument("-s", "--sequences", type=str, - help="A .npy or .npz file containing the one-hot encoded sequences.") -motifs_parser.add_argument("-a", "--attributions", type=str, - help="A .npy or .npz file containing the hypothetical attributions, i.e., the attributions for all nucleotides at all positions.") -motifs_parser.add_argument("-i", "--h5py", type=str, - help="A legacy h5py file containing the one-hot encoded sequences and shap scores.") -motifs_parser.add_argument("-n", "--max_seqlets", type=int, required=True, - help="The maximum number of seqlets per metacluster.") -motifs_parser.add_argument("-l", "--n_leiden", type=int, default=2, - help="The number of Leiden clusterings to perform with different random seeds.") -motifs_parser.add_argument("-w", "--window", type=int, default=400, - help="The window surrounding the peak center that will be considered for motif discovery.") -motifs_parser.add_argument("-z", "--size", type=int, default=20, - help="The size of the seqlet cores, corresponding to `sliding_window_size`.") -motifs_parser.add_argument("-t", "--trim_size", type=int, default=30, - help="The size to trim to, corresponding to `trim_to_window_size`.") -motifs_parser.add_argument("-f", "--seqlet_flank_size", type=int, default=5, - help="The size of the flanks to add to each seqlet, corresponding to `flank_size`.") -motifs_parser.add_argument("-g", 
"--initial_flank_to_add", type=int, default=10, - help="The size of the flanks to add to each pattern initially, corresponding to `initial_flank_to_add`.") -motifs_parser.add_argument("-j", "--final_flank_to_add", type=int, default=0, - help="The size of the flanks to add to each pattern at the end, corresponding to `final_flank_to_add`.") -motifs_parser.add_argument("-o", "--output", type=str, default="modisco_results.h5", - help="The path to the output file.") -motifs_parser.add_argument("-v", "--verbose", action="store_true", default=False, - help="Controls the amount of output from the code.") - -report_parser = subparsers.add_parser("report", help="Create a HTML report of the results.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -report_parser.add_argument("-i", "--h5py", type=str, required=True, - help="An HDF5 file containing the output from modiscolite.") -report_parser.add_argument("-o", "--output", type=str, required=True, - help="A directory to put the output results including the html report.") -report_parser.add_argument("-t", "--write-tomtom", action="store_true", - default=False, - help="Write the TOMTOM results to the output directory if flag is given.") -report_parser.add_argument("-s", "--suffix", type=str, default="./", - help="The suffix to add to the beginning of images. Should be equal to the output if using a Jupyter notebook.") -report_parser.add_argument("-m", "--meme_db", type=str, default=None, - help="A MEME file containing motifs.") -report_parser.add_argument("-n", "--n_matches", type=int, default=3, - help="The number of top TOMTOM matches to include in the report.") -report_parser.add_argument("-l", "--lite", default=False, action='store_true', - help="Whether to use tomtom-lite when mapping patterns to motifs. Note that this also changes the distance function from correlation to Euclidean distance, and so the best motif may differ when there are many similar versions.") - -convert_parser = subparsers.add_parser("convert", help="Convert an old h5py to the new format.") -convert_parser.add_argument("-i", "--h5py", type=str, required=True, - help="An HDF5 file formatted in the original way.") -convert_parser.add_argument("-o", "--output", type=str, required=True, - help="An HDF5 file formatted in the new way.") - -convertback_parser = subparsers.add_parser("convert-backward", help="Convert a new h5py to the old format.") -convertback_parser.add_argument("-i", "--h5py", type=str, required=True, - help="An HDF5 file formatted in the new way.") -convertback_parser.add_argument("-o", "--output", type=str, required=True, - help="An HDF5 file formatted in the old way.") - -meme_parser = subparsers.add_parser("meme", help="""Output a MEME file from a -modisco results file to stdout (default) and/or to a file (if specified).""", - formatter_class=argparse.RawTextHelpFormatter) -meme_parser.add_argument("-i", "--h5py", type=str, - help="An HDF5 file containing the output from modiscolite.") -meme_parser.add_argument("-t", "--datatype", type=modiscolite.util.MemeDataType, - choices=list(modiscolite.util.MemeDataType), required=True, - help="""A case-sensitive string specifying the desired data of the output file., -The options are as follows: -- 'PFM': The position-frequency matrix. -- 'CWM': The contribution-weight matrix. -- 'hCWM': The hypothetical contribution-weight matrix; hypothetical - contribution scores are the contributions of nucleotides not encoded - by the one-hot encoding sequence. -- 'CWM-PFM': The softmax of the contribution-weight matrix. 
-- 'hCWM-PFM': The softmax of the hypothetical contribution-weight matrix.""" -) -meme_parser.add_argument("-o", "--output", type=str, help="The path to the output file.") -meme_parser.add_argument("-q", "--quiet", action="store_true", default=False, - help="Suppress output to stdout.") - -seqlet_bed_parser = subparsers.add_parser("seqlet-bed", help="""Output a BED -file of seqlets from a modisco results file to stdout (default) and/or to a -file (if specified).""") -seqlet_bed_parser.add_argument("-i", "--h5py", type=str, required=True, - help="An HDF5 file containing the output from modiscolite.") -seqlet_bed_parser.add_argument("-o", "--output", type=str, default=None, - help="The path to the output file.") -seqlet_bed_parser.add_argument("-p", "--peaksfile", type=str, required=True, - help="The path to the peaks file. This is to compute the absolute start and\ -end positions of the seqlets within a reference genome, as well as the chroms.") -seqlet_bed_parser.add_argument("-c", "--chroms", type=str, required=True, - help="""A comma-delimited list of chromosomes, or '*', denoting which -chromosomes to process. Should be the same set of chromosomes used during -interpretation. '*' will use every chr in the provided peaks file. -Examples: 'chr1,chr2,chrX' || '*' || '1,2,X'.""") -seqlet_bed_parser.add_argument("-q", "--quiet", action="store_true", default=False, - help="Suppress output to stdout.") -seqlet_bed_parser.add_argument("-w", "--windowsize", type=int, - help="""Optional. This is for backwards compatibility for older modisco h5 -files that don't contain the window size as an attribute. This should be set -the to size of the window around the peak center that was used for.""") - -seqlet_fasta_parser = subparsers.add_parser("seqlet-fasta", help="""Output a FASTA -file of seqlets from a modisco results file to stdout (default) and/or to a -file (if specified).""") -seqlet_fasta_parser.add_argument("-i", "--h5py", type=str, required=True, - help="An HDF5 file containing the output from modiscolite.") -seqlet_fasta_parser.add_argument("-o", "--output", type=str, default=None, - help="The path to the output file.") -seqlet_fasta_parser.add_argument("-p", "--peaksfile", type=str, required=True, - help="The path to the peaks file. This is to compute the absolute start and\ -end positions of the seqlets within a reference genome, as well as the chroms.") -seqlet_fasta_parser.add_argument("-s", "--sequences", type=str, required=True, - help="A .npy or .npz file containing the one-hot encoded sequences.") -seqlet_fasta_parser.add_argument("-c", "--chroms", type=str, required=True, - help="""A comma-delimited list of chromosomes, or '*', denoting which -chromosomes to process. Should be the same set of chromosomes used during -interpretation. '*' will use every chr in the provided peaks file. -Examples: 'chr1,chr2,chrX' || '*' || '1,2,X'.""") -seqlet_fasta_parser.add_argument("-q", "--quiet", action="store_true", default=False, - help="Suppress output to stdout.") -seqlet_fasta_parser.add_argument("-w", "--windowsize", type=int, - help="""Optional. This is for backwards compatibility for older modisco h5 -files that don't contain the window size as an attribute. 
This should be set -the to size of the window around the peak center that was used for.""") - -def convert_arg_chroms_to_list(chroms: str) -> Union[List[str], Literal['*']]: - """Converts the chroms argument to a list of chromosomes.""" - if chroms == "*": - # Return all chromosome numbers - return '*' - else: - return chroms.split(",") - -# Pull the arguments -args = parser.parse_args() - -if args.cmd == "motifs": - if args.h5py is not None: - # Load the scores - scores = h5py.File(args.h5py, 'r') - - try: - center = scores['hyp_scores'].shape[1] // 2 - start, end = calculate_window_offsets(center, args.window) - - attributions = scores['hyp_scores'][:, start:end, :] - sequences = scores['input_seqs'][:, start:end, :] - except KeyError: - center = scores['shap']['seq'].shape[2] // 2 - start, end = calculate_window_offsets(center, args.window) - - attributions = scores['shap']['seq'][:, :, start:end].transpose(0, 2, 1) - sequences = scores['raw']['seq'][:, :, start:end].transpose(0, 2, 1) - - print(start, end, attributions.shape, sequences.shape) - - scores.close() - - else: - if args.sequences[-3:] == 'npy': - sequences = np.load(args.sequences) - elif args.sequences[-3:] == 'npz': - sequences = np.load(args.sequences)['arr_0'] - - if args.attributions[-3:] == 'npy': - attributions = np.load(args.attributions) - elif args.attributions[-3:] == 'npz': - attributions = np.load(args.attributions)['arr_0'] - - center = sequences.shape[2] // 2 - start, end = calculate_window_offsets(center, args.window) - - sequences = sequences[:, :, start:end].transpose(0, 2, 1) - attributions = attributions[:, :, start:end].transpose(0, 2, 1) - - if sequences.shape[1] < args.window: - raise ValueError("Window ({}) cannot be ".format(args.window) + - "longer than the sequences".format(sequences.shape)) - - sequences = sequences.astype('float32') - attributions = attributions.astype('float32') - - - pos_patterns, neg_patterns = modiscolite.tfmodisco.TFMoDISco( - hypothetical_contribs=attributions, - one_hot=sequences, - max_seqlets_per_metacluster=args.max_seqlets, - sliding_window_size=args.size, - flank_size=args.seqlet_flank_size, - trim_to_window_size=args.trim_size, - initial_flank_to_add=args.initial_flank_to_add, - final_flank_to_add=args.final_flank_to_add, - target_seqlet_fdr=0.05, - n_leiden_runs=args.n_leiden, - verbose=args.verbose) - - modiscolite.io.save_hdf5(args.output, pos_patterns, neg_patterns, args.window) - -elif args.cmd == 'meme': - modiscolite.io.write_meme_from_h5(args.h5py, args.datatype, args.output, args.quiet) - -elif args.cmd == 'seqlet-bed': - modiscolite.io.write_bed_from_h5(args.h5py, args.peaksfile, args.output, convert_arg_chroms_to_list(args.chroms), args.windowsize, args.quiet) - -elif args.cmd == 'seqlet-fasta': - modiscolite.io.write_fasta_from_h5(args.h5py, args.peaksfile, args.sequences, args.output, convert_arg_chroms_to_list(args.chroms), args.windowsize, args.quiet) - -elif args.cmd == 'report': - modiscolite.report.report_motifs(args.h5py, args.output, - img_path_suffix=args.suffix, meme_motif_db=args.meme_db, - is_writing_tomtom_matrix=args.write_tomtom, - top_n_matches=args.n_matches, ttl=args.lite) - -elif args.cmd == 'convert': - modiscolite.io.convert(args.h5py, args.output) - -elif args.cmd == 'convert-backward': - modiscolite.io.convert_new_to_old(args.h5py, args.output) diff --git a/modiscolite/cluster.py b/modiscolite/cluster.py deleted file mode 100644 index 6ce5558..0000000 --- a/modiscolite/cluster.py +++ /dev/null @@ -1,37 +0,0 @@ -# cluster.py -# 
Authors: Jacob Schreiber -# adapted from code written by Avanti Shrikumar - -import leidenalg -import numpy as np -import igraph as ig - -def LeidenCluster(affinity_mat, n_seeds=2, n_leiden_iterations=-1): - n_vertices = affinity_mat.shape[0] - n_cols = affinity_mat.indptr - sources = np.concatenate([np.ones(n_cols[i+1] - n_cols[i], dtype='int32') * i for i in range(n_vertices)]) - - g = ig.Graph(directed=None) - g.add_vertices(n_vertices) - g.add_edges(zip(sources, affinity_mat.indices)) - - best_clustering = None - best_quality = None - - for seed in range(1, n_seeds+1): - partition = leidenalg.find_partition( - graph=g, - partition_type=leidenalg.ModularityVertexPartition, - weights=affinity_mat.data, - n_iterations=n_leiden_iterations, - initial_membership=None, - seed=seed*100) - - quality = np.array(partition.quality()) - membership = np.array(partition.membership) - - if best_quality is None or quality > best_quality: - best_quality = quality - best_clustering = membership - - return best_clustering diff --git a/modiscolite/report.py b/modiscolite/report.py deleted file mode 100644 index 9feba3f..0000000 --- a/modiscolite/report.py +++ /dev/null @@ -1,359 +0,0 @@ -import os -import h5py -import types -import pickle -import pandas -import shutil -import tempfile -import logomaker - -import matplotlib -matplotlib.use('pdf') -from matplotlib import pyplot as plt - -import numpy as np -import pandas as pd - -from pathlib import Path -from typing import List, Union - -from memelite import tomtom -from memelite.io import read_meme - - -pd.options.display.max_colwidth = 500 - -def compute_per_position_ic(ppm, background, pseudocount): - alphabet_len = len(background) - ic = ((np.log((ppm+pseudocount)/(1 + pseudocount*alphabet_len))/np.log(2)) - *ppm - (np.log(background)*background/np.log(2))[None,:]) - return np.sum(ic,axis=1) - - -def write_meme_file(ppm, bg, fname): - f = open(fname, 'w') - f.write('MEME version 4\n\n') - f.write('ALPHABET= ACGT\n\n') - f.write('strands: + -\n\n') - f.write('Background letter frequencies (from unknown source):\n') - f.write('A %.3f C %.3f G %.3f T %.3f\n\n' % tuple(list(bg))) - f.write('MOTIF 1 TEMP\n') - f.write('letter-probability matrix: alength= 4 w= %d nsites= 1 E= 0e+0\n' % ppm.shape[0]) - for s in ppm: - f.write('%.5f %.5f %.5f %.5f\n' % tuple(s)) - f.write("URL\n\n") - f.close() - - -def fetch_tomtom_matches(ppm, cwm, is_writing_tomtom_matrix, output_dir, - pattern_name, motifs_db, background=[0.25, 0.25, 0.25, 0.25], - tomtom_exec_path='tomtom', trim_threshold=0.3, trim_min_length=3): - - """Fetches top matches from a motifs database using TomTom. - Args: - ppm: position probability matrix- numpy matrix of dimension (N,4) - cwm: contribution weight matrix- numpy matrix of dimension (N,4) - is_writing_tomtom_matrix: if True, write the tomtom matrix to a file - output_dir: directory for writing the TOMTOM file - pattern_name: the name of the pattern, to be used for writing to file - background: list with ACGT background probabilities - tomtom_exec_path: path to TomTom executable - motifs_db: path to motifs database in meme format - n: number of top matches to return, ordered by p-value - temp_dir: directory for storing temp files - trim_threshold: the ppm is trimmed from left till first position for which - probability for any base pair >= trim_threshold. Similarly from right. 
- Returns: - list: a list of up to n results returned by tomtom, each entry is a - dictionary with keys 'Target ID', 'p-value', 'E-value', 'q-value' - """ - - _, fname = tempfile.mkstemp() - _, tomtom_fname = tempfile.mkstemp() - - score = np.sum(np.abs(cwm), axis=1) - trim_thresh = np.max(score) * trim_threshold # Cut off anything less than 30% of max score - pass_inds = np.where(score >= trim_thresh)[0] - trimmed = ppm[np.min(pass_inds): np.max(pass_inds) + 1] - - # can be None of no base has prob>t - if trimmed is None: - return [] - - # trim and prepare meme file - write_meme_file(trimmed, background, fname) - - if not shutil.which(tomtom_exec_path): - raise ValueError(f'`tomtom` executable could not be called globally or locally. Please install it and try again. You may install it using conda with `conda install -c bioconda meme`') - - # run tomtom - cmd = '%s -no-ssc -oc . --verbosity 1 -text -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 %s %s > %s' % (tomtom_exec_path, fname, motifs_db, tomtom_fname) - os.system(cmd) - - tomtom_results = pandas.read_csv(tomtom_fname, sep="\t", usecols=(1, 5), comment='#') - - os.system('rm ' + fname) - if is_writing_tomtom_matrix: - output_subdir = os.path.join(output_dir, "tomtom") - os.makedirs(output_subdir, exist_ok=True) - output_filepath = os.path.join(output_subdir, f"{pattern_name}.tomtom.tsv") - os.system(f'mv {tomtom_fname} {output_filepath}') - else: - os.system('rm ' + tomtom_fname) - return tomtom_results - - -def generate_tomtom_dataframe(modisco_h5py: os.PathLike, - output_dir: os.PathLike, meme_motif_db: Union[os.PathLike, None], - is_writing_tomtom_matrix: bool, pattern_groups: List[str], - top_n_matches=3, tomtom_exec: str="tomtom", trim_threshold=0.3, - trim_min_length=3): - - tomtom_results = {} - - for i in range(top_n_matches): - tomtom_results[f'match{i}'] = [] - tomtom_results[f'qval{i}'] = [] - - with h5py.File(modisco_h5py, 'r') as modisco_results: - for contribution_dir_name in pattern_groups: - if contribution_dir_name not in modisco_results.keys(): - continue - - metacluster = modisco_results[contribution_dir_name] - key = lambda x: int(x[0].split("_")[-1]) - - for idx, (_, pattern) in enumerate(sorted(metacluster.items(), key=key)): - # Rest of your code goes here - - ppm = np.array(pattern['sequence'][:]) - cwm = np.array(pattern["contrib_scores"][:]) - - pattern_name = f'{contribution_dir_name}.pattern_{idx}' - - r = fetch_tomtom_matches(ppm, cwm, - is_writing_tomtom_matrix=is_writing_tomtom_matrix, - output_dir=output_dir, pattern_name=pattern_name, - motifs_db=meme_motif_db, tomtom_exec_path=tomtom_exec, - trim_threshold=trim_threshold, - trim_min_length=trim_min_length) - - i = -1 - for i, (target, qval) in r.iloc[:top_n_matches].iterrows(): - target = target.strip() - - tomtom_results[f'match{i}'].append(target) - tomtom_results[f'qval{i}'].append(qval) - - for j in range(i+1, top_n_matches): - tomtom_results[f'match{j}'].append(None) - tomtom_results[f'qval{j}'].append(None) - - return pandas.DataFrame(tomtom_results) - - -def tomtomlite_dataframe( - modisco_h5py: os.PathLike, - output_dir: os.PathLike, - meme_motif_db: Union[os.PathLike, None], - pattern_groups: List[str], - top_n_matches=3, - trim_threshold=0.3, - trim_min_length=3): - """Use tomtom-lite to match patterns to a motif database.""" - - tomtom_results = {} - - for i in range(top_n_matches): - tomtom_results[f'match{i}'] = [] - tomtom_results[f'pval{i}'] = [] - - ppms = [] - with h5py.File(modisco_h5py, 'r') as modisco_results: - for 
contribution_dir_name in pattern_groups: - if contribution_dir_name not in modisco_results.keys(): - continue - - metacluster = modisco_results[contribution_dir_name] - key = lambda x: int(x[0].split("_")[-1]) - - for idx, (_, pattern) in enumerate(sorted(metacluster.items(), key=key)): - ppm = np.array(pattern['sequence'][:]) - cwm = np.array(pattern["contrib_scores"][:]) - - score = np.sum(np.abs(cwm), axis=1) - trim_thresh = np.max(score) * trim_threshold # Cut off anything less than 30% of max score - pass_inds = np.where(score >= trim_thresh)[0] - - ppm = ppm[np.min(pass_inds): np.max(pass_inds) + 1] - ppms.append(ppm.T) - - target_db = read_meme(meme_motif_db) - target_names = list(target_db.keys()) - target_pwms = list(target_db.values()) - - p, scores, offsets, overlaps, strands, idxs = tomtom(ppms, target_pwms, - n_nearest=top_n_matches) - - for i in range(idxs.shape[0]): - for j in range(top_n_matches): - target_name = target_names[int(idxs[i, j])].strip() - pval = p[i, j] - - tomtom_results[f'match{j}'].append(target_name) - tomtom_results[f'pval{j}'].append(pval) - - return pandas.DataFrame(tomtom_results) - - -def path_to_image_html(path): - return '' - - -def _plot_weights(array, path, figsize=(10,3)): - """Plot weights as a sequence logo and save to file.""" - fig = plt.figure(figsize=figsize) - ax = fig.add_subplot(111) - - df = pandas.DataFrame(array, columns=['A', 'C', 'G', 'T']) - df.index.name = 'pos' - - crp_logo = logomaker.Logo(df, ax=ax) - crp_logo.style_spines(visible=False) - plt.ylim(min(df.sum(axis=1).min(), 0), df.sum(axis=1).max()) - - plt.savefig(path) - plt.close() - - -def make_logo(match, logo_dir, motifs): - if match == 'NA': - return - - background = np.array([0.25, 0.25, 0.25, 0.25]) - ppm = motifs[match] - ic = compute_per_position_ic(ppm, background, 0.001) - - _plot_weights(ppm*ic[:, None], path='{}/{}.png'.format(logo_dir, match)) - - -def create_modisco_logos(modisco_h5py: os.PathLike, modisco_logo_dir, trim_threshold, pattern_groups: List[str]): - """Open a modisco results file and create and write logos to file for each pattern.""" - modisco_results = h5py.File(modisco_h5py, 'r') - - tags = [] - - for name in pattern_groups: - if name not in modisco_results.keys(): - continue - - metacluster = modisco_results[name] - key = lambda x: int(x[0].split("_")[-1]) - for pattern_name, pattern in sorted(metacluster.items(), key=key): - tag = '{}.{}'.format(name, pattern_name) - tags.append(tag) - - cwm_fwd = np.array(pattern['contrib_scores'][:]) - cwm_rev = cwm_fwd[::-1, ::-1] - - score_fwd = np.sum(np.abs(cwm_fwd), axis=1) - score_rev = np.sum(np.abs(cwm_rev), axis=1) - - trim_thresh_fwd = np.max(score_fwd) * trim_threshold - trim_thresh_rev = np.max(score_rev) * trim_threshold - - pass_inds_fwd = np.where(score_fwd >= trim_thresh_fwd)[0] - pass_inds_rev = np.where(score_rev >= trim_thresh_rev)[0] - - start_fwd, end_fwd = max(np.min(pass_inds_fwd) - 4, 0), min(np.max(pass_inds_fwd) + 4 + 1, len(score_fwd) + 1) - start_rev, end_rev = max(np.min(pass_inds_rev) - 4, 0), min(np.max(pass_inds_rev) + 4 + 1, len(score_rev) + 1) - - trimmed_cwm_fwd = cwm_fwd[start_fwd:end_fwd] - trimmed_cwm_rev = cwm_rev[start_rev:end_rev] - - _plot_weights(trimmed_cwm_fwd, path='{}/{}.cwm.fwd.png'.format(modisco_logo_dir, tag)) - _plot_weights(trimmed_cwm_rev, path='{}/{}.cwm.rev.png'.format(modisco_logo_dir, tag)) - - modisco_results.close() - return tags - -def report_motifs(modisco_h5py: Path, output_dir: os.PathLike, img_path_suffix: os.PathLike, - meme_motif_db: 
Union[os.PathLike, None], is_writing_tomtom_matrix: bool, top_n_matches=3, - trim_threshold=0.3, trim_min_length=3, ttl=False): - - if not os.path.isdir(output_dir): - os.mkdir(output_dir) - - modisco_logo_dir = os.path.join(output_dir, 'trimmed_logos') - if not os.path.isdir(modisco_logo_dir): - os.mkdir(modisco_logo_dir) - - pattern_groups = ['pos_patterns', 'neg_patterns'] - - create_modisco_logos(modisco_h5py, modisco_logo_dir, trim_threshold, pattern_groups) - - results = {'pattern': [], 'num_seqlets': [], 'modisco_cwm_fwd': [], 'modisco_cwm_rev': []} - - with h5py.File(modisco_h5py, 'r') as modisco_results: - for name in pattern_groups: - if name not in modisco_results.keys(): - continue - - metacluster = modisco_results[name] - key = lambda x: int(x[0].split("_")[-1]) - for pattern_name, pattern in sorted(metacluster.items(), key=key): - num_seqlets = pattern['seqlets']['n_seqlets'][:][0] - pattern_tag = f'{name}.{pattern_name}' - - results['pattern'].append(pattern_tag) - results['num_seqlets'].append(num_seqlets) - results['modisco_cwm_fwd'].append(os.path.join(img_path_suffix, 'trimmed_logos', f'{pattern_tag}.cwm.fwd.png')) - results['modisco_cwm_rev'].append(os.path.join(img_path_suffix, 'trimmed_logos', f'{pattern_tag}.cwm.rev.png')) - - patterns_df = pd.DataFrame(results) - reordered_columns = ['pattern', 'num_seqlets', 'modisco_cwm_fwd', 'modisco_cwm_rev'] - - # If the optional meme_motif_db is not provided, then we won't generate TOMTOM comparison. - if meme_motif_db is not None: - motifs = read_meme(meme_motif_db) - motifs = {name: pwm.T for name, pwm in motifs.items()} - - if ttl: - tomtom_df = tomtomlite_dataframe(modisco_h5py, output_dir, meme_motif_db, - top_n_matches=top_n_matches, pattern_groups=pattern_groups, - trim_threshold=trim_threshold, trim_min_length=trim_min_length) - else: - motifs = {key.split()[0]: value for key, value in motifs.items()} - - tomtom_df = generate_tomtom_dataframe(modisco_h5py, output_dir, meme_motif_db, - is_writing_tomtom_matrix, - top_n_matches=top_n_matches, tomtom_exec='tomtom', - pattern_groups=pattern_groups, trim_threshold=trim_threshold, - trim_min_length=trim_min_length) - - patterns_df = pandas.concat([patterns_df, tomtom_df], axis=1) - - for i in range(top_n_matches): - name = f'match{i}' - logos = [] - - for _, row in patterns_df.iterrows(): - if name in patterns_df.columns: - if pandas.isnull(row[name]): - logos.append("NA") - else: - make_logo(row[name], output_dir, motifs) - logos.append(f'{img_path_suffix}{row[name]}.png') - else: - break - - patterns_df[f"{name}_logo"] = logos - val = f'pval{i}' if ttl else f'qval{i}' - reordered_columns.extend([name, val, f'{name}_logo']) - - patterns_df = patterns_df[reordered_columns] - patterns_df.to_html(open(os.path.join(output_dir, 'motifs.html'), 'w'), - escape=False, formatters=dict(modisco_cwm_fwd=path_to_image_html, - modisco_cwm_rev=path_to_image_html, match0_logo=path_to_image_html, - match1_logo=path_to_image_html, match2_logo=path_to_image_html), - index=False) diff --git a/modiscolite/tfmodisco.py b/modiscolite/tfmodisco.py deleted file mode 100644 index 855f924..0000000 --- a/modiscolite/tfmodisco.py +++ /dev/null @@ -1,372 +0,0 @@ -# tfmodisco.py -# Authors: Jacob Schreiber -# adapted from code written by Avanti Shrikumar - -import numpy as np - -import scipy -import scipy.sparse - -from collections import OrderedDict -from collections import defaultdict - -from . import affinitymat -from . import aggregator -from . import extract_seqlets -from . import core -from . 
import util -from . import cluster - -def _density_adaptation(affmat_nn, seqlet_neighbors, tsne_perplexity): - eps = 0.0000001 - - rows, cols, data = [], [], [] - for row in range(len(affmat_nn)): - for col, datum in zip(seqlet_neighbors[row], affmat_nn[row]): - rows.append(row) - cols.append(col) - data.append(datum) - - affmat_nn = scipy.sparse.csr_matrix((data, (rows, cols)), - shape=(len(affmat_nn), len(affmat_nn)), dtype='float64') - - affmat_nn.data = np.maximum(np.log((1.0/(0.5*np.maximum(affmat_nn.data, eps)))-1), 0) - affmat_nn.eliminate_zeros() - - counts_nn = scipy.sparse.csr_matrix((np.ones_like(affmat_nn.data), - affmat_nn.indices, affmat_nn.indptr), shape=affmat_nn.shape, dtype='float64') - - affmat_nn += affmat_nn.T - counts_nn += counts_nn.T - affmat_nn.data /= counts_nn.data - del counts_nn - - betas = [util.binary_search_perplexity(tsne_perplexity, affmat_nn[i].data) for i in range(affmat_nn.shape[0])] - normfactors = np.array([np.exp(-np.array(affmat_nn[i].data)/beta).sum()+1 for i, beta in enumerate(betas)]) - - for i in range(affmat_nn.shape[0]): - for j_idx in range(affmat_nn.indptr[i], affmat_nn.indptr[i+1]): - j = affmat_nn.indices[j_idx] - distance = affmat_nn.data[j_idx] - - rbf_i = np.exp(-distance / betas[i]) / normfactors[i] - rbf_j = np.exp(-distance / betas[j]) / normfactors[j] - - affmat_nn.data[j_idx] = np.sqrt(rbf_i * rbf_j) - - affmat_diags = scipy.sparse.diags(1.0 / normfactors) - affmat_nn += affmat_diags - return affmat_nn - - -def _filter_patterns(patterns, min_seqlet_support, window_size, - min_ic_in_window, background, ppm_pseudocount): - passing_patterns = [] - for pattern in patterns: - if len(pattern.seqlets) < min_seqlet_support: - continue - - ppm = pattern.sequence - per_position_ic = util.compute_per_position_ic(ppm=ppm, - background=background, pseudocount=ppm_pseudocount) - - if len(per_position_ic) < window_size: - if np.sum(per_position_ic) < min_ic_in_window: - continue - else: - #do the sliding window sum rearrangement - windowed_ic = np.sum(util.rolling_window( - a=per_position_ic, window=window_size), axis=-1) - - if np.max(windowed_ic) < min_ic_in_window: - continue - - passing_patterns.append(pattern) - - return passing_patterns - - -def _patterns_from_clusters(seqlets, track_set, min_overlap, - min_frac, min_num, flank_to_add, window_size, bg_freq, cluster_indices, - track_sign): - - seqlet_sort_metric = lambda x: -np.sum(np.abs(x.contrib_scores)) - num_clusters = max(cluster_indices+1) - cluster_to_seqlets = defaultdict(list) - - for seqlet, idx in zip(seqlets, cluster_indices): - cluster_to_seqlets[idx].append(seqlet) - - patterns = [] - for i in range(num_clusters): - sorted_seqlets = sorted(cluster_to_seqlets[i], key=seqlet_sort_metric) - pattern = core.SeqletSet([sorted_seqlets[0]]) - - if len(sorted_seqlets) > 1: - pattern = aggregator.merge_in_seqlets_filledges( - parent_pattern=pattern, seqlets_to_merge=sorted_seqlets[1:], - track_set=track_set, metric=affinitymat.jaccard, - min_overlap=min_overlap) - - pattern = aggregator.polish_pattern(pattern, min_frac=min_frac, - min_num=min_num, track_set=track_set, flank=flank_to_add, - window_size=window_size, bg_freq=bg_freq) - - if pattern is not None: - if np.sign(np.sum(pattern.contrib_scores)) == track_sign: - patterns.append(pattern) - - return patterns - - -def _filter_by_correlation(seqlets, seqlet_neighbors, coarse_affmat_nn, - fine_affmat_nn, correlation_threshold): - - correlations = [] - for fine_affmat_row, coarse_affmat_row in zip(fine_affmat_nn, coarse_affmat_nn): - 
to_compare_mask = np.abs(fine_affmat_row) > 0 - corr = scipy.stats.spearmanr(fine_affmat_row[to_compare_mask], - coarse_affmat_row[to_compare_mask]) - correlations.append(corr.correlation) - - correlations = np.array(correlations) - filtered_rows_mask = np.array(correlations) > correlation_threshold - - filtered_seqlets = [seqlet for seqlet, mask in zip(seqlets, - filtered_rows_mask) if mask == True] - - #figure out a mapping from pre-filtering to the - # post-filtering indices - new_idx_mapping = np.cumsum(filtered_rows_mask) - 1 - retained_indices = set(np.where(filtered_rows_mask == True)[0]) - - filtered_neighbors = [] - filtered_affmat_nn = [] - for old_row_idx, (old_neighbors, affmat_row) in enumerate(zip(seqlet_neighbors, fine_affmat_nn)): - if old_row_idx in retained_indices: - filtered_old_neighbors = [neighbor for neighbor in old_neighbors if neighbor in retained_indices] - filtered_affmat_row = [affmatval for affmatval, neighbor in zip(affmat_row,old_neighbors) if neighbor in retained_indices] - filtered_neighbors_row = [new_idx_mapping[neighbor] for neighbor in filtered_old_neighbors] - filtered_neighbors.append(filtered_neighbors_row) - filtered_affmat_nn.append(filtered_affmat_row) - - return filtered_seqlets, filtered_neighbors, filtered_affmat_nn - - -def seqlets_to_patterns(seqlets, track_set, track_signs=None, - min_overlap_while_sliding=0.7, nearest_neighbors_to_compute=500, - affmat_correlation_threshold=0.15, tsne_perplexity=10.0, - n_leiden_iterations=-1, n_leiden_runs=50, frac_support_to_trim_to=0.2, - min_num_to_trim_to=30, trim_to_window_size=20, initial_flank_to_add=5, - final_flank_to_add=0, - prob_and_pertrack_sim_merge_thresholds=[(0.8,0.8), (0.5, 0.85), (0.2, 0.9)], - prob_and_pertrack_sim_dealbreaker_thresholds=[(0.4, 0.75), (0.2,0.8), (0.1, 0.85), (0.0,0.9)], - subcluster_perplexity=50, merging_max_seqlets_subsample=300, - final_min_cluster_size=20,min_ic_in_window=0.6, min_ic_windowsize=6, - ppm_pseudocount=0.001): - - bg_freq = np.mean([seqlet.sequence for seqlet in seqlets], axis=(0, 1)) - - seqlets_sorter = (lambda arr: sorted(arr, key=lambda x: - -np.sum(np.abs(x.contrib_scores)))) - - seqlets = seqlets_sorter(seqlets) - - for round_idx in range(2): - if len(seqlets) == 0: - return None - - # Step 1: Generate coarse resolution - coarse_affmat_nn, seqlet_neighbors = affinitymat.cosine_similarity_from_seqlets( - seqlets=seqlets, n_neighbors=nearest_neighbors_to_compute, sign=track_signs) - - # Step 2: Generate fine representation - fine_affmat_nn = affinitymat.jaccard_from_seqlets( - seqlets=seqlets, seqlet_neighbors=seqlet_neighbors, - min_overlap=min_overlap_while_sliding) - - if round_idx == 0: - filtered_seqlets, seqlet_neighbors, filtered_affmat_nn = ( - _filter_by_correlation(seqlets, seqlet_neighbors, - coarse_affmat_nn, fine_affmat_nn, - affmat_correlation_threshold)) - else: - filtered_seqlets = seqlets - filtered_affmat_nn = fine_affmat_nn - - del coarse_affmat_nn - del fine_affmat_nn - del seqlets - - # Step 4: Density adaptation - csr_density_adapted_affmat = _density_adaptation( - filtered_affmat_nn, seqlet_neighbors, tsne_perplexity) - - del filtered_affmat_nn - del seqlet_neighbors - - # Step 5: Clustering - cluster_indices = cluster.LeidenCluster( - csr_density_adapted_affmat, - n_seeds=n_leiden_runs, - n_leiden_iterations=n_leiden_iterations) - - del csr_density_adapted_affmat - - patterns = _patterns_from_clusters(filtered_seqlets, - track_set=track_set, - min_overlap=min_overlap_while_sliding, - min_frac=frac_support_to_trim_to, - 
min_num=min_num_to_trim_to, - flank_to_add=initial_flank_to_add, - window_size=trim_to_window_size, - bg_freq=bg_freq, - cluster_indices=cluster_indices, - track_sign=track_signs) - - #obtain unique seqlets from adjusted motifs - seqlets = list(dict([(y.string, y) - for x in patterns for y in x.seqlets]).values()) - - del seqlets - - merged_patterns, pattern_merge_hierarchy = aggregator._detect_spurious_merging( - patterns=patterns, track_set=track_set, perplexity=subcluster_perplexity, - min_in_subcluster=max(final_min_cluster_size, subcluster_perplexity), - min_overlap=min_overlap_while_sliding, - prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, - prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, - min_frac=frac_support_to_trim_to, min_num=min_num_to_trim_to, - flank_to_add=initial_flank_to_add, - window_size=trim_to_window_size, bg_freq=bg_freq, - max_seqlets_subsample=merging_max_seqlets_subsample, - n_seeds=n_leiden_runs) - - #Now start merging patterns - merged_patterns = sorted(merged_patterns, key=lambda x: -len(x.seqlets)) - - patterns = _filter_patterns(merged_patterns, - min_seqlet_support=final_min_cluster_size, - window_size=min_ic_windowsize, min_ic_in_window=min_ic_in_window, - background=bg_freq, ppm_pseudocount=ppm_pseudocount) - - #apply subclustering procedure on the final patterns - for patternidx, pattern in enumerate(patterns): - pattern = aggregator._expand_seqlets_to_fill_pattern(pattern, track_set, - left_flank_to_add=final_flank_to_add, - right_flank_to_add=final_flank_to_add) - - pattern.compute_subpatterns(subcluster_perplexity, - n_seeds=n_leiden_runs, n_iterations=n_leiden_iterations) - - patterns[patternidx] = pattern - - return patterns - - -def TFMoDISco(one_hot, hypothetical_contribs, sliding_window_size=21, - flank_size=10, min_metacluster_size=100, - weak_threshold_for_counting_sign=0.8, max_seqlets_per_metacluster=20000, - target_seqlet_fdr=0.2, min_passing_windows_frac=0.03, - max_passing_windows_frac=0.2, n_leiden_runs=50, n_leiden_iterations=-1, - min_overlap_while_sliding=0.7, nearest_neighbors_to_compute=500, - affmat_correlation_threshold=0.15, tsne_perplexity=10.0, - frac_support_to_trim_to=0.2, min_num_to_trim_to=30, trim_to_window_size=30, - initial_flank_to_add=10, final_flank_to_add=0, - prob_and_pertrack_sim_merge_thresholds=[(0.8,0.8), (0.5, 0.85), (0.2, 0.9)], - prob_and_pertrack_sim_dealbreaker_thresholds=[(0.4, 0.75), (0.2,0.8), (0.1, 0.85), (0.0,0.9)], - subcluster_perplexity=50, merging_max_seqlets_subsample=300, - final_min_cluster_size=20, min_ic_in_window=0.6, min_ic_windowsize=6, - ppm_pseudocount=0.001, verbose=False): - - contrib_scores = np.multiply(one_hot, hypothetical_contribs) - - track_set = core.TrackSet(one_hot=one_hot, - contrib_scores=contrib_scores, - hypothetical_contribs=hypothetical_contribs) - - seqlet_coords, threshold = extract_seqlets.extract_seqlets( - attribution_scores=contrib_scores.sum(axis=2), - window_size=sliding_window_size, - flank=flank_size, - suppress=(int(0.5*sliding_window_size) + flank_size), - target_fdr=target_seqlet_fdr, - min_passing_windows_frac=min_passing_windows_frac, - max_passing_windows_frac=max_passing_windows_frac, - weak_threshold_for_counting_sign=weak_threshold_for_counting_sign) - - seqlets = track_set.create_seqlets(seqlet_coords) - - pos_seqlets, neg_seqlets = [], [] - for seqlet in seqlets: - flank = int(0.5*(len(seqlet)-sliding_window_size)) - attr = np.sum(seqlet.contrib_scores[flank:-flank]) - - if attr 
> threshold: - pos_seqlets.append(seqlet) - elif attr < -threshold: - neg_seqlets.append(seqlet) - - del seqlets - - if len(pos_seqlets) > min_metacluster_size: - pos_seqlets = pos_seqlets[:max_seqlets_per_metacluster] - if verbose: - print("Using {} positive seqlets".format(len(pos_seqlets))) - - pos_patterns = seqlets_to_patterns(seqlets=pos_seqlets, - track_set=track_set, - track_signs=1, - min_overlap_while_sliding=min_overlap_while_sliding, - nearest_neighbors_to_compute=nearest_neighbors_to_compute, - affmat_correlation_threshold=affmat_correlation_threshold, - tsne_perplexity=tsne_perplexity, - n_leiden_iterations=n_leiden_iterations, - n_leiden_runs=n_leiden_runs, - frac_support_to_trim_to=frac_support_to_trim_to, - min_num_to_trim_to=min_num_to_trim_to, - trim_to_window_size=trim_to_window_size, - initial_flank_to_add=initial_flank_to_add, - final_flank_to_add=final_flank_to_add, - prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, - prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, - subcluster_perplexity=subcluster_perplexity, - merging_max_seqlets_subsample=merging_max_seqlets_subsample, - final_min_cluster_size=final_min_cluster_size, - min_ic_in_window=min_ic_in_window, - min_ic_windowsize=min_ic_windowsize, - ppm_pseudocount=ppm_pseudocount) - else: - pos_patterns = None - - if len(neg_seqlets) > min_metacluster_size: - neg_seqlets = neg_seqlets[:max_seqlets_per_metacluster] - if verbose: - print("Extracted {} negative seqlets".format(len(neg_seqlets))) - - neg_patterns = seqlets_to_patterns(seqlets=neg_seqlets, - track_set=track_set, - track_signs=-1, - min_overlap_while_sliding=min_overlap_while_sliding, - nearest_neighbors_to_compute=nearest_neighbors_to_compute, - affmat_correlation_threshold=affmat_correlation_threshold, - tsne_perplexity=tsne_perplexity, - n_leiden_iterations=n_leiden_iterations, - n_leiden_runs=n_leiden_runs, - frac_support_to_trim_to=frac_support_to_trim_to, - min_num_to_trim_to=min_num_to_trim_to, - trim_to_window_size=trim_to_window_size, - initial_flank_to_add=initial_flank_to_add, - final_flank_to_add=final_flank_to_add, - prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds, - prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds, - subcluster_perplexity=subcluster_perplexity, - merging_max_seqlets_subsample=merging_max_seqlets_subsample, - final_min_cluster_size=final_min_cluster_size, - min_ic_in_window=min_ic_in_window, - min_ic_windowsize=min_ic_windowsize, - ppm_pseudocount=ppm_pseudocount) - else: - neg_patterns = None - - return pos_patterns, neg_patterns diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..922618f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "faster-modisco-lite" +version = "3.0.0" +description = "Transcription Factor MOtif Discovery from Importance SCOres - lite" +readme = "README.md" +requires-python = ">=3.7" +license = { file = "LICENSE" } +authors = [ + { name = "Muhammed Hasan Celik", email = "muhammedhasancelik@gmail.com" }, + { name = "Jacob Schreiber" }, +] +dependencies = [ + "click", + "numpy>=1.21.5", + "scipy>=1.6.2", + "numba>=0.53.1", + "scikit-learn>=1.0.2", + "leidenalg>=0.8.10", + "igraph>=0.9.11", + "tqdm>=4.38.0", + "pandas>=1.4.3", + "logomaker>=0.8", + "h5py>=3.7.0", + "memelite", + "joblib", + "numba-progress", +] + 
+[project.optional-dependencies] +test = ["pytest", "pytest-cov"] +dev = ["pytest", "pytest-cov"] + +[project.scripts] +fastermodisco = "fastermodiscolite.cli:cli" + +[project.urls] +Homepage = "https://github.com/MuhammedHasan/tfmodisco-lite" + +[tool.setuptools] +packages = ["fastermodiscolite"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 9912bfb..0000000 --- a/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -from setuptools import setup - -setup( - name='modisco-lite', - version='2.4.0', - author='Jacob Schreiber', - author_email='jmschreiber91@gmail.com', - packages=['modiscolite'], - python_requires='>=3.7', - scripts=['modisco'], - url='https://github.com/jmschrei/tfmodisco-lite', - license='LICENSE.txt', - description='Transcription Factor MOtif Discovery from Importance SCOres - lite', - install_requires=[ - 'numpy >= 1.21.5', - 'scipy >= 1.6.2', - 'numba >= 0.53.1', - 'scikit-learn >= 1.0.2', - 'leidenalg >= 0.8.10', - 'igraph >= 0.9.11', - 'tqdm >= 4.38.0', - 'pandas >= 1.4.3', - 'logomaker >= 0.8', - 'h5py >= 3.7.0', - 'hdf5plugin', - 'memelite' - ] -) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8135b81 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,21 @@ +from pathlib import Path +from urllib.request import urlretrieve +import pytest + + +BASE_URL = "https://raw.githubusercontent.com/AvantiShri/model_storage/d53ee8e/modisco/gkmexplain_scores" + + +@pytest.fixture(scope="session") +def data_ohe_hyps(): + data_dir = Path(__file__).parent / "data" + data_dir.mkdir(parents=True, exist_ok=True) + + if not (data_dir / "ohe1.npz").exists(): + urlretrieve(f"{BASE_URL}/ohe1.npz", data_dir / "ohe1.npz") + + if not (data_dir / "hypscores1.npz").exists(): + urlretrieve(f"{BASE_URL}/hypscores1.npz", data_dir / "hypscores1.npz") + + return data_dir / "ohe1.npz", data_dir / "hypscores1.npz" + diff --git a/tests/test_motif.py b/tests/test_motif.py new file mode 100644 index 0000000..76b3a5e --- /dev/null +++ b/tests/test_motif.py @@ -0,0 +1,182 @@ +import random +import h5py +import numpy as np +from click.testing import CliRunner +from fastermodiscolite.cli import motifs +from conftest import data_ohe_hyps + + +def test_modisco_motif(data_ohe_hyps, tmp_path): + random.seed(42) + np.random.seed(42) + + ohe, hyps = data_ohe_hyps + output = tmp_path / "modisco_results.h5" + + runner = CliRunner() + result = runner.invoke(motifs, ["-s", ohe, "-a", hyps, "-n", 2000, "-o", output, '-w', 200, '-v']) + assert result.exit_code == 0 + + with h5py.File(output, "r") as f: + assert list(f['pos_patterns'].keys()) == ['pattern_0', 'pattern_1', 'pattern_2'] + + np.testing.assert_allclose(f['pos_patterns']['pattern_0']['sequence'][:], np.array([ + [0.29230769, 0.32307692, 0.16923077, 0.21538462], + [0.26153846, 0.21538462, 0.21538462, 0.30769231], + [0.24615385, 0.29230769, 0.15384615, 0.30769231], + [0.24615385, 0.27692308, 0.29230769, 0.18461538], + [0.23076923, 0.24615385, 0.2 , 0.32307692], + [0.26153846, 0.24615385, 0.30769231, 0.18461538], + [0.26153846, 0.27692308, 0.23076923, 0.23076923], + [0.35384615, 0.18461538, 0.23076923, 0.23076923], + [0.30769231, 0.21538462, 0.26153846, 0.21538462], + [0.32307692, 0.27692308, 0.21538462, 0.18461538], + [0.24615385, 0.09230769, 0.4 , 0.26153846], + [0.10769231, 0.87692308, 0.01538462, 0. ], + [0.4 , 0.24615385, 0.07692308, 0.27692308], + [0. , 1. , 0. , 0. ], + [0.92307692, 0.04615385, 0. , 0.03076923], + [0.01538462, 0.03076923, 0.95384615, 0. ], + [0. , 0.93846154, 0. 
, 0.06153846], + [0.72307692, 0.01538462, 0.09230769, 0.16923077], + [0.01538462, 0. , 0.89230769, 0.09230769], + [0. , 0.01538462, 0.96923077, 0.01538462], + [0.44615385, 0.12307692, 0.23076923, 0.2 ], + [0.33846154, 0.09230769, 0.41538462, 0.15384615], + [0.29230769, 0.2 , 0.33846154, 0.16923077], + [0.09230769, 0.33846154, 0.26153846, 0.30769231], + [0.29230769, 0.27692308, 0.15384615, 0.27692308], + [0.30769231, 0.24615385, 0.26153846, 0.18461538], + [0.32307692, 0.16923077, 0.30769231, 0.2 ], + [0.23076923, 0.18461538, 0.27692308, 0.30769231], + [0.35384615, 0.21538462, 0.26153846, 0.16923077], + [0.29230769, 0.15384615, 0.33846154, 0.21538462], + [0.24615385, 0.18461538, 0.33846154, 0.23076923], + [0.15384615, 0.30769231, 0.29230769, 0.24615385], + [0.27692308, 0.30769231, 0.24615385, 0.16923077], + [0.29230769, 0.21538462, 0.23076923, 0.26153846], + [0.30769231, 0.13846154, 0.24615385, 0.30769231], + [0.38461538, 0.18461538, 0.26153846, 0.16923077], + [0.38461538, 0.15384615, 0.2 , 0.26153846], + [0.26153846, 0.21538462, 0.24615385, 0.27692308], + [0.23076923, 0.23076923, 0.26153846, 0.27692308], + [0.12307692, 0.26153846, 0.41538462, 0.2 ], + [0.36923077, 0.18461538, 0.29230769, 0.15384615], + [0.2 , 0.18461538, 0.30769231, 0.30769231], + [0.29230769, 0.24615385, 0.26153846, 0.2 ], + [0.27692308, 0.26153846, 0.33846154, 0.12307692], + [0.38461538, 0.16923077, 0.24615385, 0.2 ], + [0.26153846, 0.33846154, 0.18461538, 0.21538462], + [0.47692308, 0.16923077, 0.10769231, 0.24615385], + [0.23076923, 0.27692308, 0.24615385, 0.24615385], + [0.21538462, 0.15384615, 0.32307692, 0.30769231], + [0.21538462, 0.21538462, 0.36923077, 0.2 ] + ]), atol=1e-3, rtol=1e-3) + + np.testing.assert_allclose(f['pos_patterns']['pattern_1']['sequence'][:], np.array([ + [0.29268293, 0.31707317, 0.17073171, 0.2195122 ], + [0.2195122 , 0.17073171, 0.2195122 , 0.3902439 ], + [0.17073171, 0.31707317, 0.17073171, 0.34146341], + [0.24390244, 0.31707317, 0.14634146, 0.29268293], + [0.29268293, 0.17073171, 0.2195122 , 0.31707317], + [0.2195122 , 0.2195122 , 0.36585366, 0.19512195], + [0.24390244, 0.2195122 , 0.29268293, 0.24390244], + [0.34146341, 0.17073171, 0.2195122 , 0.26829268], + [0.34146341, 0.19512195, 0.2195122 , 0.24390244], + [0.24390244, 0.24390244, 0.31707317, 0.19512195], + [0.19512195, 0.26829268, 0.26829268, 0.26829268], + [0.14634146, 0.41463415, 0.26829268, 0.17073171], + [0.29268293, 0.34146341, 0.12195122, 0.24390244], + [0.34146341, 0.2195122 , 0.19512195, 0.24390244], + [0.29268293, 0.2195122 , 0.19512195, 0.29268293], + [0.17073171, 0.2195122 , 0.17073171, 0.43902439], + [0.14634146, 0.31707317, 0.2195122 , 0.31707317], + [0.14634146, 0.09756098, 0.17073171, 0.58536585], + [0.19512195, 0.19512195, 0.53658537, 0.07317073], + [0.41463415, 0.29268293, 0.2195122 , 0.07317073], + [0.68292683, 0.07317073, 0.14634146, 0.09756098], + [0.29268293, 0.02439024, 0.2195122 , 0.46341463], + [0.73170732, 0.09756098, 0.17073171, 0. ], + [0.97560976, 0. , 0. , 0.02439024], + [0. , 0.92682927, 0.04878049, 0.02439024], + [1. , 0. , 0. , 0. ], + [1. , 0. , 0. , 0. ], + [0.56097561, 0. , 0. , 0.43902439], + [0.2195122 , 0. , 0.75609756, 0.02439024], + [0.19512195, 0.14634146, 0.65853659, 0. 
], + [0.3902439 , 0.24390244, 0.31707317, 0.04878049], + [0.31707317, 0.36585366, 0.12195122, 0.19512195], + [0.19512195, 0.29268293, 0.19512195, 0.31707317], + [0.24390244, 0.36585366, 0.14634146, 0.24390244], + [0.24390244, 0.41463415, 0.17073171, 0.17073171], + [0.36585366, 0.26829268, 0.14634146, 0.2195122 ], + [0.2195122 , 0.19512195, 0.17073171, 0.41463415], + [0.29268293, 0.12195122, 0.24390244, 0.34146341], + [0.26829268, 0.31707317, 0.24390244, 0.17073171], + [0.26829268, 0.31707317, 0.2195122 , 0.19512195], + [0.2195122 , 0.19512195, 0.29268293, 0.29268293], + [0.14634146, 0.36585366, 0.2195122 , 0.26829268], + [0.26829268, 0.26829268, 0.24390244, 0.2195122 ], + [0.17073171, 0.2195122 , 0.26829268, 0.34146341], + [0.17073171, 0.17073171, 0.29268293, 0.36585366], + [0.19512195, 0.36585366, 0.2195122 , 0.2195122 ], + [0.34146341, 0.09756098, 0.2195122 , 0.34146341], + [0.41463415, 0.2195122 , 0.14634146, 0.2195122 ], + [0.24390244, 0.31707317, 0.26829268, 0.17073171], + [0.19512195, 0.19512195, 0.2195122 , 0.3902439 ] + ]), atol=1e-3, rtol=1e-3) + + np.testing.assert_allclose(f['pos_patterns']['pattern_2']['sequence'], np.array([ + [0.125 , 0.34375, 0.28125, 0.25 ], + [0.25 , 0.3125 , 0.15625, 0.28125], + [0.25 , 0.28125, 0.34375, 0.125 ], + [0.15625, 0.28125, 0.1875 , 0.375 ], + [0.21875, 0.1875 , 0.28125, 0.3125 ], + [0.375 , 0.125 , 0.28125, 0.21875], + [0.21875, 0.21875, 0.375 , 0.1875 ], + [0.3125 , 0.21875, 0.21875, 0.25 ], + [0.25 , 0.25 , 0.375 , 0.125 ], + [0.34375, 0.25 , 0.3125 , 0.09375], + [0.28125, 0.40625, 0.21875, 0.09375], + [0.21875, 0.21875, 0.34375, 0.21875], + [0.21875, 0.28125, 0.09375, 0.40625], + [0.125 , 0.6875 , 0.15625, 0.03125], + [0.90625, 0.0625 , 0. , 0.03125], + [0. , 0. , 0. , 1. ], + [0.03125, 0.21875, 0. , 0.75 ], + [0.53125, 0. , 0.03125, 0.4375 ], + [0.1875 , 0. , 0.75 , 0.0625 ], + [0.03125, 0.96875, 0. , 0. ], + [1. , 0. , 0. , 0. ], + [0.09375, 0.0625 , 0. , 0.84375], + [0.46875, 0.125 , 0.15625, 0.25 ], + [0.375 , 0.125 , 0. , 0.5 ], + [0.1875 , 0.4375 , 0.25 , 0.125 ], + [0.84375, 0.03125, 0.0625 , 0.0625 ], + [0.875 , 0.03125, 0.0625 , 0.03125], + [0.5 , 0.0625 , 0.03125, 0.40625], + [0.25 , 0.125 , 0.5 , 0.125 ], + [0.53125, 0.0625 , 0.3125 , 0.09375], + [0.25 , 0.21875, 0.34375, 0.1875 ], + [0.28125, 0.3125 , 0.21875, 0.1875 ], + [0.25 , 0.34375, 0.21875, 0.1875 ], + [0.25 , 0.375 , 0.09375, 0.28125], + [0.125 , 0.34375, 0.25 , 0.28125], + [0.375 , 0.21875, 0.15625, 0.25 ], + [0.3125 , 0.21875, 0.21875, 0.25 ], + [0.21875, 0.28125, 0.125 , 0.375 ], + [0.21875, 0.125 , 0.53125, 0.125 ], + [0.375 , 0.15625, 0.3125 , 0.15625], + [0.28125, 0.25 , 0.21875, 0.25 ], + [0.21875, 0.25 , 0.34375, 0.1875 ], + [0.25 , 0.15625, 0.28125, 0.3125 ], + [0.125 , 0.25 , 0.21875, 0.40625], + [0.21875, 0.28125, 0.28125, 0.21875], + [0.15625, 0.28125, 0.21875, 0.34375], + [0.28125, 0.28125, 0.21875, 0.21875], + [0.3125 , 0.0625 , 0.34375, 0.28125], + [0.4375 , 0.21875, 0.09375, 0.25 ], + [0.125 , 0.28125, 0.3125 , 0.28125] + ]), atol=1e-3, rtol=1e-3) + +
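
The rewritten entry point above is exercised only through the Click runner in tests/test_motif.py, so a short worked example of driving the new API directly may help. The sketch below is illustrative only: the file names are hypothetical, the assumption that `import fastermodiscolite` exposes the `tfmodisco` and `io` submodules follows the usage pattern of the deleted `modisco` script, and `fastermodiscolite.io.save_hdf5` is assumed to keep the signature of the old `modiscolite.io.save_hdf5` call shown in that script; the rename of io.py itself does not appear in this part of the diff.

import numpy as np
import fastermodiscolite

# Hypothetical input files; both arrays are expected in the
# (n_sequences, seq_len, 4) layout described in the TFMoDISco docstring.
one_hot = np.load("one_hot.npy").astype("float32")
hyp_contribs = np.load("hyp_contribs.npy").astype("float32")

pos_patterns, neg_patterns = fastermodiscolite.tfmodisco.TFMoDISco(
    one_hot=one_hot,
    hypothetical_contribs=hyp_contribs,
    max_seqlets_per_metacluster=2000,  # mirrors the -n value used in the test
    n_leiden_runs=50,
    pattern_type="both",   # "both", "pos", or "neg"
    num_cores=-1,
    verbose=True,
)

# Assumed to mirror the old modiscolite.io.save_hdf5(output, pos, neg, window)
# call from the deleted command-line script; not shown in this diff.
fastermodiscolite.io.save_hdf5("modisco_results.h5", pos_patterns, neg_patterns, 400)

From the shell, the equivalent run through the new console script declared in pyproject.toml would look roughly like `fastermodisco motifs -s one_hot.npz -a hyp_contribs.npz -n 2000 -w 200 -o modisco_results.h5 -v`, following the flag spelling exercised by the CliRunner invocation in tests/test_motif.py.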