diff --git a/config/config.yaml b/config/config.yaml index 6fad6d4..bd569c1 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -78,6 +78,19 @@ rcistarget_parameters: geneErnMethod: "aprox" # alternatively, exact but more computationally intense: "icistarget" geneErnMaxRank: 5000 +### MEME-AME - region-based motif enrichment analysis +# https://www.bioconductor.org/packages/release/bioc/html/RcisTarget.html +# resources: https://resources.aertslab.org/cistarget/ +meme_parameters: + genome_fasta: "resources/atacseq_pipeline/hg38/hg38.fa" + databases: + hocomocov13: "resources/db_meme/H13CORE_meme_format.meme" # get from https://hocomoco13.autosome.org/downloads_v13; TOOD: get via zenodo + use_length_matched_background: True # Set to True to use length-matched background, otherwise dinucleotide shuffled background will be used + ame_params: + method: fisher + scoring: avg + additional_options: "-evalue-report-threshold 0.05" + ### Enrichment plot # tool specific column names for aggregation, plotting & summaries @@ -124,6 +137,13 @@ column_names: effect_size: "NES" # NES combines statistical significance and effect size overlap: "nEnrGenes" term: "description" # a combination of the motif name and the motifAnnot_highConfCat entries + MEME_AME: + top_n: 25 + p_value: "p_value" + adj_pvalue: "adj_p_value" + effect_size: "FoldEnrich" + overlap: "TP" + term: "motif_ID" ##### AGGREGATE & SUMMARIZE ##### @@ -135,6 +155,7 @@ adjp_th: LOLA: 0.01 pycisTarget: 5 # keep results greater(!) than provided threshold RcisTarget: 5 # keep results greater(!) than provided threshold + MEME_AME: 0.05 # number of top terms per feature set within each group for all overview plots (adjusted p-value, effect-size and bubble-heatmap) top_terms_n: 5 diff --git a/workflow/Snakefile b/workflow/Snakefile index a82cdc4..07b818f 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -3,6 +3,7 @@ conda: "envs/global.yaml" # libraries +import pdb import yaml import pandas as pd import os @@ -62,6 +63,11 @@ pycistarget_db_dict = {k: v for k, v in pycistarget_db_dict.items() if v!=""} rcistarget_db_dict = config["rcistarget_parameters"]["databases"] rcistarget_db_dict = {k: v for k, v in rcistarget_db_dict.items() if v!=""} +# load MEME-ame databases dictionary and keep only non-empty +meme_ame_db_dict = config["meme_parameters"]["databases"] +meme_ame_db_dict = {k: v for k, v in meme_ame_db_dict.items() if v!=""} + + ##### set global variables result_path = os.path.join(config["result_path"], module_name) @@ -76,6 +82,13 @@ rule all: expand(os.path.join(result_path, '{background_region_set}', 'GREAT','genes.txt'), background_region_set=background_regions_dict.keys()), expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()), expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()), + # custom additions to get extra results from pycistarget + expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.motif_hits.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()), + expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.cistromes.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()), + expand(os.path.join(result_path, '{region_set}', 'MEME_AME', 'regions.fasta'), region_set=regions_dict.keys()), + expand(os.path.join(result_path, '{region_set}', 'MEME_AME','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=meme_ame_db_dict.keys()), + expand(os.path.join(result_path, '{region_set}', 'MEME_AME','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=meme_ame_db_dict.keys()), + # gene enrichment analyses for mapped region - ORA_GSEApy expand(os.path.join(result_path, '{region_set}', 'ORA_GSEApy','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=database_dict.keys()), expand(os.path.join(result_path, '{region_set}', 'ORA_GSEApy','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=database_dict.keys()), @@ -97,6 +110,7 @@ rule all: expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='GREAT', db=database_dict.keys()), expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='LOLA', db=lola_db_dict.keys()), expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='pycisTarget', db=pycistarget_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='MEME_AME', db=meme_ame_db_dict.keys()), expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='RcisTarget', db=rcistarget_db_dict.keys()), # config envs = expand(os.path.join(result_path,'envs','{env}.yaml'),env=['region_enrichment_analysis','gene_enrichment_analysis','visualization','pycisTarget','RcisTarget']), diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml new file mode 100644 index 0000000..e0af200 --- /dev/null +++ b/workflow/envs/bedtools.yaml @@ -0,0 +1,7 @@ +name: bedtools +channels: + - bioconda + - conda-forge +dependencies: + - bedtools + - python=3.9 \ No newline at end of file diff --git a/workflow/envs/meme.yaml b/workflow/envs/meme.yaml new file mode 100644 index 0000000..2fc5edf --- /dev/null +++ b/workflow/envs/meme.yaml @@ -0,0 +1,8 @@ +name: meme +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::meme=5.0.2 + - icu=58.2 + - python=3.6 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d68ee45..6e10ed0 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -63,7 +63,7 @@ def get_group_paths(wildcards): feature_sets = list(annot.index[annot["group"]==wildcards.group]) # for tool GREAT, LOLA or pycisTarget only consider region sets - if wildcards.tool=="GREAT" or wildcards.tool=="LOLA" or wildcards.tool=="pycisTarget": + if wildcards.tool=="GREAT" or wildcards.tool=="LOLA" or wildcards.tool=="pycisTarget" or wildcards.tool=="MEME_AME": feature_sets = [feature_set for feature_set in feature_sets if feature_set in regions_dict.keys()] if wildcards.tool=="ORA_GSEApy"or wildcards.tool=="RcisTarget": feature_sets = [feature_set for feature_set in feature_sets if feature_set in genes_dict.keys()] + [feature_set for feature_set in feature_sets if feature_set in regions_dict.keys()] diff --git a/workflow/rules/enrichment_analysis.smk b/workflow/rules/enrichment_analysis.smk index d53b97e..faedb0d 100644 --- a/workflow/rules/enrichment_analysis.smk +++ b/workflow/rules/enrichment_analysis.smk @@ -111,6 +111,8 @@ rule process_results_pycisTarget: motif_hdf5 = os.path.join(result_path,'{region_set}','pycisTarget','{database}','motif_enrichment_cistarget_{region_set}.hdf5'), output: motif_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.csv'), + hits_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.motif_hits.csv'), + cistrome_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.cistromes.csv'), threads: config.get("threads", 1) resources: mem_mb=config.get("mem", "16000"), @@ -120,7 +122,173 @@ rule process_results_pycisTarget: "logs/rules/process_results_pycisTarget_{region_set}_{database}.log" script: "../scripts/process_results_pycisTarget.py" + +# Testing begins for meme-ame +# preprocess regions BED file into FASTA file +rule regions_bed_to_fasta: + input: + bed = get_region_path, + genome = config["meme_parameters"]["genome_fasta"] # Path to the genome FASTA file + output: + fasta = os.path.join(result_path, '{region_set}', 'MEME_AME', 'regions.fasta') + params: + bedtools_exec = "bedtools" # Path to bedtools if not in PATH + threads: config.get("threads", 1) + resources: + mem_mb=config.get("mem", "4000"), + conda: + "../envs/bedtools.yaml" + log: + "logs/rules/regions_bed_to_fasta_{region_set}.log" + shell: + """ + {{ + {params.bedtools_exec} getfasta \ + -fi {input.genome} \ + -bed {input.bed} \ + -fo {output.fasta} + }} || {{ + echo "An error occurred during BED to FASTA conversion for regions"; touch {output.fasta}; exit 1; + }} + """ +# preprocess background BED file into FASTA file with length matching (conditional) +rule background_bed_to_fasta: + input: + bed = get_background_region_path, + genome = config["meme_parameters"]["genome_fasta"], # Path to the genome FASTA file + regions_bed = get_region_path # Input regions BED file for length distribution + output: + fasta = os.path.join(result_path, '{region_set}', 'MEME_AME', 'background.fasta'), + # region_length = os.path.join(result_path, '{region_set}', 'MEME_AME', 'region_lengths.txt'), + params: + output_dir = os.path.join(result_path, '{region_set}', 'MEME_AME'), + bedtools_exec = "bedtools", # Path to bedtools if not in PATH + awk_exec = "awk", # Path to awk if not in PATH + use_length_matching = config["meme_parameters"]["use_length_matched_background"] # Conditional flag + threads: config.get("threads", 1) + resources: + mem_mb=config.get("mem", "4000"), + conda: + "../envs/bedtools.yaml" + log: + "logs/rules/background_bed_to_fasta_{region_set}.log" + shell: + """ + {{ + if [ "{params.use_length_matching}" = "True" ]; then + + # Extract lengths of input regions + {params.awk_exec} '{{print $3 - $2}}' {input.regions_bed} > {params.output_dir}/region_lengths.txt + + # Generate background regions with matching lengths + {params.awk_exec} 'BEGIN {{ + srand(42); # Set random seed for reproducibility + lengthfile = "{params.output_dir}/region_lengths.txt"; + num_lengths = 0; + while ((getline len < lengthfile) > 0) {{ + lengths[num_lengths++] = len; # Store all lengths in an array + }} + close(lengthfile); + }} {{ + # Pick a random length from the array + random_idx = int(rand() * num_lengths); + region_length = lengths[random_idx]; + # Calculate new region based on midpoint and random length + midpoint = int(($2 + $3) / 2); + start = midpoint - int(region_length / 2); + end = start + region_length; + print $1"\t"start"\t"end; + }}' {input.bed} > $(dirname {output.fasta})/{wildcards.region_set}_matched_background.bed + + + # Convert matched background BED to FASTA + {params.bedtools_exec} getfasta \ + -fi {input.genome} \ + -bed $(dirname {output.fasta})/{wildcards.region_set}_matched_background.bed \ + -fo {output.fasta} + + + else + # Skip background FASTA generation (use dinucleotide shuffle in MEME-AME) + touch {output.fasta} + fi + }} || {{ + echo "An error occurred during BED to FASTA conversion for background"; touch {output.fasta}; exit 1; + }} + """ + +# perform motif enrichment analysis using MEME-AME +rule region_motif_enrichment_analysis_MEME_AME: + input: + regions_fasta = os.path.join(result_path, '{region_set}', 'MEME_AME','regions.fasta'), + background_fasta = os.path.join(result_path, '{region_set}', 'MEME_AME','background.fasta'), + database = lambda wildcards: meme_ame_db_dict[wildcards.database], + output: + ame_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.tsv'), + ame_html = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.html'), + params: + ame_exec = "ame", # Path to the AME executable if not in PATH + ame_method = config["meme_parameters"]["ame_params"]["method"], # AME method + ame_scoring = config["meme_parameters"]["ame_params"]["scoring"], # AME scoring method + ame_options = config["meme_parameters"]["ame_params"]["additional_options"], # Additional AME options + use_length_matching = config["meme_parameters"]["use_length_matched_background"] # Conditional flag + threads: config.get("threads", 1) + resources: + mem_mb=config.get("mem", "16000"), + conda: + "../envs/meme.yaml", + log: + "logs/rules/region_motif_enrichment_analysis_MEME_AME_{region_set}_{database}.log" + shell: + """ + {{ + if [ "{params.use_length_matching}" = "True" ]; then + # Use the generated background FASTA + {params.ame_exec} \ + --oc $(dirname {output.ame_results}) \ + --control {input.background_fasta} \ + --method {params.ame_method} \ + --scoring {params.ame_scoring} \ + {params.ame_options} \ + {input.regions_fasta} \ + {input.database} \ + # > {output.ame_results} \ + # && mv $(dirname {output.ame_results})/ame.html {output.ame_html} + else + # Use dinucleotide shuffle (default behavior of AME) + {params.ame_exec} \ + --oc $(dirname {output.ame_results}) \ + --control --shuffle-- \ + --method {params.ame_method} \ + --scoring {params.ame_scoring} \ + {params.ame_options} \ + {input.regions_fasta} \ + {input.database} \ + # > {output.ame_results} \ + # && mv $(dirname {output.ame_results})/ame.html {output.ame_html} + fi + }} || {{ + echo "An error occurred during the MEME-AME analysis"; touch {output.ame_results} {output.ame_html}; exit 0; + }} + """ + +rule postprocess_meme_ame_results: + input: + ame_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.tsv') + output: + aggregated_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', '{region_set}_{database}.csv') + threads: config.get("threads", 1) + resources: + mem_mb=config.get("mem", "4000"), + # conda: + # "../envs/python.yaml" + log: + "logs/rules/postprocess_meme_ame_results_{region_set}_{database}.log" + script: + "../scripts/postprocess_meme_ame_results.py" + + # performs gene over-represenation analysis (ORA) using GSEApy rule gene_ORA_GSEApy: input: diff --git a/workflow/scripts/aggregate.py b/workflow/scripts/aggregate.py index f995965..d030d96 100644 --- a/workflow/scripts/aggregate.py +++ b/workflow/scripts/aggregate.py @@ -47,7 +47,7 @@ # concatenate all results into one results dataframe result_df = pd.concat(results_list, axis=0) -# save all enirchment results +# save all enrichment results result_df.to_csv(results_all_path) # find union of statistically significant terms @@ -61,3 +61,34 @@ # save filtered enirchment results by significance result_sig_df.to_csv(results_sig_path) + +# If pycistarget, then also save motif hits +if tool == "pycisTarget": + hits_list = [] + cistrome_list = [] + + for result_path in result_paths: + if os.stat(result_path).st_size != 0: + tmp_name = os.path.basename(result_path).replace("_{}.csv".format(db),"") + + hits_path = result_path.replace(".csv", ".motif_hits.csv") + cistrome_path = result_path.replace(".csv", ".cistromes.csv") + + # add hits to results + tmp_res = pd.read_csv(hits_path) + tmp_res['name'] = tmp_name + hits_list.append(tmp_res) + + # add cistrome to results + tmp_res = pd.read_csv(cistrome_path) + tmp_res['name'] = tmp_name + cistrome_list.append(tmp_res) + + # concatenate all results into one results dataframe + hits_df = pd.concat(hits_list, axis=0) + hits_df.to_csv(results_all_path.replace(".csv", ".motif_hits.csv")) + + cistrome_df = pd.concat(cistrome_list, axis=0) + cistrome_df.to_csv(results_all_path.replace(".csv", ".cistromes.csv")) + + diff --git a/workflow/scripts/postprocess_meme_ame_results.py b/workflow/scripts/postprocess_meme_ame_results.py new file mode 100644 index 0000000..56f2ec3 --- /dev/null +++ b/workflow/scripts/postprocess_meme_ame_results.py @@ -0,0 +1,52 @@ +import pandas as pd +import numpy as np +import os +import sys + + +########### FUNCTIONS ########### +def check_empty_tsv_minus_comments(file_path, comment_char='#'): + with open(file_path, 'r') as file: + for line in file: + # Skip lines that start with the comment character + if not line.strip().startswith(comment_char) and line.strip(): + return False # Found non-comment, non-empty line + return True # All lines were either comments or empty + + +#### Process MEME-AME results to aggregate and clean the data. +input_file = snakemake.input['ame_results'] +output_file = snakemake.output['aggregated_results'] + + +# Ensure output directory exists +os.makedirs(os.path.dirname(output_file), exist_ok=True) + +# quit early if file is empty +if check_empty_tsv_minus_comments(input_file): + open(output_file, 'w').close() + quit() +else: + # Process the MEME-AME results + try: + # Read the file, skipping blank lines and comments + df = pd.read_csv(input_file, sep='\t', comment='#', skip_blank_lines=True) + + # Drop any completely empty rows + df.dropna(how='all', inplace=True) + + # Calculate effect size (i.e. log2 fold enrich) + df["FoldEnrich"] = df["%TP"] / df["%FP"] + + # Clean up colum names so that it's R friendly + df.columns = df.columns.str.replace("-", "_").str.replace("%", "perc") + + # Save the cleaned and aggregated results as a CSV + df.to_csv(output_file, index=False) + + print(f"Processed MEME-AME results saved to {output_file}") + + except Exception as e: + print(f"Error processing MEME-AME results: {e}") + sys.exit(1) + diff --git a/workflow/scripts/process_results_pycisTarget.py b/workflow/scripts/process_results_pycisTarget.py index d203a3d..b99507d 100644 --- a/workflow/scripts/process_results_pycisTarget.py +++ b/workflow/scripts/process_results_pycisTarget.py @@ -1,6 +1,6 @@ - # libraries import os +import pandas as pd import pycistarget from pycistarget.input_output import read_hdf5 @@ -11,6 +11,8 @@ # output motif_csv_path = snakemake.output['motif_csv'] +hits_csv_path = snakemake.output['hits_csv'] +cistrome_csv_path = snakemake.output['cistrome_csv'] # parameters region_set_name = snakemake.wildcards["region_set"] @@ -19,6 +21,8 @@ # quit early if file is empty if os.path.getsize(motif_hdf5_path) == 0: open(motif_csv_path, 'w').close() + open(hits_csv_path, 'w').close() + open(cistrome_csv_path, 'w').close() quit() # load pycisTarget results from hdf5 @@ -34,3 +38,24 @@ # save motif enrichments as CSV for downstream processing and plotting results_df.to_csv(motif_csv_path) + + +# Save motif hits +motifs_df = pd.DataFrame(results[region_set_name].motif_hits) +motifs_df["database"] = motifs_df["database"].apply( + lambda x: ";".join(x) if type(x) == list else "" +) +motifs_df["region_set"] = motifs_df["region_set"].apply( + lambda x: ";".join(x) if type(x) == list else "" +) +motifs_df.to_csv(hits_csv_path) + +# Save cistrome hits +cistrome_df = pd.DataFrame(results[region_set_name].cistromes) +cistrome_df["database"] = cistrome_df["database"].apply( + lambda x: ";".join(x) if type(x) == list else "" +) +cistrome_df["region_set"] = cistrome_df["region_set"].apply( + lambda x: ";".join(x) if type(x) == list else "" +) +cistrome_df.to_csv(cistrome_csv_path) diff --git a/workflow/scripts/region_enrichment_analysis_GREAT.R b/workflow/scripts/region_enrichment_analysis_GREAT.R index 79f0fb7..d2fa52b 100644 --- a/workflow/scripts/region_enrichment_analysis_GREAT.R +++ b/workflow/scripts/region_enrichment_analysis_GREAT.R @@ -18,8 +18,8 @@ annot_terms_with_features <- function(res, df) { term <- df$id[i] # Check p-value thresholds - if (sum(("p_adjust" %in% names(df) && df$p_adjust[i] > 0.1), - ("p_adjust_hyper" %in% names(df) && df$p_adjust_hyper[i] > 0.1)) > 1) { + if (sum(("p_adjust" %in% names(df) && df$p_adjust[i] > 0.2), + ("p_adjust_hyper" %in% names(df) && df$p_adjust_hyper[i] > 0.2)) > 1) { regions[i] <- "" genes[i] <- "" } else {