epigen · fwzhao · Apr 1, 2025 · Apr 1, 2025 · Apr 1, 2025 · Apr 2, 2025
diff --git a/config/config.yaml b/config/config.yaml
@@ -78,6 +78,19 @@ rcistarget_parameters:
     geneErnMethod: "aprox" # alternatively, exact but more computationally intense: "icistarget"
     geneErnMaxRank: 5000
 
+### MEME-AME - region-based motif enrichment analysis
+# https://www.bioconductor.org/packages/release/bioc/html/RcisTarget.html
+# resources: https://resources.aertslab.org/cistarget/
+meme_parameters:
+    genome_fasta: "resources/atacseq_pipeline/hg38/hg38.fa"
+    databases:
+        hocomocov13: "resources/db_meme/H13CORE_meme_format.meme"  # get from https://hocomoco13.autosome.org/downloads_v13; TOOD: get via zenodo
+    use_length_matched_background: True  # Set to True to use length-matched background, otherwise dinucleotide shuffled background will be used
+    ame_params:
+        method: fisher
+        scoring: avg
+        additional_options: "-evalue-report-threshold 0.05"
+
 ### Enrichment plot
 
 # tool specific column names for aggregation, plotting & summaries
@@ -124,6 +137,13 @@ column_names:
         effect_size: "NES" # NES combines statistical significance and effect size
         overlap: "nEnrGenes"
         term: "description" # a combination of the motif name and the motifAnnot_highConfCat entries
+    MEME_AME:
+        top_n: 25
+        p_value: "p_value"
+        adj_pvalue: "adj_p_value"
+        effect_size: "FoldEnrich"
+        overlap: "TP"
+        term: "motif_ID"
 
 ##### AGGREGATE & SUMMARIZE #####
 
@@ -135,6 +155,7 @@ adjp_th:
     LOLA: 0.01
     pycisTarget: 5 # keep results greater(!) than provided threshold
     RcisTarget: 5 # keep results greater(!) than provided threshold
+    MEME_AME: 0.05
 
 # number of top terms per feature set within each group for all overview plots (adjusted p-value, effect-size and bubble-heatmap)
 top_terms_n: 5

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -3,6 +3,7 @@
 conda: "envs/global.yaml"
 
 # libraries
+import pdb
 import yaml
 import pandas as pd
 import os
@@ -62,6 +63,11 @@ pycistarget_db_dict = {k: v for k, v in pycistarget_db_dict.items() if v!=""}
 rcistarget_db_dict = config["rcistarget_parameters"]["databases"]
 rcistarget_db_dict = {k: v for k, v in rcistarget_db_dict.items() if v!=""}
 
+# load MEME-ame databases dictionary and keep only non-empty
+meme_ame_db_dict = config["meme_parameters"]["databases"]
+meme_ame_db_dict = {k: v for k, v in meme_ame_db_dict.items() if v!=""}
+
+
 ##### set global variables
 result_path = os.path.join(config["result_path"], module_name)
 
@@ -76,6 +82,13 @@ rule all:
         expand(os.path.join(result_path, '{background_region_set}', 'GREAT','genes.txt'), background_region_set=background_regions_dict.keys()),
         expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
         expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
+        # custom additions to get extra results from pycistarget
+        expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.motif_hits.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
+        expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.cistromes.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
+        expand(os.path.join(result_path, '{region_set}', 'MEME_AME', 'regions.fasta'), region_set=regions_dict.keys()),
+        expand(os.path.join(result_path, '{region_set}', 'MEME_AME','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=meme_ame_db_dict.keys()),
+        expand(os.path.join(result_path, '{region_set}', 'MEME_AME','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=meme_ame_db_dict.keys()),
+
         # gene enrichment analyses for mapped region - ORA_GSEApy
         expand(os.path.join(result_path, '{region_set}', 'ORA_GSEApy','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=database_dict.keys()),
         expand(os.path.join(result_path, '{region_set}', 'ORA_GSEApy','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=database_dict.keys()),
@@ -97,6 +110,7 @@ rule all:
         expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='GREAT', db=database_dict.keys()),
         expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='LOLA', db=lola_db_dict.keys()),
         expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='pycisTarget', db=pycistarget_db_dict.keys()),
+        expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='MEME_AME', db=meme_ame_db_dict.keys()),
         expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='RcisTarget', db=rcistarget_db_dict.keys()),
         # config
         envs = expand(os.path.join(result_path,'envs','{env}.yaml'),env=['region_enrichment_analysis','gene_enrichment_analysis','visualization','pycisTarget','RcisTarget']),

diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml
@@ -0,0 +1,7 @@
+name: bedtools
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - bedtools
+  - python=3.9
diff --git a/workflow/envs/meme.yaml b/workflow/envs/meme.yaml
@@ -0,0 +1,8 @@
+name: meme
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - bioconda::meme=5.0.2
+  - icu=58.2
+  - python=3.6
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -63,7 +63,7 @@ def get_group_paths(wildcards):
     feature_sets = list(annot.index[annot["group"]==wildcards.group])
 
     # for tool GREAT, LOLA or pycisTarget only consider region sets
-    if wildcards.tool=="GREAT" or wildcards.tool=="LOLA" or wildcards.tool=="pycisTarget":
+    if wildcards.tool=="GREAT" or wildcards.tool=="LOLA" or wildcards.tool=="pycisTarget" or wildcards.tool=="MEME_AME":
         feature_sets = [feature_set for feature_set in feature_sets if feature_set in regions_dict.keys()]
     if wildcards.tool=="ORA_GSEApy"or wildcards.tool=="RcisTarget":
         feature_sets = [feature_set for feature_set in feature_sets if feature_set in genes_dict.keys()] + [feature_set for feature_set in feature_sets if feature_set in regions_dict.keys()]

diff --git a/workflow/rules/enrichment_analysis.smk b/workflow/rules/enrichment_analysis.smk
@@ -111,6 +111,8 @@ rule process_results_pycisTarget:
         motif_hdf5 = os.path.join(result_path,'{region_set}','pycisTarget','{database}','motif_enrichment_cistarget_{region_set}.hdf5'),
     output:
         motif_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.csv'),
+        hits_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.motif_hits.csv'),
+        cistrome_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.cistromes.csv'),
     threads: config.get("threads", 1)
     resources:
         mem_mb=config.get("mem", "16000"),
@@ -120,7 +122,173 @@ rule process_results_pycisTarget:
         "logs/rules/process_results_pycisTarget_{region_set}_{database}.log"
     script:
         "../scripts/process_results_pycisTarget.py"
+
+# Testing begins for meme-ame
+# preprocess regions BED file into FASTA file
+rule regions_bed_to_fasta:
+    input:
+        bed = get_region_path,
+        genome = config["meme_parameters"]["genome_fasta"]  # Path to the genome FASTA file
+    output:
+        fasta = os.path.join(result_path, '{region_set}', 'MEME_AME', 'regions.fasta')
+    params:
+        bedtools_exec = "bedtools"  # Path to bedtools if not in PATH
+    threads: config.get("threads", 1)
+    resources:
+        mem_mb=config.get("mem", "4000"),
+    conda:
+        "../envs/bedtools.yaml"
+    log:
+        "logs/rules/regions_bed_to_fasta_{region_set}.log"
+    shell:
+        """
+        {{
+            {params.bedtools_exec} getfasta \
+                -fi {input.genome} \
+                -bed {input.bed} \
+                -fo {output.fasta}
+        }} || {{
+            echo "An error occurred during BED to FASTA conversion for regions"; touch {output.fasta}; exit 1;
+        }}
+        """
+# preprocess background BED file into FASTA file with length matching (conditional)
+rule background_bed_to_fasta:
+    input:
+        bed = get_background_region_path,
+        genome = config["meme_parameters"]["genome_fasta"],  # Path to the genome FASTA file
+        regions_bed = get_region_path  # Input regions BED file for length distribution
+    output:
+        fasta = os.path.join(result_path, '{region_set}', 'MEME_AME', 'background.fasta'),
+        # region_length = os.path.join(result_path, '{region_set}', 'MEME_AME', 'region_lengths.txt'),
+    params:
+        output_dir = os.path.join(result_path, '{region_set}', 'MEME_AME'),
+        bedtools_exec = "bedtools",  # Path to bedtools if not in PATH
+        awk_exec = "awk",  # Path to awk if not in PATH
+        use_length_matching = config["meme_parameters"]["use_length_matched_background"]  # Conditional flag
+    threads: config.get("threads", 1)
+    resources:
+        mem_mb=config.get("mem", "4000"),
+    conda:
+        "../envs/bedtools.yaml"
+    log:
+        "logs/rules/background_bed_to_fasta_{region_set}.log"
+    shell:
+        """
+        {{
+            if [ "{params.use_length_matching}" = "True" ]; then
+
+                # Extract lengths of input regions
+                {params.awk_exec} '{{print $3 - $2}}' {input.regions_bed} > {params.output_dir}/region_lengths.txt
+
+                # Generate background regions with matching lengths
+                {params.awk_exec} 'BEGIN {{
+                srand(42);  # Set random seed for reproducibility
+                lengthfile = "{params.output_dir}/region_lengths.txt";
+                num_lengths = 0;
+                while ((getline len < lengthfile) > 0) {{
+                    lengths[num_lengths++] = len;  # Store all lengths in an array
+                }}
+                close(lengthfile);
+                }} {{
+                # Pick a random length from the array
+                random_idx = int(rand() * num_lengths);
+                region_length = lengths[random_idx];
 
+                # Calculate new region based on midpoint and random length
+                midpoint = int(($2 + $3) / 2);
+                start = midpoint - int(region_length / 2);
+                end = start + region_length;
+                print $1"\t"start"\t"end;
+                }}' {input.bed} > $(dirname {output.fasta})/{wildcards.region_set}_matched_background.bed
+
+
+                # Convert matched background BED to FASTA
+                {params.bedtools_exec} getfasta \
+                    -fi {input.genome} \
+                    -bed $(dirname {output.fasta})/{wildcards.region_set}_matched_background.bed \
+                    -fo {output.fasta}
+
+
+            else
+                # Skip background FASTA generation (use dinucleotide shuffle in MEME-AME)
+                touch {output.fasta}
+            fi
+        }} || {{
+            echo "An error occurred during BED to FASTA conversion for background"; touch {output.fasta}; exit 1;
+        }}
+        """
+
+# perform motif enrichment analysis using MEME-AME
+rule region_motif_enrichment_analysis_MEME_AME:
+    input:
+        regions_fasta = os.path.join(result_path, '{region_set}', 'MEME_AME','regions.fasta'),
+        background_fasta = os.path.join(result_path, '{region_set}', 'MEME_AME','background.fasta'),
+        database = lambda wildcards: meme_ame_db_dict[wildcards.database],
+    output:
+        ame_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.tsv'),
+        ame_html = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.html'),
+    params:
+        ame_exec = "ame",  # Path to the AME executable if not in PATH
+        ame_method = config["meme_parameters"]["ame_params"]["method"],  # AME method
+        ame_scoring = config["meme_parameters"]["ame_params"]["scoring"],  # AME scoring method
+        ame_options = config["meme_parameters"]["ame_params"]["additional_options"],  # Additional AME options
+        use_length_matching = config["meme_parameters"]["use_length_matched_background"]  # Conditional flag
+    threads: config.get("threads", 1)
+    resources:
+        mem_mb=config.get("mem", "16000"),
+    conda:
+        "../envs/meme.yaml",
+    log:
+        "logs/rules/region_motif_enrichment_analysis_MEME_AME_{region_set}_{database}.log"
+    shell:
+        """
+        {{
+            if [ "{params.use_length_matching}" = "True" ]; then
+                # Use the generated background FASTA
+                {params.ame_exec} \
+                    --oc $(dirname {output.ame_results}) \
+                    --control {input.background_fasta} \
+                    --method {params.ame_method} \
+                    --scoring {params.ame_scoring} \
+                    {params.ame_options} \
+                    {input.regions_fasta} \
+                    {input.database} \
+                    # > {output.ame_results} \
+                    # && mv $(dirname {output.ame_results})/ame.html {output.ame_html}
+            else
+                # Use dinucleotide shuffle (default behavior of AME)
+                {params.ame_exec} \
+                    --oc $(dirname {output.ame_results}) \
+                    --control --shuffle-- \
+                    --method {params.ame_method} \
+                    --scoring {params.ame_scoring} \
+                    {params.ame_options} \
+                    {input.regions_fasta} \
+                    {input.database} \
+                    # > {output.ame_results} \
+                    # && mv $(dirname {output.ame_results})/ame.html {output.ame_html}
+            fi
+        }} || {{
+            echo "An error occurred during the MEME-AME analysis"; touch {output.ame_results} {output.ame_html}; exit 0;
+        }}
+        """
+
+rule postprocess_meme_ame_results:
+    input:
+        ame_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.tsv')
+    output:
+        aggregated_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', '{region_set}_{database}.csv')
+    threads: config.get("threads", 1)
+    resources:
+        mem_mb=config.get("mem", "4000"),
+    # conda:
+    #     "../envs/python.yaml"
+    log:
+        "logs/rules/postprocess_meme_ame_results_{region_set}_{database}.log"
+    script:
+        "../scripts/postprocess_meme_ame_results.py"
+
+
 # performs gene over-represenation analysis (ORA) using GSEApy
 rule gene_ORA_GSEApy:
     input:

diff --git a/workflow/scripts/aggregate.py b/workflow/scripts/aggregate.py
@@ -47,7 +47,7 @@
 # concatenate all results into one results dataframe
 result_df = pd.concat(results_list, axis=0)
 
-# save all enirchment results
+# save all enrichment results
 result_df.to_csv(results_all_path)
 
 # find union of statistically significant terms
@@ -61,3 +61,34 @@
 
 # save filtered enirchment results by significance
 result_sig_df.to_csv(results_sig_path)
+
+# If pycistarget, then also save motif hits
+if tool == "pycisTarget":
+    hits_list = []
+    cistrome_list = []
+
+    for result_path in result_paths:
+        if os.stat(result_path).st_size != 0:
+            tmp_name = os.path.basename(result_path).replace("_{}.csv".format(db),"")
+
+            hits_path = result_path.replace(".csv", ".motif_hits.csv")
+            cistrome_path = result_path.replace(".csv", ".cistromes.csv")
+
+            # add hits to results
+            tmp_res = pd.read_csv(hits_path)
+            tmp_res['name'] = tmp_name
+            hits_list.append(tmp_res)
+
+            # add cistrome to results
+            tmp_res = pd.read_csv(cistrome_path)
+            tmp_res['name'] = tmp_name
+            cistrome_list.append(tmp_res)
+
+    # concatenate all results into one results dataframe
+    hits_df = pd.concat(hits_list, axis=0)
+    hits_df.to_csv(results_all_path.replace(".csv", ".motif_hits.csv"))
+
+    cistrome_df = pd.concat(cistrome_list, axis=0)
+    cistrome_df.to_csv(results_all_path.replace(".csv", ".cistromes.csv"))
+
+
diff --git a/workflow/scripts/postprocess_meme_ame_results.py b/workflow/scripts/postprocess_meme_ame_results.py
@@ -0,0 +1,52 @@
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+
+########### FUNCTIONS ###########
+def check_empty_tsv_minus_comments(file_path, comment_char='#'):
+    with open(file_path, 'r') as file:
+        for line in file:
+            # Skip lines that start with the comment character
+            if not line.strip().startswith(comment_char) and line.strip():
+                return False  # Found non-comment, non-empty line
+    return True  # All lines were either comments or empty
+
+
+#### Process MEME-AME results to aggregate and clean the data.
+input_file = snakemake.input['ame_results']
+output_file = snakemake.output['aggregated_results']
+
+
+# Ensure output directory exists
+os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+# quit early if file is empty
+if check_empty_tsv_minus_comments(input_file):
+    open(output_file, 'w').close()
+    quit()
+else:
+    # Process the MEME-AME results
+    try:
+        # Read the file, skipping blank lines and comments
+        df = pd.read_csv(input_file, sep='\t', comment='#', skip_blank_lines=True)
+
+        # Drop any completely empty rows
+        df.dropna(how='all', inplace=True)
+
+        # Calculate effect size (i.e. log2 fold enrich)
+        df["FoldEnrich"] = df["%TP"] / df["%FP"]
+
+        # Clean up colum names so that it's R friendly
+        df.columns = df.columns.str.replace("-", "_").str.replace("%", "perc")
+
+        # Save the cleaned and aggregated results as a CSV
+        df.to_csv(output_file, index=False)
+
+        print(f"Processed MEME-AME results saved to {output_file}")
+
+    except Exception as e:
+        print(f"Error processing MEME-AME results: {e}")
+        sys.exit(1)
+