Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,19 @@ rcistarget_parameters:
geneErnMethod: "aprox" # alternatively, exact but more computationally intense: "icistarget"
geneErnMaxRank: 5000

### MEME-AME - region-based motif enrichment analysis
# https://www.bioconductor.org/packages/release/bioc/html/RcisTarget.html
# resources: https://resources.aertslab.org/cistarget/
meme_parameters:
genome_fasta: "resources/atacseq_pipeline/hg38/hg38.fa"
databases:
hocomocov13: "resources/db_meme/H13CORE_meme_format.meme" # get from https://hocomoco13.autosome.org/downloads_v13; TOOD: get via zenodo
use_length_matched_background: True # Set to True to use length-matched background, otherwise dinucleotide shuffled background will be used
ame_params:
method: fisher
scoring: avg
additional_options: "-evalue-report-threshold 0.05"

### Enrichment plot

# tool specific column names for aggregation, plotting & summaries
Expand Down Expand Up @@ -124,6 +137,13 @@ column_names:
effect_size: "NES" # NES combines statistical significance and effect size
overlap: "nEnrGenes"
term: "description" # a combination of the motif name and the motifAnnot_highConfCat entries
MEME_AME:
top_n: 25
p_value: "p_value"
adj_pvalue: "adj_p_value"
effect_size: "FoldEnrich"
overlap: "TP"
term: "motif_ID"

##### AGGREGATE & SUMMARIZE #####

Expand All @@ -135,6 +155,7 @@ adjp_th:
LOLA: 0.01
pycisTarget: 5 # keep results greater(!) than provided threshold
RcisTarget: 5 # keep results greater(!) than provided threshold
MEME_AME: 0.05

# number of top terms per feature set within each group for all overview plots (adjusted p-value, effect-size and bubble-heatmap)
top_terms_n: 5
Expand Down
14 changes: 14 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
conda: "envs/global.yaml"

# libraries
import pdb
import yaml
import pandas as pd
import os
Expand Down Expand Up @@ -62,6 +63,11 @@ pycistarget_db_dict = {k: v for k, v in pycistarget_db_dict.items() if v!=""}
rcistarget_db_dict = config["rcistarget_parameters"]["databases"]
rcistarget_db_dict = {k: v for k, v in rcistarget_db_dict.items() if v!=""}

# load MEME-ame databases dictionary and keep only non-empty
meme_ame_db_dict = config["meme_parameters"]["databases"]
meme_ame_db_dict = {k: v for k, v in meme_ame_db_dict.items() if v!=""}


##### set global variables
result_path = os.path.join(config["result_path"], module_name)

Expand All @@ -76,6 +82,13 @@ rule all:
expand(os.path.join(result_path, '{background_region_set}', 'GREAT','genes.txt'), background_region_set=background_regions_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
# custom additions to get extra results from pycistarget
expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.motif_hits.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'pycisTarget','{db}','{region_set}_{db}.cistromes.csv'), region_set=regions_dict.keys(), db=pycistarget_db_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'MEME_AME', 'regions.fasta'), region_set=regions_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'MEME_AME','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=meme_ame_db_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'MEME_AME','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=meme_ame_db_dict.keys()),

# gene enrichment analyses for mapped region - ORA_GSEApy
expand(os.path.join(result_path, '{region_set}', 'ORA_GSEApy','{db}','{region_set}_{db}.csv'), region_set=regions_dict.keys(), db=database_dict.keys()),
expand(os.path.join(result_path, '{region_set}', 'ORA_GSEApy','{db}','{region_set}_{db}.png'), region_set=regions_dict.keys(), db=database_dict.keys()),
Expand All @@ -97,6 +110,7 @@ rule all:
expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='GREAT', db=database_dict.keys()),
expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='LOLA', db=lola_db_dict.keys()),
expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='pycisTarget', db=pycistarget_db_dict.keys()),
expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='MEME_AME', db=meme_ame_db_dict.keys()),
expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='RcisTarget', db=rcistarget_db_dict.keys()),
# config
envs = expand(os.path.join(result_path,'envs','{env}.yaml'),env=['region_enrichment_analysis','gene_enrichment_analysis','visualization','pycisTarget','RcisTarget']),
Expand Down
7 changes: 7 additions & 0 deletions workflow/envs/bedtools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: bedtools
channels:
- bioconda
- conda-forge
dependencies:
- bedtools
- python=3.9
8 changes: 8 additions & 0 deletions workflow/envs/meme.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: meme
channels:
- bioconda
- conda-forge
dependencies:
- bioconda::meme=5.0.2
- icu=58.2
- python=3.6
2 changes: 1 addition & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def get_group_paths(wildcards):
feature_sets = list(annot.index[annot["group"]==wildcards.group])

# for tool GREAT, LOLA or pycisTarget only consider region sets
if wildcards.tool=="GREAT" or wildcards.tool=="LOLA" or wildcards.tool=="pycisTarget":
if wildcards.tool=="GREAT" or wildcards.tool=="LOLA" or wildcards.tool=="pycisTarget" or wildcards.tool=="MEME_AME":
feature_sets = [feature_set for feature_set in feature_sets if feature_set in regions_dict.keys()]
if wildcards.tool=="ORA_GSEApy"or wildcards.tool=="RcisTarget":
feature_sets = [feature_set for feature_set in feature_sets if feature_set in genes_dict.keys()] + [feature_set for feature_set in feature_sets if feature_set in regions_dict.keys()]
Expand Down
168 changes: 168 additions & 0 deletions workflow/rules/enrichment_analysis.smk
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ rule process_results_pycisTarget:
motif_hdf5 = os.path.join(result_path,'{region_set}','pycisTarget','{database}','motif_enrichment_cistarget_{region_set}.hdf5'),
output:
motif_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.csv'),
hits_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.motif_hits.csv'),
cistrome_csv = os.path.join(result_path,'{region_set}','pycisTarget','{database}','{region_set}_{database}.cistromes.csv'),
threads: config.get("threads", 1)
resources:
mem_mb=config.get("mem", "16000"),
Expand All @@ -120,7 +122,173 @@ rule process_results_pycisTarget:
"logs/rules/process_results_pycisTarget_{region_set}_{database}.log"
script:
"../scripts/process_results_pycisTarget.py"

# Testing begins for meme-ame
# preprocess regions BED file into FASTA file
rule regions_bed_to_fasta:
input:
bed = get_region_path,
genome = config["meme_parameters"]["genome_fasta"] # Path to the genome FASTA file
output:
fasta = os.path.join(result_path, '{region_set}', 'MEME_AME', 'regions.fasta')
params:
bedtools_exec = "bedtools" # Path to bedtools if not in PATH
threads: config.get("threads", 1)
resources:
mem_mb=config.get("mem", "4000"),
conda:
"../envs/bedtools.yaml"
log:
"logs/rules/regions_bed_to_fasta_{region_set}.log"
shell:
"""
{{
{params.bedtools_exec} getfasta \
-fi {input.genome} \
-bed {input.bed} \
-fo {output.fasta}
}} || {{
echo "An error occurred during BED to FASTA conversion for regions"; touch {output.fasta}; exit 1;
}}
"""
# preprocess background BED file into FASTA file with length matching (conditional)
rule background_bed_to_fasta:
input:
bed = get_background_region_path,
genome = config["meme_parameters"]["genome_fasta"], # Path to the genome FASTA file
regions_bed = get_region_path # Input regions BED file for length distribution
output:
fasta = os.path.join(result_path, '{region_set}', 'MEME_AME', 'background.fasta'),
# region_length = os.path.join(result_path, '{region_set}', 'MEME_AME', 'region_lengths.txt'),
params:
output_dir = os.path.join(result_path, '{region_set}', 'MEME_AME'),
bedtools_exec = "bedtools", # Path to bedtools if not in PATH
awk_exec = "awk", # Path to awk if not in PATH
use_length_matching = config["meme_parameters"]["use_length_matched_background"] # Conditional flag
threads: config.get("threads", 1)
resources:
mem_mb=config.get("mem", "4000"),
conda:
"../envs/bedtools.yaml"
log:
"logs/rules/background_bed_to_fasta_{region_set}.log"
shell:
"""
{{
if [ "{params.use_length_matching}" = "True" ]; then

# Extract lengths of input regions
{params.awk_exec} '{{print $3 - $2}}' {input.regions_bed} > {params.output_dir}/region_lengths.txt

# Generate background regions with matching lengths
{params.awk_exec} 'BEGIN {{
srand(42); # Set random seed for reproducibility
lengthfile = "{params.output_dir}/region_lengths.txt";
num_lengths = 0;
while ((getline len < lengthfile) > 0) {{
lengths[num_lengths++] = len; # Store all lengths in an array
}}
close(lengthfile);
}} {{
# Pick a random length from the array
random_idx = int(rand() * num_lengths);
region_length = lengths[random_idx];

# Calculate new region based on midpoint and random length
midpoint = int(($2 + $3) / 2);
start = midpoint - int(region_length / 2);
end = start + region_length;
print $1"\t"start"\t"end;
}}' {input.bed} > $(dirname {output.fasta})/{wildcards.region_set}_matched_background.bed


# Convert matched background BED to FASTA
{params.bedtools_exec} getfasta \
-fi {input.genome} \
-bed $(dirname {output.fasta})/{wildcards.region_set}_matched_background.bed \
-fo {output.fasta}


else
# Skip background FASTA generation (use dinucleotide shuffle in MEME-AME)
touch {output.fasta}
fi
}} || {{
echo "An error occurred during BED to FASTA conversion for background"; touch {output.fasta}; exit 1;
}}
"""

# perform motif enrichment analysis using MEME-AME
rule region_motif_enrichment_analysis_MEME_AME:
input:
regions_fasta = os.path.join(result_path, '{region_set}', 'MEME_AME','regions.fasta'),
background_fasta = os.path.join(result_path, '{region_set}', 'MEME_AME','background.fasta'),
database = lambda wildcards: meme_ame_db_dict[wildcards.database],
output:
ame_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.tsv'),
ame_html = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.html'),
params:
ame_exec = "ame", # Path to the AME executable if not in PATH
ame_method = config["meme_parameters"]["ame_params"]["method"], # AME method
ame_scoring = config["meme_parameters"]["ame_params"]["scoring"], # AME scoring method
ame_options = config["meme_parameters"]["ame_params"]["additional_options"], # Additional AME options
use_length_matching = config["meme_parameters"]["use_length_matched_background"] # Conditional flag
threads: config.get("threads", 1)
resources:
mem_mb=config.get("mem", "16000"),
conda:
"../envs/meme.yaml",
log:
"logs/rules/region_motif_enrichment_analysis_MEME_AME_{region_set}_{database}.log"
shell:
"""
{{
if [ "{params.use_length_matching}" = "True" ]; then
# Use the generated background FASTA
{params.ame_exec} \
--oc $(dirname {output.ame_results}) \
--control {input.background_fasta} \
--method {params.ame_method} \
--scoring {params.ame_scoring} \
{params.ame_options} \
{input.regions_fasta} \
{input.database} \
# > {output.ame_results} \
# && mv $(dirname {output.ame_results})/ame.html {output.ame_html}
else
# Use dinucleotide shuffle (default behavior of AME)
{params.ame_exec} \
--oc $(dirname {output.ame_results}) \
--control --shuffle-- \
--method {params.ame_method} \
--scoring {params.ame_scoring} \
{params.ame_options} \
{input.regions_fasta} \
{input.database} \
# > {output.ame_results} \
# && mv $(dirname {output.ame_results})/ame.html {output.ame_html}
fi
}} || {{
echo "An error occurred during the MEME-AME analysis"; touch {output.ame_results} {output.ame_html}; exit 0;
}}
"""

rule postprocess_meme_ame_results:
input:
ame_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', 'ame.tsv')
output:
aggregated_results = os.path.join(result_path, '{region_set}', 'MEME_AME', '{database}', '{region_set}_{database}.csv')
threads: config.get("threads", 1)
resources:
mem_mb=config.get("mem", "4000"),
# conda:
# "../envs/python.yaml"
log:
"logs/rules/postprocess_meme_ame_results_{region_set}_{database}.log"
script:
"../scripts/postprocess_meme_ame_results.py"


# performs gene over-represenation analysis (ORA) using GSEApy
rule gene_ORA_GSEApy:
input:
Expand Down
33 changes: 32 additions & 1 deletion workflow/scripts/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
# concatenate all results into one results dataframe
result_df = pd.concat(results_list, axis=0)

# save all enirchment results
# save all enrichment results
result_df.to_csv(results_all_path)

# find union of statistically significant terms
Expand All @@ -61,3 +61,34 @@

# save filtered enirchment results by significance
result_sig_df.to_csv(results_sig_path)

# If pycistarget, then also save motif hits
if tool == "pycisTarget":
hits_list = []
cistrome_list = []

for result_path in result_paths:
if os.stat(result_path).st_size != 0:
tmp_name = os.path.basename(result_path).replace("_{}.csv".format(db),"")

hits_path = result_path.replace(".csv", ".motif_hits.csv")
cistrome_path = result_path.replace(".csv", ".cistromes.csv")

# add hits to results
tmp_res = pd.read_csv(hits_path)
tmp_res['name'] = tmp_name
hits_list.append(tmp_res)

# add cistrome to results
tmp_res = pd.read_csv(cistrome_path)
tmp_res['name'] = tmp_name
cistrome_list.append(tmp_res)

# concatenate all results into one results dataframe
hits_df = pd.concat(hits_list, axis=0)
hits_df.to_csv(results_all_path.replace(".csv", ".motif_hits.csv"))

cistrome_df = pd.concat(cistrome_list, axis=0)
cistrome_df.to_csv(results_all_path.replace(".csv", ".cistromes.csv"))


52 changes: 52 additions & 0 deletions workflow/scripts/postprocess_meme_ame_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd
import numpy as np
import os
import sys


########### FUNCTIONS ###########
def check_empty_tsv_minus_comments(file_path, comment_char='#'):
with open(file_path, 'r') as file:
for line in file:
# Skip lines that start with the comment character
if not line.strip().startswith(comment_char) and line.strip():
return False # Found non-comment, non-empty line
return True # All lines were either comments or empty


#### Process MEME-AME results to aggregate and clean the data.
input_file = snakemake.input['ame_results']
output_file = snakemake.output['aggregated_results']


# Ensure output directory exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# quit early if file is empty
if check_empty_tsv_minus_comments(input_file):
open(output_file, 'w').close()
quit()
else:
# Process the MEME-AME results
try:
# Read the file, skipping blank lines and comments
df = pd.read_csv(input_file, sep='\t', comment='#', skip_blank_lines=True)

# Drop any completely empty rows
df.dropna(how='all', inplace=True)

# Calculate effect size (i.e. log2 fold enrich)
df["FoldEnrich"] = df["%TP"] / df["%FP"]

# Clean up colum names so that it's R friendly
df.columns = df.columns.str.replace("-", "_").str.replace("%", "perc")

# Save the cleaned and aggregated results as a CSV
df.to_csv(output_file, index=False)

print(f"Processed MEME-AME results saved to {output_file}")

except Exception as e:
print(f"Error processing MEME-AME results: {e}")
sys.exit(1)

Loading