Skip to content

Commit b0d6cd3

Browse files
committed
provide TSS bed file
1 parent c0f1350 commit b0d6cd3

File tree

7 files changed

+22
-9
lines changed

7 files changed

+22
-9
lines changed

README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,15 @@ The processing and quantification described here was performed using a publicly
7979
- MultiQC report generation using MultiQC, extended with an in-house developed plugin [atacseq_report](./workflow/scripts/multiqc_atacseq).
8080
- Quantification (counts/)
8181
- Consensus region set generation across all called peaks (consensus_regions.bed).
82-
- Read count quantification of the consensus regions across samples, yielding a count matrix with dimensions regions X samples (consensus_counts.csv).
83-
- Peak support quantification of the consensus regions across samples, yielding a count matrix with dimensions regions X samples (support_counts.csv).
82+
- Read count quantification of the consensus regions across samples, yielding a count matrix with dimensions consensus regions X samples (consensus_counts.csv).
83+
- Peak support quantification of the consensus regions across samples, yielding a count matrix with dimensions consensus regions X samples (support_counts.csv).
8484
- Consensus regions mapped to closest gene TSS according to HOMER (Distance to TSS) within proximal TSS up and downstream distances (TSS_counts.csv).
8585
- Read count quantification of promoter regions based on provided proximal TSS up and downstream distances (promoter_regions.bed and promoter_counts.csv).
86+
- [Pseudoautosomal regions in human](https://www.ensembl.org/info/genome/genebuild/human_PARS.html) chromosome Y are skipped.
8687
- Aggregation of all sample-wise HOMER known motif enrichment results into one CSV in long-format (HOMER_knownMotifs.csv).
8788
- Annotation (counts/)
88-
- Sample annotation file based on MultiQC general stats (annotation.csv)
89-
- Consensus region set annotation using (region_annotation.csv)
89+
- Sample annotation file based on MultiQC general stats and provided annotations for downstream analysis (annotation.csv)
90+
- Consensus region set annotation using (consensus_annotation.csv)
9091
- UROPA with regulatory build and gencode as references
9192
- HOMER with annotatePeaks.pl
9293
- bedtools for nucleotide counts/content (e.g., % of GC)

config/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ tss_size: 100
4343

4444
# assumed TSS proximal distance upstream/downstream (up/dn)
4545
# additionally, used for promoter region quantification and to map consensus regions to closest TSS
46+
# default of GenomicRanges::promoters(x, upstream=2000, downstream=200)
4647
proximal_size_up: 1000
4748
proximal_size_dn: 500
4849

workflow/Snakefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ rule all:
4747
tss_counts = os.path.join(result_path,"counts","TSS_counts.csv") if len(samples_quantify)>0 else [],
4848
HOMER_knownMotifs = os.path.join(result_path,"counts","HOMER_knownMotifs.csv") if len(samples_quantify)>0 else [],
4949
# ANNOTATION
50-
region_annotation = os.path.join(result_path,'counts',"region_annotation.csv") if len(samples_quantify)>0 else [],
50+
consensus_annotation = os.path.join(result_path,'counts',"consensus_annotation.csv") if len(samples_quantify)>0 else [],
5151
# EXPORT environments and configurations
5252
envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=envs),
5353
configs = os.path.join(config["result_path"],'configs',module_name,'{}_config.yaml'.format(config["project_name"])),

workflow/rules/quantification.smk

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,12 @@ rule homer_aggregate:
159159
# map consensus regions to closest TSS per gene
160160
rule map_consensus_tss:
161161
input:
162-
region_annotation = os.path.join(result_path,'counts',"region_annotation.csv"),
162+
region_annotation = os.path.join(result_path,'counts',"consensus_annotation.csv"),
163163
consensus_counts = os.path.join(result_path,"counts","consensus_counts.csv"),
164164
output:
165165
tss_counts = os.path.join(result_path,"counts","TSS_counts.csv"),
166166
tss_annot = os.path.join(result_path,"counts","TSS_annotation.csv"),
167+
tss_bed = os.path.join(result_path,"counts","TSS_regions.bed"),
167168
params:
168169
# cluster parameters
169170
partition=config.get("partition"),

workflow/rules/region_annotation.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ rule region_annotation_aggregate:
154154
homer_annotations = os.path.join(result_path,"tmp","homer_annotations.tsv"),
155155
bedtools_annotation = os.path.join(result_path, "tmp", "bedtools_annotation.bed"),
156156
output:
157-
region_annotation = os.path.join(result_path,'counts',"region_annotation.csv"),
157+
region_annotation = os.path.join(result_path,'counts',"consensus_annotation.csv"),
158158
params:
159159
# cluster parameters
160160
partition=config.get("partition"),

workflow/scripts/get_promoter_regions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
5656
chrom, size = line.strip().split('\t')
5757
chrom_sizes[chrom] = int(size)
5858

59-
# filter for features that are genes AND not Pseudoautosomal regions denoted by "PAR" and create promoters
59+
# filter for features that are genes AND not pseudoautosomal regions denoted by "PAR" and create promoters
6060
# https://www.ensembl.org/info/genome/genebuild/human_PARS.html
6161
promoters = gtf.filter(lambda x: (x[2] == 'gene') & ("PAR" not in x["gene_id"])).each(get_promoter, TSS_up, TSS_dn, chrom_sizes)
6262

workflow/scripts/map_consensus_tss.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#### libraries
44
import pandas as pd
5+
import pybedtools as bedtools
56

67
# map region to gene and classify if TSS
78
def map_region(x):
@@ -20,6 +21,7 @@ def map_region(x):
2021
# output
2122
tss_counts_path = snakemake.output["tss_counts"]
2223
tss_annot_path = snakemake.output["tss_annot"]
24+
tss_bed_path = snakemake.output["tss_bed"]
2325

2426
# parameters
2527
TSS_up = -snakemake.config["proximal_size_up"]
@@ -42,5 +44,13 @@ def map_region(x):
4244
annot_regions.set_index('peak_id', inplace=True)
4345
TSS_annot = annot_regions.loc[TSS_regions["peak_id"],:]
4446
TSS_annot.reset_index(inplace=True)
47+
TSS_annot = TSS_annot.sort_values(by="peak_id")
4548
TSS_annot.index = TSS_regions.index
46-
TSS_annot.to_csv(tss_annot_path)
49+
50+
TSS_annot.to_csv(tss_annot_path)
51+
52+
# save bed file of TSS regions
53+
TSS_annot.reset_index(inplace=True)
54+
TSS_bed_df = TSS_annot[["gencode_chr", 'gencode_start', 'gencode_end', 'homer_Nearest_Ensembl']]
55+
TSS_bed = pybedtools.BedTool.from_dataframe(TSS_bed_df)
56+
TSS_bed.saveas(tss_bed_path)

0 commit comments

Comments
 (0)