provide TSS bed file

sreichl · sreichl · commit b0d6cd356940 · 2024-04-29T16:43:28.000+02:00
diff --git a/README.md b/README.md
@@ -79,14 +79,15 @@ The processing and quantification described here was performed using a publicly
     - MultiQC report generation using MultiQC, extended with an in-house developed plugin [atacseq_report](./workflow/scripts/multiqc_atacseq).
 - Quantification (counts/)
     - Consensus region set generation across all called peaks (consensus_regions.bed).
-    - Read count quantification of the consensus regions across samples, yielding a count matrix with dimensions regions X samples (consensus_counts.csv).
-    - Peak support quantification of the consensus regions across samples, yielding a count matrix with dimensions regions X samples (support_counts.csv).
+    - Read count quantification of the consensus regions across samples, yielding a count matrix with dimensions consensus regions X samples (consensus_counts.csv).
+    - Peak support quantification of the consensus regions across samples, yielding a count matrix with dimensions consensus regions X samples (support_counts.csv).
     - Consensus regions mapped to closest gene TSS according to HOMER (Distance to TSS) within proximal TSS up and downstream distances (TSS_counts.csv).
     - Read count quantification of promoter regions based on provided proximal TSS up and downstream distances (promoter_regions.bed and promoter_counts.csv).
+      - [Pseudoautosomal regions in human](https://www.ensembl.org/info/genome/genebuild/human_PARS.html) chromosome Y are skipped.
     - Aggregation of all sample-wise HOMER known motif enrichment results into one CSV in long-format (HOMER_knownMotifs.csv).
 - Annotation (counts/)
-    - Sample annotation file based on MultiQC general stats (annotation.csv)
-    - Consensus region set annotation using (region_annotation.csv) 
+    - Sample annotation file based on MultiQC general stats and provided annotations for downstream analysis (annotation.csv)
+    - Consensus region set annotation (consensus_annotation.csv) using
       - UROPA with regulatory build and gencode as references
       - HOMER with annotatePeaks.pl
       - bedtools for nucleotide counts/content (e.g., % of GC)
diff --git a/config/config.yaml b/config/config.yaml
@@ -43,6 +43,7 @@ tss_size: 100
 
 # assumed TSS proximal distance upstream/downstream (up/dn)
 # additionally, used for promoter region quantification and to map consensus regions to closest TSS
+# for reference: the defaults of GenomicRanges::promoters(x, upstream=2000, downstream=200) are upstream=2000 and downstream=200
 proximal_size_up: 1000
 proximal_size_dn: 500
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -47,7 +47,7 @@ rule all:
         tss_counts = os.path.join(result_path,"counts","TSS_counts.csv") if len(samples_quantify)>0 else [],
         HOMER_knownMotifs = os.path.join(result_path,"counts","HOMER_knownMotifs.csv") if len(samples_quantify)>0 else [],
         # ANNOTATION
-        region_annotation = os.path.join(result_path,'counts',"region_annotation.csv") if len(samples_quantify)>0 else [],
+        consensus_annotation = os.path.join(result_path,'counts',"consensus_annotation.csv") if len(samples_quantify)>0 else [],
         # EXPORT environments and configurations
         envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=envs),
         configs = os.path.join(config["result_path"],'configs',module_name,'{}_config.yaml'.format(config["project_name"])),
diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
@@ -159,11 +159,12 @@ rule homer_aggregate:
 # map consensus regions to closest TSS per gene
 rule map_consensus_tss:
     input:
-        region_annotation = os.path.join(result_path,'counts',"region_annotation.csv"),
+        region_annotation = os.path.join(result_path,'counts',"consensus_annotation.csv"),
         consensus_counts = os.path.join(result_path,"counts","consensus_counts.csv"),
     output:
         tss_counts = os.path.join(result_path,"counts","TSS_counts.csv"),
         tss_annot = os.path.join(result_path,"counts","TSS_annotation.csv"),
+        tss_bed = os.path.join(result_path,"counts","TSS_regions.bed"),
     params:
         # cluster parameters
         partition=config.get("partition"),
diff --git a/workflow/rules/region_annotation.smk b/workflow/rules/region_annotation.smk
@@ -154,7 +154,7 @@ rule region_annotation_aggregate:
         homer_annotations = os.path.join(result_path,"tmp","homer_annotations.tsv"),
         bedtools_annotation = os.path.join(result_path, "tmp", "bedtools_annotation.bed"),
     output:
-        region_annotation = os.path.join(result_path,'counts',"region_annotation.csv"),
+        region_annotation = os.path.join(result_path,'counts',"consensus_annotation.csv"),
     params:
         # cluster parameters
         partition=config.get("partition"),
diff --git a/workflow/scripts/get_promoter_regions.py b/workflow/scripts/get_promoter_regions.py
@@ -56,7 +56,7 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
         chrom, size = line.strip().split('\t')
         chrom_sizes[chrom] = int(size)
 
-# filter for features that are genes AND not Pseudoautosomal regions denoted by "PAR" and create promoters
+# filter for features that are genes AND not pseudoautosomal regions denoted by "PAR" and create promoters
 # https://www.ensembl.org/info/genome/genebuild/human_PARS.html
 promoters = gtf.filter(lambda x: (x[2] == 'gene') & ("PAR" not in x["gene_id"])).each(get_promoter, TSS_up, TSS_dn, chrom_sizes)
 
diff --git a/workflow/scripts/map_consensus_tss.py b/workflow/scripts/map_consensus_tss.py
@@ -2,6 +2,7 @@
 
 #### libraries
 import pandas as pd
+import pybedtools as bedtools
 
 # map region to gene and classify if TSS
 def map_region(x):
@@ -20,6 +21,7 @@ def map_region(x):
 # output
 tss_counts_path = snakemake.output["tss_counts"]
 tss_annot_path = snakemake.output["tss_annot"]
+tss_bed_path = snakemake.output["tss_bed"]
 
 # parameters
 TSS_up = -snakemake.config["proximal_size_up"]
@@ -42,5 +44,13 @@ def map_region(x):
 annot_regions.set_index('peak_id', inplace=True)
 TSS_annot = annot_regions.loc[TSS_regions["peak_id"],:]
 TSS_annot.reset_index(inplace=True)
+TSS_annot = TSS_annot.sort_values(by="peak_id")
 TSS_annot.index = TSS_regions.index
-TSS_annot.to_csv(tss_annot_path)
+
+TSS_annot.to_csv(tss_annot_path)
+
+# save bed file of TSS regions (columns: gencode_chr, gencode_start, gencode_end, homer_Nearest_Ensembl)
+TSS_annot.reset_index(inplace=True)
+TSS_bed_df = TSS_annot[["gencode_chr",  'gencode_start', 'gencode_end', 'homer_Nearest_Ensembl']]
+TSS_bed = bedtools.BedTool.from_dataframe(TSS_bed_df)  # pybedtools is imported under the alias `bedtools`
+TSS_bed.saveas(tss_bed_path)