Skip to content

Commit b4be61b

Browse files
committed
add annotation for promoter regions #41
1 parent b0d6cd3 commit b4be61b

File tree

3 files changed

+18
-6
lines changed

3 files changed

+18
-6
lines changed

workflow/rules/quantification.smk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ rule get_promoter_regions:
2525
config["gencode_gtf"],
2626
output:
2727
promoter_regions = os.path.join(result_path,"counts","promoter_regions.bed"),
28+
promoter_annot = os.path.join(result_path,"counts","promoter_annotation.csv"),
2829
params:
2930
# cluster parameters
3031
partition=config.get("partition"),

workflow/scripts/get_promoter_regions.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#### libraries
44
import pybedtools as bedtools
5+
import pandas as pd
56

67
# extract promoter regions
78
def get_promoter(feature, upstream, downstream, chrom_sizes):
@@ -26,7 +27,7 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
2627
start,
2728
end,
2829
gene_id,
29-
# feature.attrs['gene_name'] if 'gene_name' in feature.attrs else feature.attrs['gene_id'],
30+
feature.attrs['gene_name'] if 'gene_name' in feature.attrs else feature.attrs['gene_id'],
3031
# '.',
3132
# feature.strand
3233
])
@@ -41,10 +42,12 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
4142

4243
# output
4344
promoter_regions_path = snakemake.output["promoter_regions"]
45+
promoter_annot_path = snakemake.output["promoter_annot"]
4446

4547
# parameters
4648
TSS_up = snakemake.config["proximal_size_up"]
4749
TSS_dn = snakemake.config["proximal_size_dn"]
50+
genome_fasta_path = snakemake.config["genome_fasta"]
4851

4952
# load the genome annotation file using pybedtools
5053
gtf = bedtools.BedTool(gtf_file)
@@ -68,3 +71,14 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
6871

6972
# save the promoter regions to a BED file
7073
promoters.saveas(promoter_regions_path)
74+
75+
# calculate nucleotide content (including GC content) and length for each promoter region and save it as the promoter annotation
76+
gc_content_length = promoters.nucleotide_content(fi=genome_fasta_path).to_dataframe()
77+
gc_content_length.columns = [col.split('_', 1)[-1].replace('at', 'AT').replace('gc', 'GC').replace('oth', 'otherBases') for col in gc_content_length.columns]
78+
gc_content_length = gc_content_length.add_prefix('bedtools_')
79+
gc_content_length.columns = ["chr", "start", "end", "gene", "gene_name"] + gc_content_length.columns[5:].tolist()
80+
gc_content_length.set_index("gene", inplace=True)
81+
gc_content_length.to_csv(promoter_annot_path)
82+
83+
# reload the saved BED file, drop the last column (gene name), and save it again under the same path as the final promoter BED file for quantification
84+
bedtools.BedTool(promoter_regions_path).cut(range(0, 4)).saveas(promoter_regions_path)

workflow/scripts/map_consensus_tss.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,10 @@ def map_region(x):
4444
annot_regions.set_index('peak_id', inplace=True)
4545
TSS_annot = annot_regions.loc[TSS_regions["peak_id"],:]
4646
TSS_annot.reset_index(inplace=True)
47-
TSS_annot = TSS_annot.sort_values(by="peak_id")
4847
TSS_annot.index = TSS_regions.index
49-
5048
TSS_annot.to_csv(tss_annot_path)
5149

5250
# save bed file of TSS regions
53-
TSS_annot.reset_index(inplace=True)
54-
TSS_bed_df = TSS_annot[["gencode_chr", 'gencode_start', 'gencode_end', 'homer_Nearest_Ensembl']]
55-
TSS_bed = pybedtools.BedTool.from_dataframe(TSS_bed_df)
51+
TSS_bed_df = TSS_annot.sort_values(by="peak_id")[["gencode_chr", 'gencode_start', 'gencode_end', 'homer_Nearest_Ensembl']]
52+
TSS_bed = bedtools.BedTool.from_dataframe(TSS_bed_df)
5653
TSS_bed.saveas(tss_bed_path)

0 commit comments

Comments
 (0)