22
33#### libraries
44import pybedtools as bedtools
5+ import pandas as pd
56
67# extract promoter regions
78def get_promoter (feature , upstream , downstream , chrom_sizes ):
@@ -26,7 +27,7 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
2627 start ,
2728 end ,
2829 gene_id ,
29- # feature.attrs['gene_name'] if 'gene_name' in feature.attrs else feature.attrs['gene_id'],
30+ feature .attrs ['gene_name' ] if 'gene_name' in feature .attrs else feature .attrs ['gene_id' ],
3031# '.',
3132# feature.strand
3233 ])
@@ -41,10 +42,12 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
4142
4243# output
4344promoter_regions_path = snakemake .output ["promoter_regions" ]
45+ promoter_annot_path = snakemake .output ["promoter_annot" ]
4446
4547# parameters
4648TSS_up = snakemake .config ["proximal_size_up" ]
4749TSS_dn = snakemake .config ["proximal_size_dn" ]
50+ genome_fasta_path = snakemake .config ["genome_fasta" ]
4851
4952# load the genome annotation file using pybedtools
5053gtf = bedtools .BedTool (gtf_file )
@@ -68,3 +71,14 @@ def get_promoter(feature, upstream, downstream, chrom_sizes):
6871
6972# save the promoter regions to a BED file
7073promoters .saveas (promoter_regions_path )
74+
75+ # calculate GC content and length for each region and save as annotation
76+ gc_content_length = promoters .nucleotide_content (fi = genome_fasta_path ).to_dataframe ()
77+ gc_content_length .columns = [col .split ('_' , 1 )[- 1 ].replace ('at' , 'AT' ).replace ('gc' , 'GC' ).replace ('oth' , 'otherBases' ) for col in gc_content_length .columns ]
78+ gc_content_length = gc_content_length .add_prefix ('bedtools_' )
79+ gc_content_length .columns = ["chr" , "start" , "end" , "gene" , "gene_name" ] + gc_content_length .columns [5 :].tolist ()
80+ gc_content_length .set_index ("gene" , inplace = True )
81+ gc_content_length .to_csv (promoter_annot_path )
82+
83+ # load, remove last column (gene name) and save again as final promoter BED file for quantification
84+ bedtools .BedTool (promoter_regions_path ).cut (range (0 , 4 )).saveas (promoter_regions_path )
0 commit comments