44import pybedtools as bedtools
55
66# extract promoter regions
7- def get_promoter (feature , upstream , downstream ):
7+ def get_promoter (feature , upstream , downstream , chrom_sizes ):
88 if feature .strand == '+' :
99 start = feature .start - upstream
1010 end = feature .start + downstream
@@ -14,6 +14,8 @@ def get_promoter(feature, upstream, downstream):
1414
1515 # Ensure start is not negative
1616 start = max (start , 0 )
17+ # Ensure end is not longer than chromosome
18+ end = min (end , chrom_sizes .get (feature .chrom , end ))
1719
1820 # Extract gene_id and remove version number if present
1921 gene_id = feature .attrs ['gene_id' ].split ('.' )[0 ]
@@ -46,15 +48,19 @@ def get_promoter(feature, upstream, downstream):
4648
4749# load the genome annotation file using pybedtools
4850gtf = bedtools .BedTool (gtf_file )
49- # load and get list of valid chromosomes
51+
52+ # load and get list of valid chromosomes and sizes
53+ chrom_sizes = {}
5054with open (chrom_file , 'r' ) as f :
51- valid_chromosomes = {line .split ('\t ' )[0 ] for line in f }
55+ for line in f :
56+ chrom , size = line .strip ().split ('\t ' )
57+ chrom_sizes [chrom ] = int (size )
5258
5359# filter for features that are genes and create promoters
54- promoters = gtf .filter (lambda x : x [2 ] == 'gene' ).each (get_promoter , TSS_up , TSS_dn )
60+ promoters = gtf .filter (lambda x : x [2 ] == 'gene' ).each (get_promoter , TSS_up , TSS_dn , chrom_sizes )
5561
5662# filter for valid chromosomes
57- promoters = promoters .filter (lambda x : x .chrom in valid_chromosomes )
63+ promoters = promoters .filter (lambda x : x .chrom in chrom_sizes )
5864
5965# sort promoter regions
6066promoters = promoters .sort (faidx = chrom_file )
0 commit comments