|
1 | 1 | #A little script to construct an annotation from an SJ.out.tab file |
2 | | - |
| 2 | +import argparse |
3 | 3 | import pandas as pd |
4 | 4 | import os, sys |
5 | 5 |
|
def _read_fasta_lengths(fasta_path):
    """Return a dict mapping each FASTA record name to its sequence length.

    The record name is the first whitespace-delimited token after '>'.
    Only lengths are accumulated; the sequences themselves are not kept.
    """
    lengths = {}
    gene = None
    with open(fasta_path, 'r') as handle:
        for raw in handle:
            line = raw.rstrip('\r\n')
            if line.startswith('>'):
                # Strip the line ending *before* extracting the name, otherwise
                # a header without a description keeps its trailing newline.
                fields = line[1:].split()
                gene = fields[0] if fields else ''
                lengths[gene] = 0
            elif gene is not None:
                lengths[gene] += len(line)
    return lengths


def _gtf_exon_line(gene, start, end, exon_number):
    """Format one 'exon' record of the output GTF for block `exon_number`."""
    attributes = 'gene_id "{0}"; transcript_id "{0}"; exon_number {1}; exon_id "{0}:{1}"'.format(gene, exon_number)
    columns = [str(gene), 'SuperTranscript', 'exon', str(start), str(end), '.', '.', '0', attributes]
    return '\t'.join(columns) + '\n'


def Mobius(sjfile, gfile, ft, flat_ann, min_reads=5, out_path='Spliced.gtf'):
    """Build a block annotation (GTF) from a splice junction table.

    Every gene in `gfile` is cut into consecutive blocks at the supported
    splice junctions listed in `sjfile` and, when `ft` is true, at the
    transcript starts/ends listed in `flat_ann`. One 'exon' line is written
    per block.

    Parameters:
        sjfile    -- tab-separated splice junction file with the nine columns
                     Gene, Start, End, strand, intron motif, Annotated,
                     Unique, Multi, Overhang (no header line).
        gfile     -- FASTA file with one record per gene; the record length
                     is used as the end of the gene.
        ft        -- if true, also force block boundaries at transcript
                     starts and ends taken from `flat_ann`.
        flat_ann  -- tab-separated, nine-column flattened annotation; its
                     first two lines are skipped and the second ';'-separated
                     attribute of the last column must hold the quoted
                     transcript id. Only read when `ft` is true.
        min_reads -- a junction is used only if Unique + Multi is strictly
                     greater than this value (default 5).
        out_path  -- path of the GTF file to write (default 'Spliced.gtf').

    Returns None; the result is the file written to `out_path`.
    """
    # dtype=str on the gene column keeps purely numeric gene names comparable
    # with the (string) names read from the FASTA file.
    sj = pd.read_csv(sjfile, sep='\t', header=None, dtype={'Gene': str},
                     names=['Gene', 'Start', 'End', 'strand', 'intron motif',
                            'Annotated', 'Unique', 'Multi', 'Overhang'])

    #Read in SuperTranscript fasta file (from the gfile argument, not sys.argv)
    glength = _read_fasta_lengths(gfile)

    #Make a dictionary for each gene, which holds a list of block boundaries
    slist = {}
    for gene, start, end, unique, multi in zip(sj['Gene'], sj['Start'], sj['End'], sj['Unique'], sj['Multi']):
        # Every gene seen in the junction file starts with a boundary at base 1,
        # even if none of its junctions pass the read filter.
        coords = slist.setdefault(gene, [1])
        # Read support is unique + multi-mapping reads; the Overhang column
        # is not a read count and must not be part of the sum.
        if unique + multi > min_reads:
            coords.append(int(start))    #This is actually the intron start (i.e. one base past the exon)
            coords.append(int(end) + 1)  #Junction end, plus one for the actual start of the next exon

    #If forcing transcript start and ends
    if ft:
        igtf = pd.read_csv(flat_ann, sep='\t', header=None, skiprows=2, dtype={'Gene': str},
                           names=['Gene', 'Source', 'Type', 'Start', 'End', 'v1', 'v2', 'v3', 'v4'])
        prev_gene = None
        prev_trans = None
        prev_end = None
        for gene, start, end, attributes in zip(igtf['Gene'], igtf['Start'], igtf['End'], igtf['v4']):
            trans = attributes.split(';')[1].split('"')[1]

            #If gene not already in spliced dict, then make a key with an empty list value
            coords = slist.setdefault(gene, [])

            # NOTE(review): transcript ends are stored as-is (not end + 1), so
            # the block closing a transcript stops one base before its end.
            # This looks like an off-by-one but is left unchanged -- confirm.
            if gene != prev_gene:
                #Beginning a whole new gene
                coords.append(int(start))
                if prev_end is not None:
                    slist[prev_gene].append(prev_end)  #End of the last transcript of the previous gene
            elif trans != prev_trans:
                #Switched to another transcript of the same gene
                coords.append(prev_end)
                coords.append(int(start))

            prev_gene, prev_trans, prev_end = gene, trans, int(end)

        # The loop only flushes a transcript end when the gene changes, so the
        # last gene of the file needs its final end added here.
        if prev_end is not None:
            slist[prev_gene].append(prev_end)

    #Create gtf file; the with-block guarantees it is flushed and closed
    with open(out_path, 'w') as gtf:
        for key, length in glength.items():
            exon_counter = 1
            if key in slist:
                #Sort the boundaries of this gene, only keep unique elements
                bounds = sorted(set(slist[key]))
                #Each pair of neighbouring boundaries makes a block; minus one
                #because the block ends just before the next boundary
                for left, right in zip(bounds, bounds[1:]):
                    gtf.write(_gtf_exon_line(key, left, right - 1, exon_counter))
                    exon_counter += 1
                last = bounds[-1]
            else:
                last = 1

            #If not forcing transcript ends, add the block from the last
            #boundary to the end of the SuperTranscript
            if not ft and last != length:
                gtf.write(_gtf_exon_line(key, last, length, exon_counter))
| 85 | + |
| 86 | + |
if __name__ == '__main__':

    #Make argument parser
    parser = argparse.ArgumentParser()

    #Add Arguments
    parser.add_argument("SpliceJunctions",help="The name of the Splice Junctions tab file (in the format of the one STAR produces)")
    parser.add_argument("GenomeFasta",help="A fasta file containing the sequence for all genes in genome")
    parser.add_argument("-forceTrans","-ft",help="Force blocks where annotated transcripts start and end",action='store_true')
    parser.add_argument("-AnnoTrans","-a",help="Flattened SuperTranscript annotation file",default="")
    args = parser.parse_args()

    #Forcing transcript boundaries reads the flattened annotation, so fail
    #with a usage message here instead of an error on an empty file path
    if args.forceTrans and not args.AnnoTrans:
        parser.error("-forceTrans requires -AnnoTrans (the flattened SuperTranscript annotation file)")

    print('Constructing Dynamic Blocks')
    Mobius(args.SpliceJunctions, args.GenomeFasta, args.forceTrans, args.AnnoTrans)
    print('Done')
0 commit comments