Skip to content

Commit 6f4cbd4

Browse files
authored
Merge pull request #16 from Quarkins/master
Updating periphery scripts
2 parents 2272ecc + af36eb2 commit 6f4cbd4

File tree

2 files changed

+123
-93
lines changed

2 files changed

+123
-93
lines changed

Mobius.py

Lines changed: 84 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,101 @@
11
#A little script to construct an annotation from an SJ.out.tab file
2-
2+
import argparse
33
import pandas as pd
44
import os, sys
55

6+
def Mobius(sjfile,gfile,ft,flat_ann):
7+
#First read in the SJ.out.tab as sys.argv[1]
8+
sj = pd.read_csv(sjfile,sep='\t',header=None,names=['Gene','Start','End','strand','intron motif','Annotated','Unique','Multi','Overhang'])
9+
10+
#Read in SuperTranscript fasta file
11+
sf = open(sys.argv[2],'r')
12+
glength = {} #A dictionary holding the SuperTranscript gene lengths
13+
gene=''
14+
for line in sf:
15+
if('>' in line):
16+
gene= (line.split(' ')[0]).split('>')[1]
17+
glength[gene] = ''
18+
else:
19+
glength[gene] = glength[gene] + line.split('\n')[0].split('\r')[0]
620

7-
#First read in the SJ.out.tab as sys.argv[1]
8-
sj = pd.read_csv(sys.argv[1],sep='\t',header=None,names=['Gene','Start','End','strand','intron motif','Annotated','Unique','Multi','Overhang'])
9-
print(sj.head())
21+
#Create gtf file
22+
gtf = open('Spliced.gtf','w')
23+
slist = {}
1024

11-
#Read in SuperTranscript fasta file
12-
sf = open(sys.argv[2],'r')
13-
glength = {} #A dictionary holding the SuperTranscript gene lengths
14-
gene=''
15-
for line in sf:
16-
if('>' in line):
17-
gene= (line.split(' ')[0]).split('>')[1]
18-
glength[gene] = ''
19-
else:
20-
glength[gene] = glength[gene] + line.split('\n')[0].split('\r')[0]
2125

22-
#Create gtf file
23-
gtf = open('Spliced.gtf','w')
24-
curr_gene = sj.iloc[0,0]
25-
slist = {}
26+
#Make a dictionary for each gene, which holds a list of splice junction start and end points
27+
#For each row
28+
for i in range(0,len(sj['Gene'])):
29+
curr_gene = sj.iloc[i,0]
30+
if(curr_gene not in slist.keys()): slist[curr_gene] = [1]
31+
if((sj.iloc[i,7] + sj.iloc[i,8]) > 5): #More than 10 reads (either unique or multi spanninf junction)
32+
slist[curr_gene].append(int(sj.iloc[i,1])) #This is actually the intron start part (i.e one base over)
33+
slist[curr_gene].append(int(sj.iloc[i,2])+1) #This is the end of the exon-exon junction, so need to add one for the actual exon start place
2634

35+
#If forcing transcript start and ends
36+
if(ft):
37+
#Make a dictionary with the transcript list per gene
38+
igtf = pd.read_csv(flat_ann,sep='\t',header=None,names=['Gene','Source','Type','Start','End','v1','v2','v3','v4'],skiprows=2)
39+
prev_gene =''
40+
prev_trans=''
41+
for i in range(0,len(igtf)):
42+
curr_gene = igtf.iloc[i,0]
43+
curr_trans = igtf.iloc[i,8].split(';')[1].split('"')[1]
2744

28-
#Make a dictionary for each gene, which holds a list of splice junction start and end points
29-
#For each row
30-
for i in range(0,len(sj['Gene'])):
31-
curr_gene = sj.iloc[i,0]
45+
#If gene not already in spliced dict, then make a key with an empty list value
46+
if (curr_gene not in slist.keys()): slist[curr_gene]=[]
3247

33-
if(curr_gene not in slist.keys()): slist[curr_gene] = [1]
48+
#Switch transcripts in gene
49+
if(curr_trans != prev_trans and curr_gene==prev_gene): #Just switched from one transcript in gene to another one in same gene
50+
slist[curr_gene].append(prev_end) #End of previous transcript
51+
slist[curr_gene].append(igtf.iloc[i,3]) #Start of new transcript
3452

35-
if((sj.iloc[i,7] + sj.iloc[i,8]) > 10): #More than 10 reads (either unique or multi spanninf junction)
36-
slist[curr_gene].append(int(sj.iloc[i,1]))
37-
slist[curr_gene].append(int(sj.iloc[i,2])+1) #This is the end of the exon-exon junction, so need to add one for the actual exon start place
53+
#Switch gene
54+
if(curr_gene != prev_gene): #Beginning a whole new gene
55+
slist[curr_gene].append(igtf.iloc[i,3]) #Start site of first transcript in new gene
56+
if(i != 0): slist[prev_gene].append(prev_end) #Make sure add in the end point of the last transcript on the last gene
57+
58+
prev_gene = curr_gene
59+
prev_trans = curr_trans
60+
prev_end = igtf.iloc[i,4]
3861

39-
#Now sort each list for each gene, only keep unique elements
40-
for key in glength.keys():
41-
if(key in slist.keys()):
42-
slist[key] = list(set(slist[key]))
43-
slist[key].sort()
62+
#Now sort each list for each gene, only keep unique elements
63+
for key in glength.keys():
64+
exon_counter = 1
65+
if(key in slist.keys()):
66+
slist[key] = list(set(slist[key]))
67+
slist[key].sort()
4468

45-
#Now for each coord in list make a block
46-
for i in range(1,len(slist[key])):
47-
ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(slist[key][i-1]) + '\t' + str(slist[key][i]-1) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + '.' + '\n' #Note: need to minus one off end to account for the fact that the exon ends before the exon-exon boundary exists
48-
gtf.write(ann)
69+
#Now for each coord in list make a block
70+
for i in range(1,len(slist[key])):
71+
ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(slist[key][i-1]) + '\t' + str(slist[key][i]-1) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + 'gene_id "' +str(key) +'"; transcript_id "' + str(key) + '"; exon_number ' + str(exon_counter) + '; exon_id "' + str(key)+':'+str(exon_counter)+ '"' + '\n' #Note: need to minus one off end to account for the fact that the exon ends before the exon-exon boundary exists
72+
exon_counter +=1
73+
gtf.write(ann)
4974

5075

51-
#For the list splice junnction, we need to make a block from the last sj to the end of the ST
52-
if(key not in slist): last = 1
53-
else: last = slist[key][-1]
76+
#For the list splice junnction, we need to make a block from the last sj to the end of the ST
77+
if(key not in slist): last = 1
78+
else: last = slist[key][-1]
5479

55-
ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(last) + '\t' + str(len(glength[key])) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + '.' + '\n'
80+
#If not forcing transcript start and ends
81+
if(not ft):
82+
if(last != len(glength[key])):
83+
ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(last) + '\t' + str(len(glength[key])) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + 'gene_id "' +str(key) +'"; transcript_id "' + str(key) + '"; exon_number ' + str(exon_counter) + '; exon_id "' + str(key)+':'+str(exon_counter)+ '"' + '\n'
84+
gtf.write(ann)
85+
86+
87+
if __name__ == '__main__':
88+
89+
#Make argument parser
90+
parser = argparse.ArgumentParser()
91+
92+
#Add Arguments
93+
parser.add_argument("SpliceJunctions",help="The name of the Splice Junctions tab file (in the format of the one STAR produces)")
94+
parser.add_argument("GenomeFasta",help="A fasta file containing the sequence for all genes in genome")
95+
parser.add_argument("-forceTrans","-ft",help="Force blocks where annotated transcripts start and end",action='store_true')
96+
parser.add_argument("-AnnoTrans","-a",help="Flattened SuperTranscript annotation file",default="")
97+
args= parser.parse_args()
5698

57-
gtf.write(ann)
99+
print('Constructing Dynamic Blocks')
100+
Mobius(args.SpliceJunctions,args.GenomeFasta,args.forceTrans,args.AnnoTrans)
101+
print('Done')

STViewer.py

100644100755
Lines changed: 39 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
#Visualise a given gene in your super transcript
2-
import argparse
2+
33
import pandas as pd
44
import matplotlib.pyplot as plt
5-
plt.style.use('seaborn-white')
65
import numpy as np
76
import sys
87
import os
98
from matplotlib.pyplot import cm
9+
import seaborn as sns
1010
from matplotlib import gridspec
1111

12+
font = {'style' : 'oblique',
13+
'size' : 16}
14+
1215
##################################################
1316
###### Visualise blocks in SuperTranscript #######
1417
##################################################
@@ -33,7 +36,6 @@ def Visualise(gene_name):
3336
#Match transcripts to super transcript
3437
print("Producing match to super transcript")
3538
BLAT_command = "blat Super.fasta %s.fasta supercomp.psl" %(gene_name)
36-
#BLAT_command = "./blat Super.fasta %s.fasta -prot -tileSize=4 supercomp.psl" %(gene_name)
3739
os.system(BLAT_command)
3840

3941
#First read in BLAT output:
@@ -54,27 +56,26 @@ def Visualise(gene_name):
5456
#Get Super Transcript Length
5557
ST_length = int(vData.iloc[0,14])
5658

57-
#SuperTranscript as one block
58-
#plt.barh(len(transcripts),ST_length,color='Black',left=0)
59-
6059
gs = gridspec.GridSpec(2,1,height_ratios=[4,1])
6160
ax1=plt.subplot(gs[0])
6261
accum = 0
63-
for row in range(0,len(gff_data)):
64-
size = 1 + int(gff_data.iloc[row,4]) - int(gff_data.iloc[row,3]) #+1 for converting from gff co-ords to BLAT co-ords
65-
plt.barh(len(transcripts),size,color='#ffc024',left=accum,alpha=0.8)
66-
accum=accum+size
67-
#if(row > 0): plt.axvline(int(gff_data.iloc[row,3]),linestyle='dashed',color='black',linewidth=0.5)
62+
#for row in range(0,len(gff_data)):
63+
# size = 1 + int(gff_data.iloc[row,4]) - int(gff_data.iloc[row,3]) #1+ for converting co-ordinate systems
64+
# plt.barh(len(transcripts),size,color='#ffc024',left=accum,alpha=0.8)
65+
# accum=accum+size
6866

67+
plt.barh(len(transcripts),ST_length,color='#ffc024',left=0)
6968
plot_dict = {}
7069
col_dict = {}
7170
labs = []
7271

73-
col2=iter(cm.plasma(np.linspace(0,1,len(transcripts))))
72+
#col2=iter(cm.plasma(np.linspace(0,1,len(transcripts))))
73+
col2=iter(cm.terrain(np.linspace(0,1,len(transcripts))))
7474
for i,key in enumerate(transcripts):
7575
plot_dict[key] = i
7676
col_dict[key] = next(col2)
77-
lab = "T"+str(i)
77+
#lab = "T"+str(i)
78+
lab =""
7879
labs.append(lab)
7980

8081
#Empty vector to store coverage
@@ -101,9 +102,9 @@ def Visualise(gene_name):
101102
ind=np.arange(len(transcripts)+1)
102103
width=0.8
103104
labs.append('Super')
104-
plt.yticks(ind + width/2.,labs)
105-
plt.title('Breakdown of Super Transcript')
106-
plt.ylabel('Transcripts')
105+
plt.yticks(ind + width/2.,labs,fontsize="medium",fontweight="semibold")
106+
#plt.title('Breakdown of Super Transcript')
107+
plt.ylabel('Transcripts',fontdict=font)
107108

108109

109110
#################################
@@ -113,46 +114,31 @@ def Visualise(gene_name):
113114
#For a super block, how many transcripts cover that area....
114115
ax2=plt.subplot(gs[1],sharex=ax1)
115116
x = np.arange(ST_length)
116-
plt.bar(x,coverage,alpha=0.7)
117+
plt.bar(x,coverage,color='slategray',alpha=0.7)
117118
plt.xlim([0,ST_length+1])
118119

119120
#Fix x-axes
120121
ax2.set_yticklabels([])
121-
plt.xlabel('Bases')
122-
plt.ylabel('Coverage')
123-
plt.savefig("Visualise.pdf")
124-
#plt.show()
125-
126-
###########################################
127-
# Create GFF with transcript annotation ###
128-
###########################################
129-
130-
fg = open(gene_name + ".gff","w")
131-
#For each blat alignment
132-
for j in range(0,len(vData)):
133-
qStarts = vData.iloc[j,19].split(",")
134-
blocksizes = vData.iloc[j,18].split(",")
135-
for k in range(0,len(qStarts)-1): #Split qStarts, last in list is simply blank space
136-
fg.write(gene_name + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + qStarts[k] + '\t' + str(int(qStarts[k]) + int(blocksizes[k])) + '\t' + '.' + '\t' +'.' + '\t' + '0' + '\t' + 'gene_id "' + gene_name +'"; transcript_id "' + vData.iloc[j,9] + '";' + '\n')
137-
fg.close()
138-
139-
122+
plt.xlabel('Bases',fontdict=font)
123+
plt.ylabel('Coverage',fontdict=font)
124+
plt.savefig('GeneView.pdf')
125+
plt.show()
140126

141127
if __name__=='__main__':
142-
#Make argument parser
143-
parser = argparse.ArgumentParser()
144-
145-
#Add Arguments
146-
parser.add_argument("GeneName",help="The name of the gene whom you wish to view")
147-
args= parser.parse_args()
148-
if(not os.path.isfile(args.GeneName + ".fasta")):
149-
print("No fasta file for gene/cluster of interest\n")
150-
sys.exit()
151-
if(not os.path.isfile("SuperDuper.fasta")):
152-
print("No fasta file for SuperTranscript\n")
153-
sys.exit()
154-
if(not os.path.isfile("SuperDuper.gff")):
155-
print("No annotation file for SuperTranscript\n")
156-
sys.exit()
157-
158-
Visualise(args.GeneName)
128+
if(len(sys.argv) != 2):
129+
print("Visualisation function requires one argument\n")
130+
print("The gene whose super transcripts you wish to visualise\n")
131+
else:
132+
#Check all the super files are there
133+
if(not os.path.isfile(sys.argv[1] + ".fasta")):
134+
print("No fasta file for gene/cluster of interest\n")
135+
sys.exit()
136+
if(not os.path.isfile("SuperDuper.fasta")):
137+
print("No fasta file for SuperTranscript\n")
138+
sys.exit()
139+
if(not os.path.isfile("SuperDuper.gff")):
140+
print("No annotation file for SuperTranscript\n")
141+
sys.exit()
142+
143+
144+
Visualise(sys.argv[1])

0 commit comments

Comments
 (0)