Merge pull request #16 from Quarkins/master

Quarkins · web-flow · commit 6f4cbd48550f · 2016-09-21T14:02:04.000+10:00
Updating periphery scripts
diff --git a/Mobius.py b/Mobius.py
@@ -1,57 +1,101 @@
 #A little script to construct an annotation from an SJ.out.tab file
-
+import argparse
 import pandas as pd
 import os, sys
 
+def Mobius(sjfile,gfile,ft,flat_ann):
+    #First read in the SJ.out.tab file, whose path is given by sjfile
+    sj = pd.read_csv(sjfile,sep='\t',header=None,names=['Gene','Start','End','strand','intron motif','Annotated','Unique','Multi','Overhang'])
+
+    #Read in SuperTranscript fasta file (NOTE: the open below still uses sys.argv[2] rather than the gfile parameter - confirm)
+    sf = open(sys.argv[2],'r')
+    glength = {} #A dictionary holding the SuperTranscript gene lengths
+    gene=''
+    for line in sf:
+        if('>' in line):
+            gene= (line.split(' ')[0]).split('>')[1]
+            glength[gene] = ''
+        else:
+            glength[gene] = glength[gene] + line.split('\n')[0].split('\r')[0]
 
-#First read in the SJ.out.tab as sys.argv[1]
-sj = pd.read_csv(sys.argv[1],sep='\t',header=None,names=['Gene','Start','End','strand','intron motif','Annotated','Unique','Multi','Overhang'])
-print(sj.head())
+    #Create gtf file
+    gtf = open('Spliced.gtf','w')
+    slist = {}
 
-#Read in SuperTranscript fasta file
-sf = open(sys.argv[2],'r')
-glength = {} #A dictionary holding the SuperTranscript gene lengths
-gene=''
-for line in sf:
-	if('>' in line):
-		gene= (line.split(' ')[0]).split('>')[1]
-		glength[gene] = ''
-	else:
-		glength[gene] = glength[gene] + line.split('\n')[0].split('\r')[0]
 
-#Create gtf file
-gtf = open('Spliced.gtf','w')
-curr_gene = sj.iloc[0,0]	
-slist = {}
+    #Make a dictionary for each gene, which holds a list of splice junction start and end points
+    #For each row
+    for i in range(0,len(sj['Gene'])):
+        curr_gene = sj.iloc[i,0]
+        if(curr_gene not in slist.keys()): slist[curr_gene] = [1]
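+        if((sj.iloc[i,7] + sj.iloc[i,8]) > 5): #More than 5 reads spanning the junction (NOTE: columns 7 and 8 are named 'Multi' and 'Overhang' in the read_csv call above, not 'Unique' and 'Multi' - confirm intended columns)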
+            slist[curr_gene].append(int(sj.iloc[i,1])) #This is actually the intron start part (i.e one base over)
+            slist[curr_gene].append(int(sj.iloc[i,2])+1) #This is the end of the exon-exon junction, so need to add one for the actual exon start place
 
+    #If forcing transcript start and ends
+    if(ft): 
+        #Make a dictionary with the transcript list per gene
+        igtf = pd.read_csv(flat_ann,sep='\t',header=None,names=['Gene','Source','Type','Start','End','v1','v2','v3','v4'],skiprows=2)
+        prev_gene =''
+        prev_trans=''
+        for i in range(0,len(igtf)):
+            curr_gene = igtf.iloc[i,0]
+            curr_trans = igtf.iloc[i,8].split(';')[1].split('"')[1]
 
-#Make a dictionary for each gene, which holds a list of splice junction start and end points
-#For each row
-for i in range(0,len(sj['Gene'])):
-	curr_gene = sj.iloc[i,0]
+	    #If gene not already in spliced dict, then make a key with an empty list value
+            if (curr_gene not in slist.keys()): slist[curr_gene]=[]
 
-	if(curr_gene not in slist.keys()): slist[curr_gene] = [1]
+            #Switch transcripts in gene
+            if(curr_trans != prev_trans and curr_gene==prev_gene): #Just switched from one transcript in gene to another one in same gene
+                slist[curr_gene].append(prev_end) #End of previous transcript
+                slist[curr_gene].append(igtf.iloc[i,3]) #Start of new transcript
 
-	if((sj.iloc[i,7] + sj.iloc[i,8]) > 10): #More than 10 reads (either unique or multi spanninf junction)
-		slist[curr_gene].append(int(sj.iloc[i,1]))
-		slist[curr_gene].append(int(sj.iloc[i,2])+1) #This is the end of the exon-exon junction, so need to add one for the actual exon start place
+            #Switch gene
+            if(curr_gene != prev_gene): #Beginning a whole new gene
+                slist[curr_gene].append(igtf.iloc[i,3]) #Start site of first transcript in new gene
+                if(i != 0): slist[prev_gene].append(prev_end) #Make sure add in the end point of the last transcript on the last gene
+		
+            prev_gene = curr_gene
+            prev_trans = curr_trans
+            prev_end = igtf.iloc[i,4]
 
-#Now sort each list for each gene, only keep unique elements
-for key in glength.keys():
-	if(key in slist.keys()):
-		slist[key] = list(set(slist[key]))
-		slist[key].sort()
+     #Now sort each list for each gene, only keep unique elements
+    for key in glength.keys():
+        exon_counter = 1
+        if(key in slist.keys()):
+            slist[key] = list(set(slist[key]))
+            slist[key].sort()
 
-		#Now for each coord in list make a block
-		for i in range(1,len(slist[key])):
-			ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(slist[key][i-1])  + '\t' + str(slist[key][i]-1) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + '.' + '\n' #Note: need to minus one off end to account for the fact that the exon ends before the exon-exon boundary exists
-			gtf.write(ann)
+	    #Now for each coord in list make a block
+            for i in range(1,len(slist[key])):
+                    ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(slist[key][i-1])  + '\t' + str(slist[key][i]-1) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + 'gene_id "' +str(key) +'"; transcript_id "' + str(key) + '"; exon_number ' + str(exon_counter) + '; exon_id "' + str(key)+':'+str(exon_counter)+ '"' + '\n' #Note: need to minus one off end to account for the fact that the exon ends before the exon-exon boundary exists
+                    exon_counter +=1
+                    gtf.write(ann)
 
 
-	#For the list splice junnction, we need to make a block from the last sj to the end of the ST
-	if(key not in slist): last = 1
-	else: last = slist[key][-1]
+        #For the last splice junction, we need to make a block from the last sj to the end of the ST
+        if(key not in slist): last = 1
+        else: last = slist[key][-1]
  
-	ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(last)  + '\t' + str(len(glength[key])) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + '.' + '\n'
+	#If not forcing transcript start and ends
+        if(not ft):
+            if(last != len(glength[key])): 
+                ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(last)  + '\t' + str(len(glength[key])) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + 'gene_id "' +str(key) +'"; transcript_id "' + str(key) + '"; exon_number ' + str(exon_counter) + '; exon_id "' + str(key)+':'+str(exon_counter)+ '"' + '\n'
+                gtf.write(ann)
+
+
+if __name__ == '__main__':
+
+        #Make argument parser
+        parser = argparse.ArgumentParser()
+
+        #Add Arguments
+        parser.add_argument("SpliceJunctions",help="The name of the Splice Junctions tab file (in the format of the one STAR produces)")
+        parser.add_argument("GenomeFasta",help="A fasta file containing the sequence for all genes in genome")
+        parser.add_argument("-forceTrans","-ft",help="Force blocks where annotated transcripts start and end",action='store_true')
+        parser.add_argument("-AnnoTrans","-a",help="Flattened SuperTranscript annotation file",default="")
+        args= parser.parse_args()
 
-	gtf.write(ann)
+        print('Constructing Dynamic Blocks')
+        Mobius(args.SpliceJunctions,args.GenomeFasta,args.forceTrans,args.AnnoTrans)
+        print('Done')
diff --git a/STViewer.py b/STViewer.py
@@ -1,14 +1,17 @@
 #Visualise a given gene in your super transcript
-import argparse
+
 import pandas as pd
 import matplotlib.pyplot as plt
-plt.style.use('seaborn-white')
 import numpy as np
 import sys
 import os
 from matplotlib.pyplot import cm
+import seaborn as sns
 from matplotlib import gridspec
 
+font = {'style' : 'oblique',
+        'size'   : 16}
+
 ##################################################
 ###### Visualise blocks in SuperTranscript #######
 ##################################################
@@ -33,7 +36,6 @@ def Visualise(gene_name):
 	#Match transcripts to super transcript
 	print("Producing match to super transcript")
 	BLAT_command = "blat Super.fasta %s.fasta supercomp.psl" %(gene_name)
-	#BLAT_command = "./blat Super.fasta %s.fasta -prot -tileSize=4 supercomp.psl" %(gene_name)
 	os.system(BLAT_command)
 
 	#First read in BLAT output:
@@ -54,27 +56,26 @@ def Visualise(gene_name):
 	#Get Super Transcript Length
 	ST_length = int(vData.iloc[0,14])
 
-	#SuperTranscript as one block
-	#plt.barh(len(transcripts),ST_length,color='Black',left=0)
-
 	gs = gridspec.GridSpec(2,1,height_ratios=[4,1])
 	ax1=plt.subplot(gs[0])
 	accum = 0
-	for row in range(0,len(gff_data)):
-		size = 1 + int(gff_data.iloc[row,4]) - int(gff_data.iloc[row,3])  #+1 for converting from gff co-ords to BLAT co-ords
-		plt.barh(len(transcripts),size,color='#ffc024',left=accum,alpha=0.8)
-		accum=accum+size
-		#if(row > 0): plt.axvline(int(gff_data.iloc[row,3]),linestyle='dashed',color='black',linewidth=0.5)
+	#for row in range(0,len(gff_data)):
+	#	size = 1 + int(gff_data.iloc[row,4]) - int(gff_data.iloc[row,3]) #1+ for converting co-ordinate systems
+	#	plt.barh(len(transcripts),size,color='#ffc024',left=accum,alpha=0.8)
+	#	accum=accum+size
 
+	plt.barh(len(transcripts),ST_length,color='#ffc024',left=0)
 	plot_dict = {}
 	col_dict = {}
 	labs = []
 
-	col2=iter(cm.plasma(np.linspace(0,1,len(transcripts))))
+	#col2=iter(cm.plasma(np.linspace(0,1,len(transcripts))))
+	col2=iter(cm.terrain(np.linspace(0,1,len(transcripts))))
 	for i,key in enumerate(transcripts):
         	plot_dict[key] = i
         	col_dict[key] = next(col2)
-        	lab = "T"+str(i)
+        	#lab = "T"+str(i)
+        	lab =""
         	labs.append(lab)
 
 	#Empty vector to store coverage
@@ -101,9 +102,9 @@ def Visualise(gene_name):
 	ind=np.arange(len(transcripts)+1)
 	width=0.8
 	labs.append('Super')
-	plt.yticks(ind + width/2.,labs)
-	plt.title('Breakdown of Super Transcript')
-	plt.ylabel('Transcripts')
+	plt.yticks(ind + width/2.,labs,fontsize="medium",fontweight="semibold")
+	#plt.title('Breakdown of Super Transcript')
+	plt.ylabel('Transcripts',fontdict=font)
 
 
 	#################################
@@ -113,46 +114,31 @@ def Visualise(gene_name):
 	#For a super block, how many transcripts cover that area....
 	ax2=plt.subplot(gs[1],sharex=ax1)
 	x = np.arange(ST_length)
-	plt.bar(x,coverage,alpha=0.7)
+	plt.bar(x,coverage,color='slategray',alpha=0.7)
 	plt.xlim([0,ST_length+1])
 
 	#Fix x-axes
 	ax2.set_yticklabels([])
-	plt.xlabel('Bases')
-	plt.ylabel('Coverage')
-	plt.savefig("Visualise.pdf")
-	#plt.show()
-
-	###########################################
-	# Create GFF with transcript annotation ###
-	###########################################
-
-	fg = open(gene_name + ".gff","w")
-	#For each blat alignment
-	for j in range(0,len(vData)): 
-		qStarts = vData.iloc[j,19].split(",")
-		blocksizes  = vData.iloc[j,18].split(",")
-		for k in range(0,len(qStarts)-1): #Split qStarts, last in list is simply blank space
-			fg.write(gene_name + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + qStarts[k] + '\t' + str(int(qStarts[k]) + int(blocksizes[k])) + '\t' + '.' + '\t' +'.' + '\t' + '0' + '\t' + 'gene_id "' + gene_name +'"; transcript_id "' + vData.iloc[j,9] + '";'   + '\n')
-	fg.close()
-				
-
+	plt.xlabel('Bases',fontdict=font)
+	plt.ylabel('Coverage',fontdict=font)
+	plt.savefig('GeneView.pdf')
+	plt.show()
 
 if __name__=='__main__':
-	#Make argument parser
-        parser = argparse.ArgumentParser()
-
-        #Add Arguments
-        parser.add_argument("GeneName",help="The name of the gene whom you wish to view")
-        args= parser.parse_args()
-        if(not os.path.isfile(args.GeneName + ".fasta")):
-                print("No fasta file for gene/cluster of interest\n")
-                sys.exit()
-        if(not os.path.isfile("SuperDuper.fasta")):
-                print("No fasta file for SuperTranscript\n")
-                sys.exit()
-        if(not os.path.isfile("SuperDuper.gff")):
-                print("No annotation file for SuperTranscript\n")
-                sys.exit()
-
-        Visualise(args.GeneName)
+	if(len(sys.argv) != 2):
+		print("Visualisation function requires one argument\n")
+		print("The gene whose super transcripts you wish to visualise\n")
+	else:
+		#Check all the super files are there
+		if(not os.path.isfile(sys.argv[1] + ".fasta")):
+			print("No fasta file for gene/cluster of interest\n")
+			sys.exit()
+		if(not os.path.isfile("SuperDuper.fasta")):
+			print("No fasta file for SuperTranscript\n")
+			sys.exit()
+		if(not os.path.isfile("SuperDuper.gff")):
+			print("No annotation file for SuperTranscript\n")
+			sys.exit()
+			
+
+		Visualise(sys.argv[1])