Tidying up comments

Quarkins · Quarkins · commit 8841deee0737 · 2016-09-28T09:07:08.000+10:00
diff --git a/BuildSuperTranscript.py b/BuildSuperTranscript.py
@@ -1,4 +1,6 @@
-#A quick little first pass script to read a BLAT output and then:
+# Author: Anthony Hawkins
+
+#A script to construct a SuperTranscript given a .fasta file of transcript sequence
 # 1) Determine block sequences
 # 2) Construct graph structure that stores each block with eges detailing how blocks are connect within transcripts
 # 3) Sort blocks from the graph into topological order
@@ -16,14 +18,12 @@
 sys.setrecursionlimit(10000)
 
 
-	#Define a function to be used recursively to check for each succesor node whether it only has one in or out
-
+#Define a function to be used recursively to check for each successor node whether it only has one in or out
 def successor_check(graph,n,tmerge):
 	ess = [node for node in graph.successors(n)] #Get list of succesors
 
 	#Check for all successors
 	for s in ess:
-		#if(len(graph.out_edges([s])) == 1 and len(graph.in_edges([s])) == 1): #I.e. if only one outgoing edge and one incoming edge it belongs to same block
 		if(len(graph.in_edges([s])) <= 1 and len(ess) <= 1): #Succesor node only has one incoming path and is the only option for the previous node
 			tmerge.append(s)
 			successor_check(graph,s,tmerge)
@@ -143,7 +143,7 @@ def filt_dir(table):
 
 	
 
-
+#Main function to produce a SuperTranscript
 def SuperTran(fname,verbose=False):
 	
 	#Start Clock for timing
@@ -173,7 +173,7 @@ def SuperTran(fname,verbose=False):
 	transcript_status = len(transcripts)
 	whirl_status = 0
 
-	#If there is only one transcript in this file, then simply that transcript is the super transcript
+	#If there is only one transcript in this file, then simply that transcript is the super transcript...
 	if(len(transcripts) == 1):
 		if(verbose): print("One\n") 
 		seq = next(iter(transcripts.values())) #Python 3 specific codee...
@@ -184,7 +184,7 @@ def SuperTran(fname,verbose=False):
 		try:
 			seq, anno, whirl_status  = BuildGraph(fname,transcripts,verbose)
 
-		except: #Graph building failed, just take longest transcript or (concatenate all transcripts)
+		except: #Graph building failed, just take longest transcript (or concatenate all transcripts)
 			temp = 0
 			seq = ''
 			print('FAILED to construct')
@@ -261,14 +261,6 @@ def BuildGraph(fname,transcripts,verbose=False):
 
 	
 	for i in range(0,len(bData)):
-
-		#Check explicitly that there are no gaps - OBS these "gaps" are actually gaps between blat blocks and not actually gaps within blat blocks as i initially thought
-		#if( bData.iloc[i,4] > 0 or bData.iloc[i,6] > 0):
-		#	continue
-	
-		#Don't allign the transcripts against each other twice...
-        	#I.e. BLAT does T1 vs T2 but also T2 vs T1 (which should be the same give or take)
-
 		
 		#Extract the info
 		seq=list(transcripts[bData.iloc[i,9]]) #Get sequence from query name
@@ -283,10 +275,6 @@ def BuildGraph(fname,transcripts,verbose=False):
 			qStart.append(int(qStarts[j]))
 			qName.append(bData.iloc[i,9])
 
-		#OBS
-		#For now assume that there is no contradiction in the directionality (i.e. all blocks in psl are consistent with each contig being in the defined direction relative to each pair)
-		#if(trandir[bData.iloc[i,9]] == '-' and trandir[bData.iloc[i,13]] == '-'	
-
 	if(verbose): print("Constructing and merging nodes in graphs based on Blocks and Transcripts")
 
 
@@ -455,13 +443,6 @@ def BuildGraph(fname,transcripts,verbose=False):
 			M_node = None
 			Multi = 100000000
 
-			#Find Highest multiplicity node in loop to use for breaking point of cycle
-			#for node in whirl:
-			#	temp = len(C.out_edges([node])) + len(C.out_edges([node]))
-			#	if(temp >= Multi):
-			#		Multi = temp
-			#		M_node = node
-
                         #Find the node with smallest sequence and break there (instead of the highest multiplicty)
 			for node in whirl:
 				temp = len(C.node[node]['Base'])
diff --git a/Checker.py b/Checker.py
@@ -1,4 +1,8 @@
+#Author: Anthony Hawkins
+
 #A script to systematically check for each gene whether the SuperTranscript builder worked ok
+# And to produce the alternate annotation for the SuperTranscript
+
 import multiprocessing, logging
 from multiprocessing import Pool
 from multiprocessing import Process
diff --git a/Lace.py b/Lace.py
@@ -1,4 +1,9 @@
+#Author: Anthony Hawkins
+
 #Lacing together different transcripts into a SuperTranscript
+#This is the main script which parallelises making a SuperTranscript for each gene/cluster
+#The main inputs are a .fasta file containing all transcripts in all the genes/clusters you wish to construct
+#and a tab delimited text file of two columns with the mapping of transcripts <-> gene
 
 
 import multiprocessing, logging
@@ -179,10 +184,12 @@ def Split(genome,corsetfile,ncore,maxTran,outdir):
 
 	if(args.alternate):
 		cwd = os.getcwd()
+
 		#Change to output directory
 		os.chdir(args.outputDir)
 		print("Making Alternate Annotation and checks")
 		Checker('SuperDuper.fasta',args.cores)
+
 		#Change back
 		os.chdir(cwd)
 		print('Done')
diff --git a/Mobius.py b/Mobius.py
@@ -1,10 +1,14 @@
-#A little script to construct an annotation from an SJ.out.tab file
+#Author: Anthony Hawkins
+#A little script to construct an annotation from an SJ.out.tab file, a standard STAR output. 
+# This creates the dynamic block annotation
+
 import argparse
 import pandas as pd
 import os, sys
 
 def Mobius(sjfile,gfile,ft,flat_ann):
-    #First read in the SJ.out.tab as sys.argv[1]
+
+    #First read in the SJ.out.tab file (passed in as sjfile)
     sj = pd.read_csv(sjfile,sep='\t',header=None,names=['Gene','Start','End','strand','intron motif','Annotated','Unique','Multi','Overhang'])
 
     #Read in SuperTranscript fasta file
@@ -28,7 +32,7 @@ def Mobius(sjfile,gfile,ft,flat_ann):
     for i in range(0,len(sj['Gene'])):
         curr_gene = sj.iloc[i,0]
         if(curr_gene not in slist.keys()): slist[curr_gene] = [1]
-        if((sj.iloc[i,7] + sj.iloc[i,8]) > 5): #More than 10 reads (either unique or multi spanninf junction)
+        if((sj.iloc[i,7] + sj.iloc[i,8]) > 5): #More than 5 reads (either unique or multi spanning junction)
             slist[curr_gene].append(int(sj.iloc[i,1])) #This is actually the intron start part (i.e one base over)
             slist[curr_gene].append(int(sj.iloc[i,2])+1) #This is the end of the exon-exon junction, so need to add one for the actual exon start place
 
diff --git a/STViewer.py b/STViewer.py
@@ -1,3 +1,4 @@
+#Author: Anthony Hawkins
 #Visualise a given gene in your super transcript
 
 import pandas as pd
@@ -59,22 +60,16 @@ def Visualise(gene_name):
 	gs = gridspec.GridSpec(2,1,height_ratios=[4,1])
 	ax1=plt.subplot(gs[0])
 	accum = 0
-	#for row in range(0,len(gff_data)):
-	#	size = 1 + int(gff_data.iloc[row,4]) - int(gff_data.iloc[row,3]) #1+ for converting co-ordinate systems
-	#	plt.barh(len(transcripts),size,color='#ffc024',left=accum,alpha=0.8)
-	#	accum=accum+size
 
 	plt.barh(len(transcripts),ST_length,color='#ffc024',left=0)
 	plot_dict = {}
 	col_dict = {}
 	labs = []
 
-	#col2=iter(cm.plasma(np.linspace(0,1,len(transcripts))))
 	col2=iter(cm.terrain(np.linspace(0,1,len(transcripts))))
 	for i,key in enumerate(transcripts):
         	plot_dict[key] = i
         	col_dict[key] = next(col2)
-        	#lab = "T"+str(i)
         	lab =""
         	labs.append(lab)
 
@@ -103,7 +98,6 @@ def Visualise(gene_name):
 	width=0.8
 	labs.append('Super')
 	plt.yticks(ind + width/2.,labs,fontsize="medium",fontweight="semibold")
-	#plt.title('Breakdown of Super Transcript')
 	plt.ylabel('Transcripts',fontdict=font)