forked from Oshlack/Lace
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLace.py
More file actions
209 lines (166 loc) · 6.49 KB
/
Lace.py
File metadata and controls
209 lines (166 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#Author: Anthony Hawkins
#Lacing together different transcripts into a SuperTranscript
#This is the main script which parrallelises making a SuperTranscript for each gene/cluster
#The main inputs are .fasta file containing all transcripts in all the genes/cluster you wish to constuct
#and a tab delimited text file of two columns with the mapping of transcripts <-> gene
import argparse
import logging
import multiprocessing
import os
import shutil
import sys
import time
from multiprocessing import Pool
from multiprocessing import Process

from BuildSuperTranscript import SuperTran
from Checker import Checker
def worker(fname):
    """Build the SuperTranscript for a single per-gene fasta file.

    Parameters
    ----------
    fname : str
        Path to the per-gene fasta file to lace together.

    Returns
    -------
    tuple
        (sequence, annotation, whirl_status, transcript_status); empty
        strings and zero statuses when construction fails.
    """
    seq = ''
    ann = ''
    whirl_status = 0
    # Initialise transcript_status as well: the original left it unset, so
    # any failure inside SuperTran caused a NameError on the return line.
    transcript_status = 0
    try:
        seq, ann, whirl_status, transcript_status = SuperTran(fname)
    except Exception:
        # Best-effort per-gene build: report the failing gene and fall
        # through with the default (empty) result.
        print("Failed:", fname)
    return seq, ann, whirl_status, transcript_status
def Clean(corsetfile, outdir):
    """Move all per-gene .fasta and .psl files into <outdir>/SuperFiles.

    A little tidy-up step run after building, so the working directory is
    not littered with intermediate files.

    Parameters
    ----------
    corsetfile : str
        Tab-delimited transcript <-> cluster mapping (cluster in column 2).
    outdir : str
        Directory holding the per-gene files produced by Split().
    """
    # Make tidy directory (no-op if it already exists).
    superdir = os.path.join(outdir, 'SuperFiles')
    os.makedirs(superdir, exist_ok=True)
    print("Moving all fasta and psl files created to:")
    print(outdir + "/SuperFiles")
    # Collect the unique cluster names from the corset mapping.
    # NOTE: the original rstrip('/n') stripped literal '/' and 'n'
    # characters, corrupting cluster names that end in 'n'.
    clusters = []
    with open(corsetfile, 'r') as corse:
        for line in corse:
            clust = line.split()[1].rstrip('\n')
            if clust not in clusters:
                clusters.append(clust)
    # Now move all the fasta and psl files; shutil.move avoids shelling out
    # (robust against cluster names containing shell metacharacters).
    for clust in clusters:
        for ext in ('.fasta', '.psl'):
            src = os.path.join(outdir, clust + ext)
            if os.path.isfile(src):
                shutil.move(src, superdir)
def Split(genome, corsetfile, ncore, maxTran, outdir):
    """Split the transcript fasta into one fasta per gene/cluster, then
    build a SuperTranscript for each cluster in parallel.

    Parameters
    ----------
    genome : str
        Fasta file containing all transcripts.
    corsetfile : str
        Tab-delimited transcript <-> cluster mapping from Corset.
    ncore : int
        Number of worker processes for the build pool.
    maxTran : int
        Maximum number of transcripts taken from any one cluster.
    outdir : str
        Directory where per-gene fastas and the final outputs are written.

    Writes <outdir>/SuperDuper.fasta and <outdir>/SuperDuper.gff.
    """
    start_time = time.time()
    # First create a dictionary from the corset output assigning each
    # transcript to a cluster.
    # Note: Corset throws away transcripts which don't have many reads
    # (> 10) mapped to them, so transcripts absent from the mapping are
    # ignored below.
    cluster = {}
    if os.path.isfile(corsetfile):
        print("Creating dictionary of transcripts in clusters...")
        with open(corsetfile, 'r') as corse:
            for line in corse:
                fields = line.split()
                # rstrip('\n'): the original '/n' stripped literal '/' and
                # 'n' characters and could corrupt cluster names.
                cluster[fields[0]] = fields[1].rstrip('\n')
    # Now loop through the fasta file, bucketing sequences by gene.
    transcripts = {}
    geneid = {}
    if os.path.isfile(genome):
        print("Creating a fasta file per gene...")
        # We only want to include transcripts deemed worthy by Corset.
        tag = None
        with open(genome, 'r') as fT:
            for line in fT:
                if line.startswith('>'):  # Header line: transcript name
                    tag = line.split()[0].lstrip('>')
                    transcripts[tag] = ''
                    # 'None' is the placeholder for transcripts not mapped
                    # to any cluster by Corset.
                    geneid[tag] = cluster.get(tag, 'None')
                elif tag is not None:
                    # Sequence line: drop the trailing newline / carriage return.
                    transcripts[tag] = transcripts[tag] + line.split('\n')[0].split('\r')[0]
    # Make a file for each gene.
    gene_list = set(geneid.values())
    gene_list.discard('None')  # Remove the unclustered-transcript placeholder
    for gene in gene_list:
        # Count transcripts assigned to this cluster (warning purposes only).
        cnt = sum(1 for val in cluster.values() if val == gene)
        if cnt > maxTran:
            print("WARNING: Lace will only take the first " + str(maxTran) + " transcripts since there are too many transcripts in cluster")
        fn = outdir + '/' + gene + '.fasta'
        if os.path.isfile(fn):
            continue  # Already split on a previous run
        with open(fn, 'w') as f:
            ts = 0
            for tag in transcripts:
                if gene == geneid[tag]:
                    if ts == maxTran:
                        break  # Already recorded maxTran transcripts for this gene
                    f.write('>' + tag + '\n')
                    f.write(transcripts[tag] + '\n')
                    ts += 1
    # Now submit Build Super Transcript for each gene in parallel.
    print("Now Building SuperTranscript for each gene...")
    fnames = [outdir + '/' + gene + '.fasta' for gene in gene_list]
    pool = Pool(processes=ncore)
    result = pool.map_async(worker, fnames)
    pool.close()
    pool.join()
    results = result.get()
    # Write the overall Super Duper transcript fasta and its annotation.
    with open(outdir + '/' + 'SuperDuper.fasta', 'w') as superf, \
         open(outdir + '/' + 'SuperDuper.gff', 'w') as supgff:
        for i, clust in enumerate(fnames):
            # Just use the name of the gene, without directory or extension.
            fn = clust.split("/")[-1].split('.fasta')[0]
            superf.write('>' + fn + ' NoTrans:' + str(results[i][3]) + ',Whirls:' + str(results[i][2]) + '\n')
            superf.write(results[i][0] + '\n')
        # Write the Super gff.
        for res in results:
            supgff.write(res[1])
    print("BUILT SUPERTRANSCRIPTS ---- %s seconds ----" % (time.time() - start_time))
if __name__ == '__main__':
    # Banner and version information.
    print(" __ __ ____ ____ ")
    print("( ) / _\ / )( __)")
    print("/ (_/\/ \( (__ ) _) ")
    print("\_____/\_/\_/\_____)(____)")
    print("Lace Version: 0.82")
    print("Last Editted: 30/01/17")
    # Command-line interface.
    cli = argparse.ArgumentParser()
    cli.add_argument("GenomeFile", help="The name of the fasta file containing all transcripts")
    cli.add_argument("ClusterFile", help="The name of the text file with the transcript to cluster mapping")
    cli.add_argument("--cores", help="The number of cores you wish to run the job on (default = 1)", default=1, type=int)
    cli.add_argument("--alternate", "-a", help="Create alternate annotations and create metrics on success of SuperTranscript Building", action='store_true')
    cli.add_argument("--tidy", "-t", help="Move intermediate fasta files into folder: SuperFiles after running", action='store_true')
    cli.add_argument("--maxTran", help="Set a maximum for the number of transcripts from a cluster to be included for building the SuperTranscript (default=50).", default=50, type=int)
    cli.add_argument("--outputDir", "-o", help="Output Directory", default=".")
    opts = cli.parse_args()
    # Make the output directory if it doesn't currently exist.
    if opts.outputDir:
        if not os.path.exists(opts.outputDir):
            print("Creating output directory")
            os.mkdir(opts.outputDir)
        else:
            print("Output directory exists")
    # Split the fasta per gene and build the SuperTranscripts.
    Split(opts.GenomeFile, opts.ClusterFile, opts.cores, opts.maxTran, opts.outputDir)
    if opts.alternate:
        here = os.getcwd()
        os.chdir(opts.outputDir)  # Checker works inside the output directory
        print("Making Alternate Annotation and checks")
        Checker('SuperDuper.fasta', opts.cores)
        os.chdir(here)  # Change back to where we started
        print('Done')
    if opts.tidy:
        print("Storing all extraneous files in SuperFiles")
        Clean(opts.ClusterFile, opts.outputDir)
        print("Done")