forked from Oshlack/Lace
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLace.py
More file actions
209 lines (166 loc) · 6.49 KB
/
Lace.py
File metadata and controls
209 lines (166 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#Author: Anthony Hawkins
#Lacing together different transcripts into a SuperTranscript
#This is the main script which parrallelises making a SuperTranscript for each gene/cluster
#The main inputs are .fasta file containing all transcripts in all the genes/cluster you wish to constuct
#and a tab delimited text file of two columns with the mapping of transcripts <-> gene
import argparse
import logging
import multiprocessing
import os
import shutil
import sys
import time
from multiprocessing import Pool
from multiprocessing import Process

from BuildSuperTranscript import SuperTran
from Checker import Checker
def worker(fname):
    """Build the SuperTranscript for a single per-gene fasta file.

    Parameters
    ----------
    fname : str
        Path to the per-gene fasta file to lace together.

    Returns
    -------
    tuple
        (sequence, annotation, whirl_status, transcript_status); empty
        strings and zero statuses when construction fails.
    """
    seq = ''
    ann = ''
    whirl_status = 0
    # Initialise transcript_status as well: the original left it unset, so
    # any failure inside SuperTran caused a NameError on the return line.
    transcript_status = 0
    try:
        seq, ann, whirl_status, transcript_status = SuperTran(fname)
    except Exception:
        # Best-effort per-gene build: report the failing gene and fall
        # through with the default (empty) result.
        print("Failed:", fname)
    return seq, ann, whirl_status, transcript_status
def Clean(corsetfile, outdir):
    """Move all per-gene .fasta and .psl files into <outdir>/SuperFiles.

    A little tidy-up step run after building, so the working directory is
    not littered with intermediate files.

    Parameters
    ----------
    corsetfile : str
        Tab-delimited transcript <-> cluster mapping (cluster in column 2).
    outdir : str
        Directory holding the per-gene files produced by Split().
    """
    # Make tidy directory (no-op if it already exists).
    superdir = os.path.join(outdir, 'SuperFiles')
    os.makedirs(superdir, exist_ok=True)
    print("Moving all fasta and psl files created to:")
    print(outdir + "/SuperFiles")
    # Collect the unique cluster names from the corset mapping.
    # NOTE: the original rstrip('/n') stripped literal '/' and 'n'
    # characters, corrupting cluster names that end in 'n'.
    clusters = []
    with open(corsetfile, 'r') as corse:
        for line in corse:
            clust = line.split()[1].rstrip('\n')
            if clust not in clusters:
                clusters.append(clust)
    # Now move all the fasta and psl files; shutil.move avoids shelling out
    # (robust against cluster names containing shell metacharacters).
    for clust in clusters:
        for ext in ('.fasta', '.psl'):
            src = os.path.join(outdir, clust + ext)
            if os.path.isfile(src):
                shutil.move(src, superdir)
def Split(genome, corsetfile, ncore, maxTran, outdir):
    """Split the transcript fasta into one fasta per gene/cluster, then
    build a SuperTranscript for each cluster in parallel.

    Parameters
    ----------
    genome : str
        Fasta file containing all transcripts.
    corsetfile : str
        Tab-delimited transcript <-> cluster mapping from Corset.
    ncore : int
        Number of worker processes for the build pool.
    maxTran : int
        Maximum number of transcripts taken from any one cluster.
    outdir : str
        Directory where per-gene fastas and the final outputs are written.

    Writes <outdir>/SuperDuper.fasta and <outdir>/SuperDuper.gff.
    """
    start_time = time.time()
    # First create a dictionary from the corset output assigning each
    # transcript to a cluster.
    # Note: Corset throws away transcripts which don't have many reads
    # (> 10) mapped to them, so transcripts absent from the mapping are
    # ignored below.
    cluster = {}
    if os.path.isfile(corsetfile):
        print("Creating dictionary of transcripts in clusters...")
        with open(corsetfile, 'r') as corse:
            for line in corse:
                fields = line.split()
                # rstrip('\n'): the original '/n' stripped literal '/' and
                # 'n' characters and could corrupt cluster names.
                cluster[fields[0]] = fields[1].rstrip('\n')
    # Now loop through the fasta file, bucketing sequences by gene.
    transcripts = {}
    geneid = {}
    if os.path.isfile(genome):
        print("Creating a fasta file per gene...")
        # We only want to include transcripts deemed worthy by Corset.
        tag = None
        with open(genome, 'r') as fT:
            for line in fT:
                if line.startswith('>'):  # Header line: transcript name
                    tag = line.split()[0].lstrip('>')
                    transcripts[tag] = ''
                    # 'None' is the placeholder for transcripts not mapped
                    # to any cluster by Corset.
                    geneid[tag] = cluster.get(tag, 'None')
                elif tag is not None:
                    # Sequence line: drop the trailing newline / carriage return.
                    transcripts[tag] = transcripts[tag] + line.split('\n')[0].split('\r')[0]
    # Make a file for each gene.
    gene_list = set(geneid.values())
    gene_list.discard('None')  # Remove the unclustered-transcript placeholder
    for gene in gene_list:
        # Count transcripts assigned to this cluster (warning purposes only).
        cnt = sum(1 for val in cluster.values() if val == gene)
        if cnt > maxTran:
            print("WARNING: Lace will only take the first " + str(maxTran) + " transcripts since there are too many transcripts in cluster")
        fn = outdir + '/' + gene + '.fasta'
        if os.path.isfile(fn):
            continue  # Already split on a previous run
        with open(fn, 'w') as f:
            ts = 0
            for tag in transcripts:
                if gene == geneid[tag]:
                    if ts == maxTran:
                        break  # Already recorded maxTran transcripts for this gene
                    f.write('>' + tag + '\n')
                    f.write(transcripts[tag] + '\n')
                    ts += 1
    # Now submit Build Super Transcript for each gene in parallel.
    print("Now Building SuperTranscript for each gene...")
    fnames = [outdir + '/' + gene + '.fasta' for gene in gene_list]
    pool = Pool(processes=ncore)
    result = pool.map_async(worker, fnames)
    pool.close()
    pool.join()
    results = result.get()
    # Write the overall Super Duper transcript fasta and its annotation.
    with open(outdir + '/' + 'SuperDuper.fasta', 'w') as superf, \
         open(outdir + '/' + 'SuperDuper.gff', 'w') as supgff:
        for i, clust in enumerate(fnames):
            # Just use the name of the gene, without directory or extension.
            fn = clust.split("/")[-1].split('.fasta')[0]
            superf.write('>' + fn + ' NoTrans:' + str(results[i][3]) + ',Whirls:' + str(results[i][2]) + '\n')
            superf.write(results[i][0] + '\n')
        # Write the Super gff.
        for res in results:
            supgff.write(res[1])
    print("BUILT SUPERTRANSCRIPTS ---- %s seconds ----" % (time.time() - start_time))
if __name__ == '__main__':
    # Banner and version information.
    print(" __ __ ____ ____ ")
    print("( ) / _\ / )( __)")
    print("/ (_/\/ \( (__ ) _) ")
    print("\_____/\_/\_/\_____)(____)")
    print("Lace Version: 0.82")
    print("Last Editted: 30/01/17")
    # Command-line interface.
    cli = argparse.ArgumentParser()
    cli.add_argument("GenomeFile", help="The name of the fasta file containing all transcripts")
    cli.add_argument("ClusterFile", help="The name of the text file with the transcript to cluster mapping")
    cli.add_argument("--cores", help="The number of cores you wish to run the job on (default = 1)", default=1, type=int)
    cli.add_argument("--alternate", "-a", help="Create alternate annotations and create metrics on success of SuperTranscript Building", action='store_true')
    cli.add_argument("--tidy", "-t", help="Move intermediate fasta files into folder: SuperFiles after running", action='store_true')
    cli.add_argument("--maxTran", help="Set a maximum for the number of transcripts from a cluster to be included for building the SuperTranscript (default=50).", default=50, type=int)
    cli.add_argument("--outputDir", "-o", help="Output Directory", default=".")
    opts = cli.parse_args()
    # Make the output directory if it doesn't currently exist.
    if opts.outputDir:
        if not os.path.exists(opts.outputDir):
            print("Creating output directory")
            os.mkdir(opts.outputDir)
        else:
            print("Output directory exists")
    # Split the fasta per gene and build the SuperTranscripts.
    Split(opts.GenomeFile, opts.ClusterFile, opts.cores, opts.maxTran, opts.outputDir)
    if opts.alternate:
        here = os.getcwd()
        os.chdir(opts.outputDir)  # Checker works inside the output directory
        print("Making Alternate Annotation and checks")
        Checker('SuperDuper.fasta', opts.cores)
        os.chdir(here)  # Change back to where we started
        print('Done')
    if opts.tidy:
        print("Storing all extraneous files in SuperFiles")
        Clean(opts.ClusterFile, opts.outputDir)
        print("Done")