-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfoldseek2tree.py
More file actions
270 lines (212 loc) · 7.28 KB
/
foldseek2tree.py
File metadata and controls
270 lines (212 loc) · 7.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#foldssek cluster
import subprocess ,shlex
import numpy as np
from scipy.spatial.distance import cdist
import statsmodels
import toytree
import pandas as pd
import re
import os
from scipy.stats import chi2
import argparse
def consensustree(treelist):
'''get a consensus tree from a list of tree files
Parameters
----------
treelist : list
list of tree files
'''
#creat a multitree object from the list of tree files
treelist = [toytree.tree(i , format = 0 ) for i in treelist]
mt = toytree.mtree(treelist)
#get the consensus tree
ct = mt.get_consensus_tree( )
return ct
#smooth distmat with MDS
def MDS_smooth(distmat):
mds = MDS(n_components=int(distmat.shape[0]/2) )#, dissimilarity='precomputed' )
distmat = mds.fit_transform(1-distmat )
distmat = cdist(distmat,distmat, 'minkowski', p=1.5 )
return distmat
def Tajima_dist( kn_ratio,bfactor=19/20, iter = 100 ):
taj = np.add.reduce([ (kn_ratio**(np.ones(kn_ratio.shape)*i) )/ (bfactor**(i-1)*i) for i in range(1,iter) ] )
#set diagonal to 0
np.fill_diagonal(taj, 0)
return taj
def simple_logdist( kn_ratio,bfactor=19/20, iter = 100 ):
d = -bfactor* np.log(1- ( kn_ratio / bfactor ) )
#set diagonal to 0
np.fill_diagonal(d, 0)
return d
def runargs(args):
'''run a command line command
Parameters
----------
args : str
command line command
'''
args = shlex.split( args)
p = subprocess.run( args )
return p
def runFoldseekdb(folder , outfolder , foldseekpath = '../foldseek/bin/foldseek'):
'''run foldseek createdb
parameters
----------
folder : str
path to folder with pdb files
outfolder : str
path to output folder
'''
args = foldseekpath + ' createdb '+ folder + ' '+ outfolder+'structblobDB '
p = runargs(args)
return outfolder+'structblobDB '
def runFoldseek_allvall( structfolder , outfolder , foldseekpath = '../foldseek/bin/foldseek' , maxseqs = 3000):
'''
run foldseek search and createtsv
parameters
----------
dbpath : str
path to foldseek database
outfolder : str
path to output folder
maxseqs : int
maximum number of sequences to compare to
'''
foldseekpath + " easy-search " + structfolder + " "+ structfolder +" "+ outfolder+"/allvall.csv " + structfolder+"/tmp --format-output 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,lddtfull,alntmscore' --exhaustive-search --alignment-type 2"
return outfolder +'aln_score.tsv'
def runFoldseek_allvall_EZsearch(infolder , outpath , foldseekpath = '../foldseek/bin/foldseek'):
'''
run foldseek easy-search
parameters
----------
infolder : str
path to folder with pdb files
outpath : str
path to output folder
foldseekpath : str
path to foldseek binary
'''
args = foldseekpath + ' easy-search ' + infolder + ' ' + infolder +' '+ outpath + " tmp --format-output 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,lddtfull,alntmscore' --exhaustive-search "
p = runargs(args)
return outpath
def kernelfun(AA,BB, AB):
return AA + BB - 2*AB
def runFastme( fastmepath , clusterfile ):
'''run fastme
parameters
----------
fastmepath : str
path to fastme binary
clusterfile : str
path to all vs all distance matrix in fastme format
'''
args = fastmepath + ' -i ' + clusterfile + ' -o ' + clusterfile+'_tree.txt -n '
p = runargs(args)
return clusterfile+'_tree.txt'
def runQuicktree( clusterfile , quicktreepath= 'quicktree' ):
'''
run quicktree
parameters
----------
clusterfile : str
path to all vs all distance matrix in fastme format
quicktreepath : str
path to quicktree binary
'''
args = quicktreepath + ' -i m ' + clusterfile +' > ' + clusterfile + '.struct_tree.nwk'
p = runargs(args)
return clusterfile + '.struct_tree.nwk'
def distmat_to_txt( identifiers , distmat, outfile):
'''
write out a distance matrix in fastme format
Parameters
----------
identifiers : list
list of identifiers for your proteins
distmat : np.array
distance matrix
outfile : str
path to output file
'''
#write out distmat in phylip compatible format
outstr = str(len(identifiers)) + '\n'
for i,pdb in enumerate(identifiers):
outstr += pdb + ' ' + ' '.join( [ "{:.4f}".format(d) for d in list( distmat[i,: ] ) ] ) + '\n'
with open(outfile , 'w') as handle:
handle.write(outstr)
handle.close()
return outfile
def postprocess(t, outree, delta=0 ):
'''
postprocess a tree to make sure all branch lengths are positive
Parameters
----------
t : str
path to tree file
delta : float
small number to replace negative branch lengths with'''
#make negative branch lengths a small delta instead
with open(t) as treein:
treestr = ' '.join( [ i.strip() for i in treein ] )
tre = toytree.tree(treestr )
print(tre)
for n in tre.treenode.traverse():
if n.dist< 0:
n.dist = delta
tre.write( outree, tree_format = 0 )
return outree
def read_dbfiles3di(folder, name = 'outdb'):
#find positions
threeDidb = folder+ name +'_ss'
threeDiseq = [ l.strip().replace('\x00','') for l in open(threeDidb)]
lookup = folder+name+'.lookup'
ids = [ l.split()[1].strip() for l in open(lookup)]
AADB = folder+name
AAs = [ l.strip().replace('\x00','') for l in open(AADB)]
mapper3di = dict(zip(ids,threeDiseq))
mapperAA = dict(zip(ids,AAs))
return mapper3di, mapperAA
def structblob2tree(input_folder, outfolder, overwrite = False, fastmepath = 'fastme', quicktreepath = 'quicktree' , foldseekpath = '../foldseek/foldseek' , delta = 0.0001):
'''run structblob pipeline for a folder of pdb files without snakemake
Parameters
----------
input_folder : str
path to folder with pdb files
logfolder : str
path to output folder
'''
#check if the foldseek output is already there
if os.path.exists(outfolder + 'res.m8') and overwrite == False:
print('found foldseek output, skipping foldseek')
alnres = outfolder + 'res.m8'
else:
alnres = runFoldseek_allvall_EZsearch(input_folder , outfolder + 'res.m8', foldseekpath = foldseekpath)
res = pd.read_table(alnres , header = None )
res[0] = res[0].map(lambda x :x.replace('.pdb', ''))
res[1] = res[1].map(lambda x :x.replace('.pdb', ''))
res.columns = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,lddtfull,alntmscore'.split(',')
ids = list( set(list(res['query'].unique()) + list(res['target'].unique())))
pos = { protid : i for i,protid in enumerate(ids)}
kernels = ['fident', 'alntmscore', 'lddt']
matrices = { k:np.zeros((len(pos), len(pos))) for k in kernels }
print(res)
#calc kernel for tm, aln score, lddt
for idx,row in res.iterrows():
for k in matrices:
matrices[k][pos[row['query']] , pos[row['target']]] += row[k]
matrices[k][pos[row['target']] , pos[row['query']]] += row[k]
trees = {}
for i,k in enumerate(matrices):
matrices[k] /= 2
matrices[k] = 1-matrices[k]
if k == 'fident':
distmat = Tajima_dist( matrices[k] , bfactor = .93 , iter = 100 )
else:
distmat = Tajima_dist( matrices[k] )
print(distmat, np.amax(distmat), np.amin(distmat) )
np.save( outfolder + k + '_distmat.npy' , distmat)
distmat_txt = distmat_to_txt( ids , distmat , outfolder + k + '_distmat.txt' )
out_tree = runFastme( fastmepath = fastmepath , clusterfile = distmat_txt )
out_tree = postprocess(out_tree, out_tree+'.pp.nwk' , delta = delta)
trees[k] = out_tree
return alnres, trees