corecut/corecut.py at main · DessimozLab/corecut · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
import Bio.PDB
import numpy as np
import os
import tqdm

def extract_core(resdf , outfile,  hitthresh = .8 ,minthresh = .6, corefolder = 'core_structs/' , structfolder = 'structs/' , cterfolder = 'cter_structs/' , nterfolder = 'nter_structs/' ):
	"""

	Extract a core of structures from a results file

	Parameters
		resdf: path to results file
		outfile: path to output file
		hitthresh: proportion of structures that need to map to a residue for it to be included in the core
		minthresh: if no residues meet the hitthresh, the minimum proportion of structures that need to map to a residue for it to be included in the core
		corefolder: name of folder to output core structures to
		structfolder: name of folder to find structures in
		cterfolder: name of folder to find cter structures in
		nterfolder: name of folder to find nter structures in


	"""

	#read all results
	folder =''.join([ sub + '/' for sub in resdf.split('/')[:-1] ])
	print(folder)
	resdf = pd.read_table(resdf, header = None)
	resdf.columns = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,lddtfull,alntmscore'.split(',')

	print('extracting core')
	print('hitthresh: ' + str(hitthresh))
	print('minthresh: ' + str(minthresh))

	print(resdf.head(), len(resdf['query'].unique()))
	#map hits to each struc
	hits = {}
	nqueries = len(resdf['query'].unique())
	#proportion of structures that need to map to a residue fo
	with tqdm.tqdm(total=len(resdf['query'].unique())) as pbar:
		for i,q in enumerate(resdf['query'].unique()):
			sub = resdf[resdf['query'] == q]
			hitvec = np.zeros((1,max(sub['qend'])) )
			for idx,r in sub.iterrows():
				hitvec[0,r['qstart']:r['qend']] = hitvec[0,r['qstart']:r['qend']]+1
			hitvec /= nqueries
			core = np.where(hitvec>hitthresh)[1]
			try:
				hits[q]= { 'min': np.amin(core), 'max': np.amax(core) }
			except:
				#be more lenient...
				subthresh = np.amax(hitvec)
				print(hitvec, sub, 'be careful, non homologous sequences may have enterred the dataset!')
				if subthresh>=minthresh:
					print('new core threst set at ' + str(subthresh) )
					core = np.where(hitvec>=subthresh)[1]
					hits[q]= { 'min': np.amin(core), 'max': np.amax(core)}
					print(q , 'added')
				else:
					print(q , 'rejected')
			pbar.set_description('processed: %d' % (1 + i))
			pbar.update(1)
	#make core struct folder
	os.makedirs(folder+corefolder , exist_ok = True)
	os.makedirs(folder+cterfolder , exist_ok = True)
	os.makedirs(folder+nterfolder , exist_ok = True)
	#parse each struct and output core to folder
	parser = Bio.PDB.PDBParser()
	with tqdm.tqdm(total=len(hits)) as pbar:
		for i,q in enumerate(hits):
			if not q.endswith('.pdb'):
				qfile = q + '.pdb'
			else:
				qfile = q
			if '/' in q:
				qfile = qfile.split('/')[-1]

			struct = parser.get_structure(q.split('.')[0], folder+structfolder+qfile )

			#zero based indexing...
			struct_core = Bio.PDB.Dice.extract( struct ,'A' , hits[q]['min']+1 , hits[q]['max']+1 ,folder+corefolder+qfile  )

			struct_core = Bio.PDB.Dice.extract( struct ,'A' , 0, hits[q]['min']+1  ,folder+nterfolder+q  )
			#select from max to end
			struct_core = Bio.PDB.Dice.extract( struct ,'A' , hits[q]['max']+1 , len(struct[0]['A'])  ,folder+cterfolder+qfile  )

			hits[q]['len'] = [ len(chain) for model in struct for chain in model ][0]

			pbar.set_description('processed: %d' % (1 + i))
			pbar.update(1)


	hitsdf = pd.DataFrame.from_dict( hits , orient='index'  )
	hitsdf.to_csv(outfile)
	return folder +'struct_cores.csv'