Skip to content

Commit 78b2750

Browse files
committed
Add synthetic datasets, bmDCA parameters, plmDCA and MSA Tr scores, and notebook for loading them
1 parent c217728 commit 78b2750

File tree

170 files changed

+1057
-2793666
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

170 files changed

+1057
-2793666
lines changed

.gitattributes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
data/*.txt filter=lfs diff=lfs merge=lfs -text
2+
data/*.fasta filter=lfs diff=lfs merge=lfs -text
3+
data/**/*.txt filter=lfs diff=lfs merge=lfs -text
4+
data/**/*.fasta filter=lfs diff=lfs merge=lfs -text

Contact_prediction_phylogeny.ipynb

Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "821893c3-2096-413b-ba54-1863f8429bae",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pathlib\n",
11+
"import os\n",
12+
"import warnings\n",
13+
"\n",
14+
"from Bio.PDB import *\n",
15+
"from Bio.PDB.PDBExceptions import PDBConstructionWarning\n",
16+
"\n",
17+
"from utils import *\n",
18+
"\n",
19+
"import swalign"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": null,
25+
"id": "13c3d038-ba73-465e-9c9a-ea07d3cc9841",
26+
"metadata": {},
27+
"outputs": [],
28+
"source": [
29+
"# Folder containing bmDCA parameters used to generate the synthetic MSAs without or with phylogeny\n",
30+
"bmDCA_PARAMETERS_DIR = pathlib.Path(\"data/Synthetic_data/bmDCA_parameters\")\n",
31+
"\n",
32+
"# Folders containing plmDCA coupling scores inferred from the synthetic MSAs\n",
33+
"plmDCA_EQ_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/equilibrium/coupling_scores/plmDCA_inferred\")\n",
34+
"plmDCA_TREE_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/tree/coupling_scores/plmDCA_inferred\")\n",
35+
"\n",
36+
"# Folders containing MSA Transformer coupling scores inferred from the synthetic MSAs\n",
37+
"MSA_TR_EQ_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/equilibrium/coupling_scores/MSA_Transformer_inferred\")\n",
38+
"MSA_TR_TREE_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/tree/coupling_scores/MSA_Transformer_inferred\")\n",
39+
"\n",
40+
"# Folder to host PDB structures\n",
41+
"PDB_DIR = pathlib.Path(\"data/PDB_structures\")\n",
42+
"if not PDB_DIR.exists():\n",
43+
" os.mkdir(PDB_DIR)"
44+
]
45+
},
46+
{
47+
"cell_type": "markdown",
48+
"id": "feb5e97e-4ac0-414d-bf11-16a01576e257",
49+
"metadata": {},
50+
"source": [
51+
"# MSA data dictionary\n",
52+
"\n",
53+
"``\"pfam_seq\"`` is the sequence from the Pfam full MSA corresponding to the PDB structure with ID ``\"pdb_id\"``. "
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": null,
59+
"id": "fbb3c61f-125c-4b9f-ba94-cc9f2013b1ee",
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"msa_data = {\n",
64+
" \"PF00004\": {\n",
65+
" \"pdb_id\": \"4d81\",\n",
66+
" \"chain_id\": \"A\",\n",
67+
" \"pfam_seq\": \"ILLYGPPGCGKTMIAAAVANELDSEFIHVDAASIMSKWLGEAEKNVAKIFKTARELSKPAIIFIDELDALLASY-TSEVGGEARVRNQFLKEMDGLADKISKVYVIGATNKPWRLDEPFL-RRFQKRIYIT-\"\n",
68+
" },\n",
69+
" \"PF00005\": {\n",
70+
" \"pdb_id\": \"1l7v\",\n",
71+
" \"chain_id\": \"C\",\n",
72+
" \"pfam_seq\": \"--PLSGEVRAGEILHLVGPNGAGKSTLLARMAGMTS-GKGSIQFAGQPLEAWSATKLALHRAYLSQQQTPPFAMPVWHYQHDKTRTELLNDVAGALALDDKLGRSTNQLSGGEWQRVRLAAVVLQAGQLLLLDEPMN\"\n",
73+
" },\n",
74+
" \"PF00041\": {\n",
75+
" \"pdb_id\": \"3up1\",\n",
76+
" \"chain_id\": \"A\",\n",
77+
" \"pfam_seq\": \"-APFDLSVVYRGANDFVVTFNTSHKKYVKVLMHDVAYRQEKDENKWTHVNLSSTKLTLLQRKLQPAAMYEIKVRSIPDHYKGFWS\"\n",
78+
" },\n",
79+
" \"PF00072\": {\n",
80+
" \"pdb_id\": \"3ilh\",\n",
81+
" \"chain_id\": \"A\",\n",
82+
" \"pfam_seq\": \"VLLIDDDDIVNFLNTTIIRTHRVEEIQSVTSGNAAINKLNELYPSIICIDINMPGINGWELIDLFKQHFNKSIVCLLSSSLDPRDQAKAEASDVDYYVSKPLTANALN----\"\n",
83+
" },\n",
84+
" \"PF00076\": {\n",
85+
" \"pdb_id\": \"3nnh\",\n",
86+
" \"chain_id\": \"A\",\n",
87+
" \"pfam_seq\": \"-FVGQVPRTWSEKDLRELFEQYGAVYEINVLRDNPPQSKGCCFVTFYTRKAALEAQNALHNMKV-----\"\n",
88+
" },\n",
89+
" \"PF00096\": {\n",
90+
" \"pdb_id\": \"4r2a\",\n",
91+
" \"chain_id\": \"A\",\n",
92+
" \"pfam_seq\": \"YACPSCDRRFSRSDELTRHIRIH\"\n",
93+
" },\n",
94+
" \"PF00153\": {\n",
95+
" \"pdb_id\": \"1okc\",\n",
96+
" \"chain_id\": \"A\",\n",
97+
" \"pfam_seq\": \"--RYFAGNLASGGAAGATSLCFVYPLDFARTRLAADVGKGAQREFTGLGNCITKIFKSDGLRGLQGFNSVQGIIIYRAAYGVYDTAKGMLP---\"\n",
98+
" },\n",
99+
" \"PF00512\": {\n",
100+
" \"pdb_id\": \"3dge\",\n",
101+
" \"chain_id\": \"A\",\n",
102+
" \"pfam_seq\": \"MKTEFIANISHERTPLTAIKAYAETIYNSELDLSTLKEFLEVIIDQSNHLENLLNELLDFSRLE--\"\n",
103+
" },\n",
104+
" \"PF00595\": {\n",
105+
" \"pdb_id\": \"1be9\",\n",
106+
" \"chain_id\": \"A\",\n",
107+
" \"pfam_seq\": \"-IVIHR-GSTGLGFNIVGGEDGE---GIFISFILAGGPADLSGLRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTII--\"\n",
108+
" },\n",
109+
" \"PF01535\": {\n",
110+
" \"pdb_id\": \"4m57\",\n",
111+
" \"chain_id\": \"A\",\n",
112+
" \"pfam_seq\": \"VTYHTLVGGYSSLEMFSEAREVIGYMVQHGL\"\n",
113+
" },\n",
114+
" \"PF02518\": {\n",
115+
" \"pdb_id\": \"3g7e\",\n",
116+
" \"chain_id\": \"A\",\n",
117+
" \"pfam_seq\": \"-DGTGLHHMVFEVVDNAIDAGHCKEIIVTIH---ADNSVSVQDDGRGIPTGIHPHAGGKFDD-NSYKVSGGLHGVGVSVVNALSQKLELVIQRGETEKTGTMVRFWPSLE-\"\n",
118+
" },\n",
119+
" \"PF07679\": {\n",
120+
" \"pdb_id\": \"1fhg\",\n",
121+
" \"chain_id\": \"A\",\n",
122+
" \"pfam_seq\": \"PYFTKTILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEGNCSLTISEVCGDDDAKYTCKAVNSLGEATCTAELLV\"\n",
123+
" },\n",
124+
" \"PF00271\": {\n",
125+
" \"pdb_id\": \"3ex7\",\n",
126+
" \"chain_id\": \"C\",\n",
127+
" \"pfam_seq\": \"-KFDTLCDLY-DTLTITQAVIFCNTKRKVDWTEKMREA-NFTVSSMHGDMPQKERESIMKEFRSGASRVLISTDVWARGLDVPQVSLIINYDLPNNRELYIHRIGRSGRYG\"\n",
128+
" },\n",
129+
" \"PF00397\": {\n",
130+
" \"pdb_id\": \"4rex\",\n",
131+
" \"chain_id\": \"A\",\n",
132+
" \"pfam_seq\": \"LPAGWEMAKTSS-GQRYFLNHIDQTTTWQDP\"\n",
133+
" },\n",
134+
" \"PF13354\": {\n",
135+
" \"pdb_id\": \"6qw8\",\n",
136+
" \"chain_id\": \"A\",\n",
137+
" \"pfam_seq\": \"--DNSQILYRADERFAMCSTSKVMAAAAVLKKSESENLLNQRVEIKKSDLVNYNPIAEKHVNGTMSLAESAAALQYSDNVAMNKLIAHVGPASVTAFARQLGDETFRLDRTEPTLNAIPGDPRDTTSPRAMAQTLRNLTLGKALGDSLVTWMKNTTGAASIQAGLPAWVVGDKTGSGYGTTNDIAVIWPDRAPLILV-\"\n",
138+
" }\n",
139+
"}"
140+
]
141+
},
142+
{
143+
"cell_type": "markdown",
144+
"id": "3e68e145-6515-4509-88c9-4f648b305d4c",
145+
"metadata": {},
146+
"source": [
147+
"# Align with PDB data"
148+
]
149+
},
150+
{
151+
"cell_type": "code",
152+
"execution_count": null,
153+
"id": "9d7b5662-67a3-491c-b0a6-430d3c6359a7",
154+
"metadata": {},
155+
"outputs": [],
156+
"source": [
157+
"match = 2\n",
158+
"mismatch = -2\n",
159+
"gap_penalty = -2\n",
160+
"\n",
161+
"scoring = swalign.IdentityScoringMatrix(match, mismatch)\n",
162+
"sw = swalign.LocalAlignment(scoring, gap_penalty=gap_penalty)"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"id": "cfcdcdce-f990-46f5-add0-99ff2ceb7ae6",
169+
"metadata": {},
170+
"outputs": [],
171+
"source": [
172+
"idxs_chains = {}\n",
173+
"idxs_pfam_seqs = {}\n",
174+
"dist_mat = {}\n",
175+
"\n",
176+
"for msa_name in msa_data:\n",
177+
" print(msa_name)\n",
178+
" pdb_id = msa_data[msa_name][\"pdb_id\"]\n",
179+
" chain_id = msa_data[msa_name][\"chain_id\"]\n",
180+
"\n",
181+
" # Download and parse structure\n",
182+
" pdbl = PDBList()\n",
183+
" pdbl.retrieve_pdb_file(pdb_id,\n",
184+
" pdir=PDB_DIR,\n",
185+
" file_format=\"mmCif\")\n",
186+
" pdb_parser = MMCIFParser()\n",
187+
" with warnings.catch_warnings():\n",
188+
" warnings.simplefilter(\"ignore\", category=PDBConstructionWarning)\n",
189+
" chain = pdb_parser.get_structure(pdb_id, f\"{PDB_DIR}/{pdb_id}.cif\")[0][chain_id]\n",
190+
" # Convert to one-letter encoding\n",
191+
" pdb_seq = to_one_letter_seq(chain)\n",
192+
" pfam_seq = msa_data[msa_name][\"pfam_seq\"]\n",
193+
" print(f\"Ref: {pdb_seq = }\")\n",
194+
" print(f\"Query: {pfam_seq = }\")\n",
195+
"\n",
196+
" # Align PDB sequence with PFAM sequence\n",
197+
" alignment = sw.align(pdb_seq, pfam_seq)\n",
198+
" alignment.dump()\n",
199+
" # Store matching indices and PDB distance matrix for matching indices\n",
200+
" idxs_chain, idxs_pfam_seq = indices_in_ref_and_query(alignment)\n",
201+
" idxs_chains[msa_name] = idxs_chain\n",
202+
" idxs_pfam_seqs[msa_name] = idxs_pfam_seq\n",
203+
" dist_mat[msa_name] = calc_min_dist_matrix(chain, idxs_chain)\n",
204+
" print()"
205+
]
206+
},
207+
{
208+
"cell_type": "markdown",
209+
"id": "a0606a56-31a0-4af5-9b3d-7bb3bc889ea3",
210+
"metadata": {},
211+
"source": [
212+
"# Compute ground truth (bmDCA) coupling scores for the synthetic MSAs"
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": null,
218+
"id": "77e7f676-e903-4a84-b8b1-8239a129bdb0",
219+
"metadata": {},
220+
"outputs": [],
221+
"source": [
222+
"bmDCA_scores = {}\n",
223+
"for pfam_family in msa_data:\n",
224+
" bmDCA_scores[pfam_family] = {}\n",
225+
" _, J2 = read_bmDCA_parameters(bmDCA_PARAMETERS_DIR / f\"{pfam_family}.txt\")\n",
226+
" idx_subset = np.asarray(idxs_pfam_seqs[pfam_family]) # Restrict to sites matching with the PDB\n",
227+
" bmDCA_scores[pfam_family] = zero_sum_gauge_frob_scores(J2)[idx_subset, :][:, idx_subset] # Use APC (default)"
228+
]
229+
},
230+
{
231+
"cell_type": "markdown",
232+
"id": "c16aa2fe-178e-4fdb-8829-30e8d9522bec",
233+
"metadata": {},
234+
"source": [
235+
"# Read in plmDCA scores inferred from the synthetic MSAs"
236+
]
237+
},
238+
{
239+
"cell_type": "code",
240+
"execution_count": null,
241+
"id": "d6a02358-8da8-445c-ad43-72916189da8e",
242+
"metadata": {},
243+
"outputs": [],
244+
"source": [
245+
"plmDCA_equilibrium_scores = {}\n",
246+
"plmDCA_tree_scores = {}\n",
247+
"for pfam_family in msa_data:\n",
248+
" length = len(msa_data[pfam_family][\"pfam_seq\"])\n",
249+
" for dic, path in zip([plmDCA_equilibrium_scores, plmDCA_tree_scores],\n",
250+
" [plmDCA_EQ_INFERRED_SCORES_DIR, plmDCA_TREE_INFERRED_SCORES_DIR]):\n",
251+
" scores = np.loadtxt(path / f\"{pfam_family}.txt\")\n",
252+
" scores[:, :2] -= 1 # Convert 1-based indexing (Julia) to 0-based indexing (Python)\n",
253+
" mat = np.zeros((length, length), dtype=np.float64) # Initialize scores matrix\n",
254+
" mat[tuple(scores[:, :2].astype(int).T)] = scores[:, 2] # Populate scores matrix\n",
255+
" mat += mat.T # Symmetrize scores matrix\n",
256+
" idx_subset = np.asarray(idxs_pfam_seqs[pfam_family]) # Restrict to sites matching with the PDB\n",
257+
" mat = mat[idx_subset, :][:, idx_subset]\n",
258+
" dic[pfam_family] = mat"
259+
]
260+
},
261+
{
262+
"cell_type": "markdown",
263+
"id": "aa903d8a-b35c-40c1-bb31-7a911b523e29",
264+
"metadata": {},
265+
"source": [
266+
"# Read in MSA Transformer scores inferred from the synthetic MSAs\n",
267+
"\n",
268+
"These scores were obtained from each synthetic MSA by computing contact probabilities (scores) according to MSA Transformer [(Rao et al, 2021)](https://proceedings.mlr.press/v139/rao21a.html) from 10 sub-MSAs defined as the rows with labels 0-9 `in ``data/Synthetic_data/MSA_Transformer_subsample_labels``, and then averaging the resulting 10 score matrices."
269+
]
270+
},
271+
{
272+
"cell_type": "code",
273+
"execution_count": null,
274+
"id": "668a6bec-6a33-4d42-98db-d72f8b155117",
275+
"metadata": {},
276+
"outputs": [],
277+
"source": [
278+
"MSA_Tr_equilibrium_scores = {}\n",
279+
"MSA_Tr_tree_scores = {}\n",
280+
"for pfam_family in msa_data:\n",
281+
" for dic, path in zip([MSA_Tr_equilibrium_scores, MSA_Tr_tree_scores],\n",
282+
" [MSA_TR_EQ_INFERRED_SCORES_DIR, MSA_TR_TREE_INFERRED_SCORES_DIR]):\n",
283+
" mat = np.loadtxt(path / f\"{pfam_family}.txt\")\n",
284+
" idx_subset = np.asarray(idxs_pfam_seqs[pfam_family]) # Restrict to sites matching with the PDB\n",
285+
" mat = mat[idx_subset, :][:, idx_subset]\n",
286+
" dic[pfam_family] = mat"
287+
]
288+
},
289+
{
290+
"cell_type": "code",
291+
"execution_count": null,
292+
"id": "d2eb4368-fd4f-49be-8b07-91e665862c10",
293+
"metadata": {},
294+
"outputs": [],
295+
"source": [
296+
"# Exclude PF00096 from the analysis as it is too short\n",
297+
"msa_names_long = [msa_name for msa_name in msa_data if msa_name != \"PF00096\"]\n",
298+
"print(f\"{msa_names_long = }\")"
299+
]
300+
}
301+
],
302+
"metadata": {
303+
"kernelspec": {
304+
"display_name": "Python 3 (ipykernel)",
305+
"language": "python",
306+
"name": "python3"
307+
},
308+
"language_info": {
309+
"codemirror_mode": {
310+
"name": "ipython",
311+
"version": 3
312+
},
313+
"file_extension": ".py",
314+
"mimetype": "text/x-python",
315+
"name": "python",
316+
"nbconvert_exporter": "python",
317+
"pygments_lexer": "ipython3",
318+
"version": "3.9.12"
319+
}
320+
},
321+
"nbformat": 4,
322+
"nbformat_minor": 5
323+
}

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ In order to run the notebooks, the following python packages are required:
2424
- matplotlib
2525
- statsmodels
2626
- biopython
27+
- swalign
2728
- esm==0.4.0
2829

2930
``prody`` and ``HMMER`` are required to run the Python script ``data/Pfam_Seed/fetch_seed_MSAs.py``, if you wish to create new

0 commit comments

Comments
 (0)