|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": null, |
| 6 | + "id": "821893c3-2096-413b-ba54-1863f8429bae", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "import pathlib\n", |
| 11 | + "import os\n", |
| 12 | + "import warnings\n", |
| 13 | + "\n", |
| 14 | + "from Bio.PDB import *\n", |
| 15 | + "from Bio.PDB.PDBExceptions import PDBConstructionWarning\n", |
| 16 | + "\n", |
| 17 | + "from utils import *\n", |
| 18 | + "\n", |
| 19 | + "import swalign" |
| 20 | + ] |
| 21 | + }, |
| 22 | + { |
| 23 | + "cell_type": "code", |
| 24 | + "execution_count": null, |
| 25 | + "id": "13c3d038-ba73-465e-9c9a-ea07d3cc9841", |
| 26 | + "metadata": {}, |
| 27 | + "outputs": [], |
| 28 | + "source": [ |
| 29 | + "# Folder containing bmDCA parameters used to generate the synthetic MSAs without or with phylogeny\n", |
| 30 | + "bmDCA_PARAMETERS_DIR = pathlib.Path(\"data/Synthetic_data/bmDCA_parameters\")\n", |
| 31 | + "\n", |
| 32 | + "# Folders containing plmDCA coupling scores inferred from the synthetic MSAs\n", |
| 33 | + "plmDCA_EQ_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/equilibrium/coupling_scores/plmDCA_inferred\")\n", |
| 34 | + "plmDCA_TREE_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/tree/coupling_scores/plmDCA_inferred\")\n", |
| 35 | + "\n", |
| 36 | + "# Folders containing MSA Transformer coupling scores inferred from the synthetic MSAs\n", |
| 37 | + "MSA_TR_EQ_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/equilibrium/coupling_scores/MSA_Transformer_inferred\")\n", |
| 38 | + "MSA_TR_TREE_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/tree/coupling_scores/MSA_Transformer_inferred\")\n", |
| 39 | + "\n", |
| 40 | + "# Folder to host PDB structures\n", |
| 41 | + "PDB_DIR = pathlib.Path(\"data/PDB_structures\")\n", |
| 42 | + "if not PDB_DIR.exists():\n", |
| 43 | + " os.mkdir(PDB_DIR)" |
| 44 | + ] |
| 45 | + }, |
| 46 | + { |
| 47 | + "cell_type": "markdown", |
| 48 | + "id": "feb5e97e-4ac0-414d-bf11-16a01576e257", |
| 49 | + "metadata": {}, |
| 50 | + "source": [ |
| 51 | + "# MSA data dictionary\n", |
| 52 | + "\n", |
| 53 | + "``\"pfam_seq\"`` is the sequence from the Pfam full MSA corresponding to the PDB structure with ID ``\"pdb_id\"``. " |
| 54 | + ] |
| 55 | + }, |
| 56 | + { |
| 57 | + "cell_type": "code", |
| 58 | + "execution_count": null, |
| 59 | + "id": "fbb3c61f-125c-4b9f-ba94-cc9f2013b1ee", |
| 60 | + "metadata": {}, |
| 61 | + "outputs": [], |
| 62 | + "source": [ |
| 63 | + "msa_data = {\n", |
| 64 | + " \"PF00004\": {\n", |
| 65 | + " \"pdb_id\": \"4d81\",\n", |
| 66 | + " \"chain_id\": \"A\",\n", |
| 67 | + " \"pfam_seq\": \"ILLYGPPGCGKTMIAAAVANELDSEFIHVDAASIMSKWLGEAEKNVAKIFKTARELSKPAIIFIDELDALLASY-TSEVGGEARVRNQFLKEMDGLADKISKVYVIGATNKPWRLDEPFL-RRFQKRIYIT-\"\n", |
| 68 | + " },\n", |
| 69 | + " \"PF00005\": {\n", |
| 70 | + " \"pdb_id\": \"1l7v\",\n", |
| 71 | + " \"chain_id\": \"C\",\n", |
| 72 | + " \"pfam_seq\": \"--PLSGEVRAGEILHLVGPNGAGKSTLLARMAGMTS-GKGSIQFAGQPLEAWSATKLALHRAYLSQQQTPPFAMPVWHYQHDKTRTELLNDVAGALALDDKLGRSTNQLSGGEWQRVRLAAVVLQAGQLLLLDEPMN\"\n", |
| 73 | + " },\n", |
| 74 | + " \"PF00041\": {\n", |
| 75 | + " \"pdb_id\": \"3up1\",\n", |
| 76 | + " \"chain_id\": \"A\",\n", |
| 77 | + " \"pfam_seq\": \"-APFDLSVVYRGANDFVVTFNTSHKKYVKVLMHDVAYRQEKDENKWTHVNLSSTKLTLLQRKLQPAAMYEIKVRSIPDHYKGFWS\"\n", |
| 78 | + " },\n", |
| 79 | + " \"PF00072\": {\n", |
| 80 | + " \"pdb_id\": \"3ilh\",\n", |
| 81 | + " \"chain_id\": \"A\",\n", |
| 82 | + " \"pfam_seq\": \"VLLIDDDDIVNFLNTTIIRTHRVEEIQSVTSGNAAINKLNELYPSIICIDINMPGINGWELIDLFKQHFNKSIVCLLSSSLDPRDQAKAEASDVDYYVSKPLTANALN----\"\n", |
| 83 | + " },\n", |
| 84 | + " \"PF00076\": {\n", |
| 85 | + " \"pdb_id\": \"3nnh\",\n", |
| 86 | + " \"chain_id\": \"A\",\n", |
| 87 | + " \"pfam_seq\": \"-FVGQVPRTWSEKDLRELFEQYGAVYEINVLRDNPPQSKGCCFVTFYTRKAALEAQNALHNMKV-----\"\n", |
| 88 | + " },\n", |
| 89 | + " \"PF00096\": {\n", |
| 90 | + " \"pdb_id\": \"4r2a\",\n", |
| 91 | + " \"chain_id\": \"A\",\n", |
| 92 | + " \"pfam_seq\": \"YACPSCDRRFSRSDELTRHIRIH\"\n", |
| 93 | + " },\n", |
| 94 | + " \"PF00153\": {\n", |
| 95 | + " \"pdb_id\": \"1okc\",\n", |
| 96 | + " \"chain_id\": \"A\",\n", |
| 97 | + " \"pfam_seq\": \"--RYFAGNLASGGAAGATSLCFVYPLDFARTRLAADVGKGAQREFTGLGNCITKIFKSDGLRGLQGFNSVQGIIIYRAAYGVYDTAKGMLP---\"\n", |
| 98 | + " },\n", |
| 99 | + " \"PF00512\": {\n", |
| 100 | + " \"pdb_id\": \"3dge\",\n", |
| 101 | + " \"chain_id\": \"A\",\n", |
| 102 | + " \"pfam_seq\": \"MKTEFIANISHERTPLTAIKAYAETIYNSELDLSTLKEFLEVIIDQSNHLENLLNELLDFSRLE--\"\n", |
| 103 | + " },\n", |
| 104 | + " \"PF00595\": {\n", |
| 105 | + " \"pdb_id\": \"1be9\",\n", |
| 106 | + " \"chain_id\": \"A\",\n", |
| 107 | + " \"pfam_seq\": \"-IVIHR-GSTGLGFNIVGGEDGE---GIFISFILAGGPADLSGLRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTII--\"\n", |
| 108 | + " },\n", |
| 109 | + " \"PF01535\": {\n", |
| 110 | + " \"pdb_id\": \"4m57\",\n", |
| 111 | + " \"chain_id\": \"A\",\n", |
| 112 | + " \"pfam_seq\": \"VTYHTLVGGYSSLEMFSEAREVIGYMVQHGL\"\n", |
| 113 | + " },\n", |
| 114 | + " \"PF02518\": {\n", |
| 115 | + " \"pdb_id\": \"3g7e\",\n", |
| 116 | + " \"chain_id\": \"A\",\n", |
| 117 | + " \"pfam_seq\": \"-DGTGLHHMVFEVVDNAIDAGHCKEIIVTIH---ADNSVSVQDDGRGIPTGIHPHAGGKFDD-NSYKVSGGLHGVGVSVVNALSQKLELVIQRGETEKTGTMVRFWPSLE-\"\n", |
| 118 | + " },\n", |
| 119 | + " \"PF07679\": {\n", |
| 120 | + " \"pdb_id\": \"1fhg\",\n", |
| 121 | + " \"chain_id\": \"A\",\n", |
| 122 | + " \"pfam_seq\": \"PYFTKTILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEGNCSLTISEVCGDDDAKYTCKAVNSLGEATCTAELLV\"\n", |
| 123 | + " },\n", |
| 124 | + " \"PF00271\": {\n", |
| 125 | + " \"pdb_id\": \"3ex7\",\n", |
| 126 | + " \"chain_id\": \"C\",\n", |
| 127 | + " \"pfam_seq\": \"-KFDTLCDLY-DTLTITQAVIFCNTKRKVDWTEKMREA-NFTVSSMHGDMPQKERESIMKEFRSGASRVLISTDVWARGLDVPQVSLIINYDLPNNRELYIHRIGRSGRYG\"\n", |
| 128 | + " },\n", |
| 129 | + " \"PF00397\": {\n", |
| 130 | + " \"pdb_id\": \"4rex\",\n", |
| 131 | + " \"chain_id\": \"A\",\n", |
| 132 | + " \"pfam_seq\": \"LPAGWEMAKTSS-GQRYFLNHIDQTTTWQDP\"\n", |
| 133 | + " },\n", |
| 134 | + " \"PF13354\": {\n", |
| 135 | + " \"pdb_id\": \"6qw8\",\n", |
| 136 | + " \"chain_id\": \"A\",\n", |
| 137 | + " \"pfam_seq\": \"--DNSQILYRADERFAMCSTSKVMAAAAVLKKSESENLLNQRVEIKKSDLVNYNPIAEKHVNGTMSLAESAAALQYSDNVAMNKLIAHVGPASVTAFARQLGDETFRLDRTEPTLNAIPGDPRDTTSPRAMAQTLRNLTLGKALGDSLVTWMKNTTGAASIQAGLPAWVVGDKTGSGYGTTNDIAVIWPDRAPLILV-\"\n", |
| 138 | + " }\n", |
| 139 | + "}" |
| 140 | + ] |
| 141 | + }, |
| 142 | + { |
| 143 | + "cell_type": "markdown", |
| 144 | + "id": "3e68e145-6515-4509-88c9-4f648b305d4c", |
| 145 | + "metadata": {}, |
| 146 | + "source": [ |
| 147 | + "# Align with PDB data" |
| 148 | + ] |
| 149 | + }, |
| 150 | + { |
| 151 | + "cell_type": "code", |
| 152 | + "execution_count": null, |
| 153 | + "id": "9d7b5662-67a3-491c-b0a6-430d3c6359a7", |
| 154 | + "metadata": {}, |
| 155 | + "outputs": [], |
| 156 | + "source": [ |
| 157 | + "match = 2\n", |
| 158 | + "mismatch = -2\n", |
| 159 | + "gap_penalty = -2\n", |
| 160 | + "\n", |
| 161 | + "scoring = swalign.IdentityScoringMatrix(match, mismatch)\n", |
| 162 | + "sw = swalign.LocalAlignment(scoring, gap_penalty=gap_penalty)" |
| 163 | + ] |
| 164 | + }, |
| 165 | + { |
| 166 | + "cell_type": "code", |
| 167 | + "execution_count": null, |
| 168 | + "id": "cfcdcdce-f990-46f5-add0-99ff2ceb7ae6", |
| 169 | + "metadata": {}, |
| 170 | + "outputs": [], |
| 171 | + "source": [ |
| 172 | + "idxs_chains = {}\n", |
| 173 | + "idxs_pfam_seqs = {}\n", |
| 174 | + "dist_mat = {}\n", |
| 175 | + "\n", |
| 176 | + "for msa_name in msa_data:\n", |
| 177 | + " print(msa_name)\n", |
| 178 | + " pdb_id = msa_data[msa_name][\"pdb_id\"]\n", |
| 179 | + " chain_id = msa_data[msa_name][\"chain_id\"]\n", |
| 180 | + "\n", |
| 181 | + " # Download and parse structure\n", |
| 182 | + " pdbl = PDBList()\n", |
| 183 | + " pdbl.retrieve_pdb_file(pdb_id,\n", |
| 184 | + " pdir=PDB_DIR,\n", |
| 185 | + " file_format=\"mmCif\")\n", |
| 186 | + " pdb_parser = MMCIFParser()\n", |
| 187 | + " with warnings.catch_warnings():\n", |
| 188 | + " warnings.simplefilter(\"ignore\", category=PDBConstructionWarning)\n", |
| 189 | + " chain = pdb_parser.get_structure(pdb_id, f\"{PDB_DIR}/{pdb_id}.cif\")[0][chain_id]\n", |
| 190 | + " # Convert to one-letter encoding\n", |
| 191 | + " pdb_seq = to_one_letter_seq(chain)\n", |
| 192 | + " pfam_seq = msa_data[msa_name][\"pfam_seq\"]\n", |
| 193 | + " print(f\"Ref: {pdb_seq = }\")\n", |
| 194 | + " print(f\"Query: {pfam_seq = }\")\n", |
| 195 | + "\n", |
| 196 | + " # Align PDB sequence with PFAM sequence\n", |
| 197 | + " alignment = sw.align(pdb_seq, pfam_seq)\n", |
| 198 | + " alignment.dump()\n", |
| 199 | + " # Store matching indices and PDB distance matrix for matching indices\n", |
| 200 | + " idxs_chain, idxs_pfam_seq = indices_in_ref_and_query(alignment)\n", |
| 201 | + " idxs_chains[msa_name] = idxs_chain\n", |
| 202 | + " idxs_pfam_seqs[msa_name] = idxs_pfam_seq\n", |
| 203 | + " dist_mat[msa_name] = calc_min_dist_matrix(chain, idxs_chain)\n", |
| 204 | + " print()" |
| 205 | + ] |
| 206 | + }, |
| 207 | + { |
| 208 | + "cell_type": "markdown", |
| 209 | + "id": "a0606a56-31a0-4af5-9b3d-7bb3bc889ea3", |
| 210 | + "metadata": {}, |
| 211 | + "source": [ |
| 212 | + "# Compute ground truth (bmDCA) coupling scores for the synthetic MSAs" |
| 213 | + ] |
| 214 | + }, |
| 215 | + { |
| 216 | + "cell_type": "code", |
| 217 | + "execution_count": null, |
| 218 | + "id": "77e7f676-e903-4a84-b8b1-8239a129bdb0", |
| 219 | + "metadata": {}, |
| 220 | + "outputs": [], |
| 221 | + "source": [ |
| 222 | + "bmDCA_scores = {}\n", |
| 223 | + "for pfam_family in msa_data:\n", |
| 224 | + " bmDCA_scores[pfam_family] = {}\n", |
| 225 | + " _, J2 = read_bmDCA_parameters(bmDCA_PARAMETERS_DIR / f\"{pfam_family}.txt\")\n", |
| 226 | + " idx_subset = np.asarray(idxs_pfam_seqs[pfam_family]) # Restrict to sites matching with the PDB\n", |
| 227 | + " bmDCA_scores[pfam_family] = zero_sum_gauge_frob_scores(J2)[idx_subset, :][:, idx_subset] # Use APC (default)" |
| 228 | + ] |
| 229 | + }, |
| 230 | + { |
| 231 | + "cell_type": "markdown", |
| 232 | + "id": "c16aa2fe-178e-4fdb-8829-30e8d9522bec", |
| 233 | + "metadata": {}, |
| 234 | + "source": [ |
| 235 | + "# Read in plmDCA scores inferred from the synthetic MSAs" |
| 236 | + ] |
| 237 | + }, |
| 238 | + { |
| 239 | + "cell_type": "code", |
| 240 | + "execution_count": null, |
| 241 | + "id": "d6a02358-8da8-445c-ad43-72916189da8e", |
| 242 | + "metadata": {}, |
| 243 | + "outputs": [], |
| 244 | + "source": [ |
| 245 | + "plmDCA_equilibrium_scores = {}\n", |
| 246 | + "plmDCA_tree_scores = {}\n", |
| 247 | + "for pfam_family in msa_data:\n", |
| 248 | + " length = len(msa_data[pfam_family][\"pfam_seq\"])\n", |
| 249 | + " for dic, path in zip([plmDCA_equilibrium_scores, plmDCA_tree_scores],\n", |
| 250 | + " [plmDCA_EQ_INFERRED_SCORES_DIR, plmDCA_TREE_INFERRED_SCORES_DIR]):\n", |
| 251 | + " scores = np.loadtxt(path / f\"{pfam_family}.txt\")\n", |
| 252 | + " scores[:, :2] -= 1 # Convert 1-based indexing (Julia) to 0-based indexing (Python)\n", |
| 253 | + " mat = np.zeros((length, length), dtype=np.float64) # Initialize scores matrix\n", |
| 254 | + " mat[tuple(scores[:, :2].astype(int).T)] = scores[:, 2] # Populate scores matrix\n", |
| 255 | + " mat += mat.T # Symmetrize scores matrix\n", |
| 256 | + " idx_subset = np.asarray(idxs_pfam_seqs[pfam_family]) # Restrict to sites matching with the PDB\n", |
| 257 | + " mat = mat[idx_subset, :][:, idx_subset]\n", |
| 258 | + " dic[pfam_family] = mat" |
| 259 | + ] |
| 260 | + }, |
| 261 | + { |
| 262 | + "cell_type": "markdown", |
| 263 | + "id": "aa903d8a-b35c-40c1-bb31-7a911b523e29", |
| 264 | + "metadata": {}, |
| 265 | + "source": [ |
| 266 | + "# Read in MSA Transformer scores inferred from the synthetic MSAs\n", |
| 267 | + "\n", |
| 268 | + "These scores were obtained from each synthetic MSA by computing contact probabilities (scores) according to MSA Transformer [(Rao et al, 2021)](https://proceedings.mlr.press/v139/rao21a.html) from 10 sub-MSAs defined as the rows with labels 0-9 `in ``data/Synthetic_data/MSA_Transformer_subsample_labels``, and then averaging the resulting 10 score matrices." |
| 269 | + ] |
| 270 | + }, |
| 271 | + { |
| 272 | + "cell_type": "code", |
| 273 | + "execution_count": null, |
| 274 | + "id": "668a6bec-6a33-4d42-98db-d72f8b155117", |
| 275 | + "metadata": {}, |
| 276 | + "outputs": [], |
| 277 | + "source": [ |
| 278 | + "MSA_Tr_equilibrium_scores = {}\n", |
| 279 | + "MSA_Tr_tree_scores = {}\n", |
| 280 | + "for pfam_family in msa_data:\n", |
| 281 | + " for dic, path in zip([MSA_Tr_equilibrium_scores, MSA_Tr_tree_scores],\n", |
| 282 | + " [MSA_TR_EQ_INFERRED_SCORES_DIR, MSA_TR_TREE_INFERRED_SCORES_DIR]):\n", |
| 283 | + " mat = np.loadtxt(path / f\"{pfam_family}.txt\")\n", |
| 284 | + " idx_subset = np.asarray(idxs_pfam_seqs[pfam_family]) # Restrict to sites matching with the PDB\n", |
| 285 | + " mat = mat[idx_subset, :][:, idx_subset]\n", |
| 286 | + " dic[pfam_family] = mat" |
| 287 | + ] |
| 288 | + }, |
| 289 | + { |
| 290 | + "cell_type": "code", |
| 291 | + "execution_count": null, |
| 292 | + "id": "d2eb4368-fd4f-49be-8b07-91e665862c10", |
| 293 | + "metadata": {}, |
| 294 | + "outputs": [], |
| 295 | + "source": [ |
| 296 | + "# Exclude PF00096 from the analysis as it is too short\n", |
| 297 | + "msa_names_long = [msa_name for msa_name in msa_data if msa_name != \"PF00096\"]\n", |
| 298 | + "print(f\"{msa_names_long = }\")" |
| 299 | + ] |
| 300 | + } |
| 301 | + ], |
| 302 | + "metadata": { |
| 303 | + "kernelspec": { |
| 304 | + "display_name": "Python 3 (ipykernel)", |
| 305 | + "language": "python", |
| 306 | + "name": "python3" |
| 307 | + }, |
| 308 | + "language_info": { |
| 309 | + "codemirror_mode": { |
| 310 | + "name": "ipython", |
| 311 | + "version": 3 |
| 312 | + }, |
| 313 | + "file_extension": ".py", |
| 314 | + "mimetype": "text/x-python", |
| 315 | + "name": "python", |
| 316 | + "nbconvert_exporter": "python", |
| 317 | + "pygments_lexer": "ipython3", |
| 318 | + "version": "3.9.12" |
| 319 | + } |
| 320 | + }, |
| 321 | + "nbformat": 4, |
| 322 | + "nbformat_minor": 5 |
| 323 | +} |
0 commit comments