Bitbol-Lab
diff --git a/‎.gitattributes‎
Lines changed: 4 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎Contact_prediction_phylogeny.ipynb‎
Lines changed: 323 additions & 0 deletions b/‎Contact_prediction_phylogeny.ipynb‎
Lines changed: 323 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,4 @@
+data/*.txt filter=lfs diff=lfs merge=lfs -text
+data/*.fasta filter=lfs diff=lfs merge=lfs -text
+data/**/*.txt filter=lfs diff=lfs merge=lfs -text
+data/**/*.fasta filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,323 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "821893c3-2096-413b-ba54-1863f8429bae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "import os\n",
+    "import warnings\n",
+    "\n",
+    "from Bio.PDB import *\n",
+    "from Bio.PDB.PDBExceptions import PDBConstructionWarning\n",
+    "\n",
+    "from utils import *\n",
+    "\n",
+    "import swalign"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13c3d038-ba73-465e-9c9a-ea07d3cc9841",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Folder containing bmDCA parameters used to generate the synthetic MSAs without or with phylogeny\n",
+    "bmDCA_PARAMETERS_DIR = pathlib.Path(\"data/Synthetic_data/bmDCA_parameters\")\n",
+    "\n",
+    "# Folders containing plmDCA coupling scores inferred from the synthetic MSAs\n",
+    "plmDCA_EQ_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/equilibrium/coupling_scores/plmDCA_inferred\")\n",
+    "plmDCA_TREE_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/tree/coupling_scores/plmDCA_inferred\")\n",
+    "\n",
+    "# Folders containing MSA Transformer coupling scores inferred from the synthetic MSAs\n",
+    "MSA_TR_EQ_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/equilibrium/coupling_scores/MSA_Transformer_inferred\")\n",
+    "MSA_TR_TREE_INFERRED_SCORES_DIR = pathlib.Path(\"data/Synthetic_data/tree/coupling_scores/MSA_Transformer_inferred\")\n",
+    "\n",
+    "# Folder to host PDB structures\n",
+    "PDB_DIR = pathlib.Path(\"data/PDB_structures\")\n",
+    "if not PDB_DIR.exists():\n",
+    "    os.mkdir(PDB_DIR)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "feb5e97e-4ac0-414d-bf11-16a01576e257",
+   "metadata": {},
+   "source": [
+    "# MSA data dictionary\n",
+    "\n",
+    "``\"pfam_seq\"`` is the sequence from the Pfam full MSA corresponding to the PDB structure with ID ``\"pdb_id\"``. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbb3c61f-125c-4b9f-ba94-cc9f2013b1ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "msa_data = {\n",
+    "    \"PF00004\": {\n",
+    "        \"pdb_id\": \"4d81\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"ILLYGPPGCGKTMIAAAVANELDSEFIHVDAASIMSKWLGEAEKNVAKIFKTARELSKPAIIFIDELDALLASY-TSEVGGEARVRNQFLKEMDGLADKISKVYVIGATNKPWRLDEPFL-RRFQKRIYIT-\"\n",
+    "    },\n",
+    "    \"PF00005\": {\n",
+    "        \"pdb_id\": \"1l7v\",\n",
+    "        \"chain_id\": \"C\",\n",
+    "        \"pfam_seq\": \"--PLSGEVRAGEILHLVGPNGAGKSTLLARMAGMTS-GKGSIQFAGQPLEAWSATKLALHRAYLSQQQTPPFAMPVWHYQHDKTRTELLNDVAGALALDDKLGRSTNQLSGGEWQRVRLAAVVLQAGQLLLLDEPMN\"\n",
+    "    },\n",
+    "    \"PF00041\": {\n",
+    "        \"pdb_id\": \"3up1\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"-APFDLSVVYRGANDFVVTFNTSHKKYVKVLMHDVAYRQEKDENKWTHVNLSSTKLTLLQRKLQPAAMYEIKVRSIPDHYKGFWS\"\n",
+    "    },\n",
+    "    \"PF00072\": {\n",
+    "        \"pdb_id\": \"3ilh\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"VLLIDDDDIVNFLNTTIIRTHRVEEIQSVTSGNAAINKLNELYPSIICIDINMPGINGWELIDLFKQHFNKSIVCLLSSSLDPRDQAKAEASDVDYYVSKPLTANALN----\"\n",
+    "    },\n",
+    "    \"PF00076\": {\n",
+    "        \"pdb_id\": \"3nnh\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"-FVGQVPRTWSEKDLRELFEQYGAVYEINVLRDNPPQSKGCCFVTFYTRKAALEAQNALHNMKV-----\"\n",
+    "    },\n",
+    "    \"PF00096\": {\n",
+    "        \"pdb_id\": \"4r2a\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"YACPSCDRRFSRSDELTRHIRIH\"\n",
+    "    },\n",
+    "    \"PF00153\": {\n",
+    "        \"pdb_id\": \"1okc\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"--RYFAGNLASGGAAGATSLCFVYPLDFARTRLAADVGKGAQREFTGLGNCITKIFKSDGLRGLQGFNSVQGIIIYRAAYGVYDTAKGMLP---\"\n",
+    "    },\n",
+    "    \"PF00512\": {\n",
+    "        \"pdb_id\": \"3dge\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"MKTEFIANISHERTPLTAIKAYAETIYNSELDLSTLKEFLEVIIDQSNHLENLLNELLDFSRLE--\"\n",
+    "    },\n",
+    "    \"PF00595\": {\n",
+    "        \"pdb_id\": \"1be9\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"-IVIHR-GSTGLGFNIVGGEDGE---GIFISFILAGGPADLSGLRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTII--\"\n",
+    "    },\n",
+    "    \"PF01535\": {\n",
+    "        \"pdb_id\": \"4m57\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"VTYHTLVGGYSSLEMFSEAREVIGYMVQHGL\"\n",
+    "    },\n",
+    "    \"PF02518\": {\n",
+    "        \"pdb_id\": \"3g7e\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"-DGTGLHHMVFEVVDNAIDAGHCKEIIVTIH---ADNSVSVQDDGRGIPTGIHPHAGGKFDD-NSYKVSGGLHGVGVSVVNALSQKLELVIQRGETEKTGTMVRFWPSLE-\"\n",
+    "    },\n",
+    "    \"PF07679\": {\n",
+    "        \"pdb_id\": \"1fhg\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"PYFTKTILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEGNCSLTISEVCGDDDAKYTCKAVNSLGEATCTAELLV\"\n",
+    "    },\n",
+    "    \"PF00271\": {\n",
+    "        \"pdb_id\": \"3ex7\",\n",
+    "        \"chain_id\": \"C\",\n",
+    "        \"pfam_seq\": \"-KFDTLCDLY-DTLTITQAVIFCNTKRKVDWTEKMREA-NFTVSSMHGDMPQKERESIMKEFRSGASRVLISTDVWARGLDVPQVSLIINYDLPNNRELYIHRIGRSGRYG\"\n",
+    "    },\n",
+    "    \"PF00397\": {\n",
+    "        \"pdb_id\": \"4rex\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"LPAGWEMAKTSS-GQRYFLNHIDQTTTWQDP\"\n",
+    "    },\n",
+    "    \"PF13354\": {\n",
+    "        \"pdb_id\": \"6qw8\",\n",
+    "        \"chain_id\": \"A\",\n",
+    "        \"pfam_seq\": \"--DNSQILYRADERFAMCSTSKVMAAAAVLKKSESENLLNQRVEIKKSDLVNYNPIAEKHVNGTMSLAESAAALQYSDNVAMNKLIAHVGPASVTAFARQLGDETFRLDRTEPTLNAIPGDPRDTTSPRAMAQTLRNLTLGKALGDSLVTWMKNTTGAASIQAGLPAWVVGDKTGSGYGTTNDIAVIWPDRAPLILV-\"\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e68e145-6515-4509-88c9-4f648b305d4c",
+   "metadata": {},
+   "source": [
+    "# Align with PDB data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d7b5662-67a3-491c-b0a6-430d3c6359a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "match = 2\n",
+    "mismatch = -2\n",
+    "gap_penalty = -2\n",
+    "\n",
+    "scoring = swalign.IdentityScoringMatrix(match, mismatch)\n",
+    "sw = swalign.LocalAlignment(scoring, gap_penalty=gap_penalty)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfcdcdce-f990-46f5-add0-99ff2ceb7ae6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "idxs_chains = {}\n",
+    "idxs_pfam_seqs = {}\n",
+    "dist_mat = {}\n",
+    "\n",
+    "for msa_name in msa_data:\n",
+    "    print(msa_name)\n",
+    "    pdb_id = msa_data[msa_name][\"pdb_id\"]\n",
+    "    chain_id = msa_data[msa_name][\"chain_id\"]\n",
+    "\n",
+    "    # Download and parse structure\n",
+    "    pdbl = PDBList()\n",
+    "    pdbl.retrieve_pdb_file(pdb_id,\n",
+    "                           pdir=PDB_DIR,\n",
+    "                           file_format=\"mmCif\")\n",
+    "    pdb_parser = MMCIFParser()\n",
+    "    with warnings.catch_warnings():\n",
+    "        warnings.simplefilter(\"ignore\", category=PDBConstructionWarning)\n",
+    "        chain = pdb_parser.get_structure(pdb_id, f\"{PDB_DIR}/{pdb_id}.cif\")[0][chain_id]\n",
+    "    # Convert to one-letter encoding\n",
+    "    pdb_seq = to_one_letter_seq(chain)\n",
+    "    pfam_seq = msa_data[msa_name][\"pfam_seq\"]\n",
+    "    print(f\"Ref: {pdb_seq = }\")\n",
+    "    print(f\"Query: {pfam_seq = }\")\n",
+    "\n",
+    "    # Align PDB sequence with PFAM sequence\n",
+    "    alignment = sw.align(pdb_seq, pfam_seq)\n",
+    "    alignment.dump()\n",
+    "    # Store matching indices and PDB distance matrix for matching indices\n",
+    "    idxs_chain, idxs_pfam_seq = indices_in_ref_and_query(alignment)\n",
+    "    idxs_chains[msa_name] = idxs_chain\n",
+    "    idxs_pfam_seqs[msa_name] = idxs_pfam_seq\n",
+    "    dist_mat[msa_name] = calc_min_dist_matrix(chain, idxs_chain)\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a0606a56-31a0-4af5-9b3d-7bb3bc889ea3",
+   "metadata": {},
+   "source": [
+    "# Compute ground truth (bmDCA) coupling scores for the synthetic MSAs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77e7f676-e903-4a84-b8b1-8239a129bdb0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bmDCA_scores = {}\n",
+    "for pfam_family in msa_data:\n",
+    "    bmDCA_scores[pfam_family] = {}\n",
+    "    _, J2 = read_bmDCA_parameters(bmDCA_PARAMETERS_DIR / f\"{pfam_family}.txt\")\n",
+    "    idx_subset = np.asarray(idxs_pfam_seqs[pfam_family])  # Restrict to sites matching with the PDB\n",
+    "    bmDCA_scores[pfam_family] = zero_sum_gauge_frob_scores(J2)[idx_subset, :][:, idx_subset]  # Use APC (default)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c16aa2fe-178e-4fdb-8829-30e8d9522bec",
+   "metadata": {},
+   "source": [
+    "# Read in plmDCA scores inferred from the synthetic MSAs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6a02358-8da8-445c-ad43-72916189da8e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plmDCA_equilibrium_scores = {}\n",
+    "plmDCA_tree_scores = {}\n",
+    "for pfam_family in msa_data:\n",
+    "    length = len(msa_data[pfam_family][\"pfam_seq\"])\n",
+    "    for dic, path in zip([plmDCA_equilibrium_scores, plmDCA_tree_scores],\n",
+    "                         [plmDCA_EQ_INFERRED_SCORES_DIR, plmDCA_TREE_INFERRED_SCORES_DIR]):\n",
+    "        scores = np.loadtxt(path / f\"{pfam_family}.txt\")\n",
+    "        scores[:, :2] -= 1  # Convert 1-based indexing (Julia) to 0-based indexing (Python)\n",
+    "        mat = np.zeros((length, length), dtype=np.float64)  # Initialize scores matrix\n",
+    "        mat[tuple(scores[:, :2].astype(int).T)] = scores[:, 2]  # Populate scores matrix\n",
+    "        mat += mat.T  # Symmetrize scores matrix\n",
+    "        idx_subset = np.asarray(idxs_pfam_seqs[pfam_family])  # Restrict to sites matching with the PDB\n",
+    "        mat = mat[idx_subset, :][:, idx_subset]\n",
+    "        dic[pfam_family] = mat"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa903d8a-b35c-40c1-bb31-7a911b523e29",
+   "metadata": {},
+   "source": [
+    "# Read in MSA Transformer scores inferred from the synthetic MSAs\n",
+    "\n",
+    "These scores were obtained from each synthetic MSA by computing contact probabilities (scores) according to MSA Transformer [(Rao et al, 2021)](https://proceedings.mlr.press/v139/rao21a.html) from 10 sub-MSAs defined as the rows with labels 0-9 `in ``data/Synthetic_data/MSA_Transformer_subsample_labels``, and then averaging the resulting 10 score matrices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "668a6bec-6a33-4d42-98db-d72f8b155117",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MSA_Tr_equilibrium_scores = {}\n",
+    "MSA_Tr_tree_scores = {}\n",
+    "for pfam_family in msa_data:\n",
+    "    for dic, path in zip([MSA_Tr_equilibrium_scores, MSA_Tr_tree_scores],\n",
+    "                         [MSA_TR_EQ_INFERRED_SCORES_DIR, MSA_TR_TREE_INFERRED_SCORES_DIR]):\n",
+    "        mat = np.loadtxt(path / f\"{pfam_family}.txt\")\n",
+    "        idx_subset = np.asarray(idxs_pfam_seqs[pfam_family])  # Restrict to sites matching with the PDB\n",
+    "        mat = mat[idx_subset, :][:, idx_subset]\n",
+    "        dic[pfam_family] = mat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2eb4368-fd4f-49be-8b07-91e665862c10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Exclude PF00096 from the analysis as it is too short\n",
+    "msa_names_long = [msa_name for msa_name in msa_data if msa_name != \"PF00096\"]\n",
+    "print(f\"{msa_names_long = }\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -24,6 +24,7 @@ In order to run the notebooks, the following python packages are required:
 - matplotlib
 - statsmodels
 - biopython
+- swalign
 - esm==0.4.0
 
 ``prody`` and ``HMMER`` are required to run the Python script ``data/Pfam_Seed/fetch_seed_MSAs.py``, if you wish to create new