Filter preprocessed assembly mmCIF files, fix various bugs in the filtering criterion functions, and optimize the clustering script (#69)

amorehead · web-flow · commit ce95c87499ac · 2024-07-10T11:27:39.000-07:00
* Update README.md

* Update biomolecule.py

* Update mmcif_parsing.py

* Update test_data_parsing.py

* Update filter_pdb_mmcifs.py

* Update README.md

* Update cluster_pdb_mmcifs.py

* Update README.md
diff --git a/README.md b/README.md
@@ -28,6 +28,8 @@ A fork with full Lightning + Hydra support is being maintained by <a href="https
 
 - <a href="https://github.com/amorehead">Alex</a> for the PDB dataset preparation script!
 
+- <a href="https://github.com/milot-mirdita">Milot</a> for optimizing the PDB dataset clustering script!
+
 - <a href="https://github.com/patrick-kidger">Patrick</a> for <a href="https://docs.kidger.site/jaxtyping/">jaxtyping</a>, <a href="https://github.com/fferflo">Florian</a> for <a href="https://github.com/fferflo/einx">einx</a>, and of course, <a href="https://github.com/arogozhnikov">Alex</a> for <a href="https://einops.rocks/">einops</a>
 
 ## Install
@@ -198,18 +200,26 @@ assert sampled_atom_pos.shape == (1, (6 + 5), 3)
 
 ### PDB dataset curation
 
-To acquire the AlphaFold 3 PDB dataset, first download all complexes in the Protein Data Bank (PDB), and then preprocess them with the script referenced below. The PDB can be downloaded from the RCSB: https://www.wwpdb.org/ftp/pdb-ftp-sites#rcsbpdb. The Python script below (i.e., `filter_pdb_mmcifs.py`) assumes you have downloaded the PDB in the **mmCIF file format**, placing it at `data/pdb_data/unfiltered_mmcifs/`. On the RCSB website, navigate down to "Download Protocols", and follow the download instructions depending on your location.
+To acquire the AlphaFold 3 PDB dataset, first download all first-assembly (and asymmetric unit) complexes in the Protein Data Bank (PDB), and then preprocess them with the script referenced below. The PDB can be downloaded from the RCSB: https://www.wwpdb.org/ftp/pdb-ftp-sites#rcsbpdb. The Python script below (i.e., `filter_pdb_mmcifs.py`) assumes you have downloaded the PDB in the **mmCIF file format**, placing it at `data/pdb_data/unfiltered_assembly_mmcifs/` (and `data/pdb_data/unfiltered_asym_mmcifs/`, respectively). On the RCSB website, navigate down to "Download Protocols", and follow the download instructions depending on your location.
 
-For example, one can use the following command to download the PDB as a collection of mmCIF files:
+For example, one can use the following commands to download the PDB as a collection of mmCIF files:
 ```bash
+# For `assembly1` complexes
+rsync -rlpt -v -z --delete --port=33444 \
+rsync.rcsb.org::ftp_data/assemblies/mmCIF/divided/ ./data/pdb_data/unfiltered_assembly_mmcifs/
+# For asymmetric unit complexes
 rsync -rlpt -v -z --delete --port=33444 \
-rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ ./data/pdb_data/unfiltered_mmcifs/
+rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ ./data/pdb_data/unfiltered_asym_mmcifs/
 ```
 
 > WARNING: Downloading PDB can take up to 1TB of space.
 
-After downloading, you should have a directory formatted like this:
-https://files.rcsb.org/pub/pdb/data/structures/divided/mmCIF/
+> NOTE: PDB also hosts snapshots on AWS: https://pdbsnapshots.s3.us-west-2.amazonaws.com/index.html.
+
+> TODO: Use a specific snapshot to make training reproducible.
+
+After downloading, you should have two directories formatted like this:
+https://files.rcsb.org/pub/pdb/data/assemblies/mmCIF/divided/ & https://files.rcsb.org/pub/pdb/data/structures/divided/mmCIF/
 ```bash
 00/
 01/
@@ -218,9 +228,10 @@ https://files.rcsb.org/pub/pdb/data/structures/divided/mmCIF/
 zz/
 ```
 
-In this directory, unzip all the files:
+For these directories, unzip all the files:
 ```bash
-find . -type f -name "*.gz" -exec gzip -d {} \;
+find ./data/pdb_data/unfiltered_assembly_mmcifs/ -type f -name "*.gz" -exec gzip -d {} \;
+find ./data/pdb_data/unfiltered_asym_mmcifs/ -type f -name "*.gz" -exec gzip -d {} \;
 ```
 
 Next run the commands
@@ -235,12 +246,12 @@ find data/ccd_data/ -type f -name "*.gz" -exec gzip -d {} \;
 
 ### PDB dataset filtering
 
-Then run the following with `pdb_dir`, `ccd_dir`, and `mmcif_output_dir` replaced with the locations of your local copies of the PDB, CCD, and your desired dataset output directory (i.e., `./data/pdb_data/unfiltered_mmcifs/`, `./data/ccd_data/`, and `./data/pdb_data/mmcifs/`).
+Then run the following with `pdb_assembly_dir`, `pdb_asym_dir`, `ccd_dir`, and `mmcif_output_dir` replaced with the locations of your local copies of the first-assembly PDB, asymmetric unit PDB, CCD, and your desired dataset output directory (i.e., `./data/pdb_data/unfiltered_assembly_mmcifs/`, `./data/pdb_data/unfiltered_asym_mmcifs/`, `./data/ccd_data/`, and `./data/pdb_data/mmcifs/`).
 ```bash
-python scripts/filter_pdb_mmcifs.py --mmcif_dir <pdb_dir> --ccd_dir <ccd_dir> --output_dir <mmcif_output_dir>
+python scripts/filter_pdb_mmcifs.py --mmcif_assembly_dir <pdb_assembly_dir> --mmcif_asym_dir <pdb_asym_dir> --ccd_dir <ccd_dir> --output_dir <mmcif_output_dir>
 ```
 
-See the script for more options. Each mmCIF that successfully passes
+See the script for more options. Each first-assembly mmCIF that successfully passes
 all processing steps will be written to `mmcif_output_dir` within a subdirectory
 named according to the mmCIF's second and third PDB ID characters (e.g. `5c`).
 
diff --git a/alphafold3_pytorch/common/biomolecule.py b/alphafold3_pytorch/common/biomolecule.py
@@ -721,10 +721,6 @@ def to_mmcif(
         mmcif_dict["_pdbx_struct_assembly.oligomeric_count"].append(
             str(pdbx_struct_assembly_oligomeric_count[assembly_id])
         )
-    assert mmcif_dict[
-        "_pdbx_struct_assembly_gen.assembly_id"
-    ], "No _pdbx_struct_assembly_gen.assembly_id entries found."
-    assert mmcif_dict["_pdbx_struct_assembly.id"], "No _pdbx_struct_assembly.id entries found."
 
     # Populate the _chem_comp table.
     for chem_comp in biomol.chem_comp_table:
diff --git a/alphafold3_pytorch/data/mmcif_parsing.py b/alphafold3_pytorch/data/mmcif_parsing.py
@@ -594,7 +594,7 @@ def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
     if "_pdbx_audit_revision_history.revision_date" in parsed_info:
         header["release_date"] = get_release_date(parsed_info)
     else:
-        logging.warning("Could not determine release_date: %s", parsed_info["_entry.id"])
+        logging.warning("Could not determine release_date for entry: %s", parsed_info["_entry.id"])
 
     header["resolution"] = 0.00
     for res_key in (
diff --git a/scripts/cluster_pdb_mmcifs.py b/scripts/cluster_pdb_mmcifs.py
@@ -25,7 +25,7 @@
 import os
 import subprocess
 from concurrent.futures import ProcessPoolExecutor, as_completed
-from typing import Dict, List, Literal, Optional, Set, Tuple
+from typing import Dict, List, Literal, Optional, Set, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -140,16 +140,20 @@ def convert_modified_residue_three_to_one(
 
     if residue_mol_type == "protein":
         return (
-            PROTEIN_LETTERS_3TO1[mapped_residue]
-            if mapped_residue in PROTEIN_LETTERS_3TO1
-            else "X",
+            (
+                PROTEIN_LETTERS_3TO1[mapped_residue]
+                if mapped_residue in PROTEIN_LETTERS_3TO1
+                else "X"
+            ),
             "protein",
         )
     elif residue_mol_type in {"rna", "dna"}:
         return (
-            NUCLEIC_LETTERS_3TO1[mapped_residue]
-            if mapped_residue in NUCLEIC_LETTERS_3TO1
-            else "X",
+            (
+                NUCLEIC_LETTERS_3TO1[mapped_residue]
+                if mapped_residue in NUCLEIC_LETTERS_3TO1
+                else "X"
+            ),
             "nucleic_acid",
         )
     else:
@@ -366,8 +370,7 @@ def cluster_sequences_using_mmseqs2(
     min_seq_id: float = 0.5,
     coverage: float = 0.8,
     coverage_mode: Literal[0, 1, 2, 3] = 1,
-    k_mer_length: Optional[int] = None,
-    spaced_k_mer_pattern: Optional[str] = None,
+    extra_parameters: Optional[Dict[str, Union[int, float, str]]] = None,
 ) -> Dict[str, int]:
     """Run MMseqs2 on the input FASTA file and write the resulting clusters to a local output directory."""
     assert input_filepath.endswith(".fasta"), "The input file must be a FASTA file."
@@ -396,10 +399,10 @@ def cluster_sequences_using_mmseqs2(
         "--cov-mode",
         str(coverage_mode),
     ]
-    if k_mer_length:
-        mmseqs_command.extend(["-k", str(k_mer_length)])
-    if spaced_k_mer_pattern:
-        mmseqs_command.extend(["--spaced-kmer-pattern", spaced_k_mer_pattern])
+    if extra_parameters:
+        for key, value in extra_parameters.items():
+            mmseqs_command.extend([key, str(value)])
+
     subprocess.run(mmseqs_command)
     assert os.path.isfile(
         output_cluster_filepath
@@ -748,6 +751,11 @@ def cluster_interfaces(
             min_seq_id=0.4,
             coverage=0.8,
             coverage_mode=0,
+            extra_parameters={
+                # cluster reassign improves clusters by reassigning sequences to the best cluster
+                # and fixes transitivity issues of the cascade clustering
+                "--cluster-reassign": 1,
+            },
         )
 
     if not nucleic_acid_chain_cluster_mapping:
@@ -762,9 +770,14 @@ def cluster_interfaces(
             min_seq_id=1.0,
             coverage=0.8,
             coverage_mode=0,
-            # NOTE: The following arguments were taken from: https://github.com/soedinglab/MMseqs2/issues/373#issuecomment-728166556
-            k_mer_length=6,
-            spaced_k_mer_pattern="11011101",
+            extra_parameters={
+                # 7 or 8 should work best, something to test
+                "-k": 8,
+                # there is currently an issue in mmseqs2 with nucleotide search and spaced k-mers
+                "--spaced-kmer-mode": 0,
+                # see above
+                "--cluster-reassign": 1,
+            },
         )
 
     if not peptide_chain_cluster_mapping:
@@ -779,9 +792,30 @@ def cluster_interfaces(
             min_seq_id=1.0,
             coverage=0.8,
             coverage_mode=0,
-            # NOTE: The following arguments were taken from: https://github.com/soedinglab/MMseqs2/issues/373#issuecomment-728166556
-            k_mer_length=6,
-            spaced_k_mer_pattern="11011101",
+            # some of these parameters are from the spacepharer optimized parameters
+            # these were for short CRISPR spacer recognition, so they should work well for arbitrary peptides
+            # this is an ad hoc solution, with some recent new introductions like the ungapped prefilter
+            extra_parameters={
+                # spacepharer optimized parameters
+                "--gap-open": 16,
+                "--gap-extend": 2,
+                "--sub-mat": "VTML40.out",
+                # we would like to try using ungapped prefilter mode to avoid
+                # minimum consecutive k-mer match restrictions, but the cluster workflow doesn't expose this yet
+                # let's use a real small k-mer size instead
+                # "--prefilter-mode": 1,
+                "-k": 5,
+                "--spaced-kmer-mode": 0,
+                # Don't try suppressing FP hits since the peptides are too short
+                "--mask": 0,
+                "--comp-bias-corr": 0,
+                # let more things through the prefilter
+                "--min-ungapped-score": 5,
+                # Try disabling completely with "inf"?
+                "-e": 1,
+                # see above
+                "--cluster-reassign": 1,
+            },
         )
 
     if not ligand_chain_cluster_mapping:
diff --git a/scripts/filter_pdb_mmcifs.py b/scripts/filter_pdb_mmcifs.py
diff --git a/tests/test_data_parsing.py b/tests/test_data_parsing.py