lucidrains
diff --git a/‎README.md‎
Lines changed: 10 additions & 8 deletions b/‎README.md‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎scripts/cluster_pdb_mmcifs.py‎ renamed to ‎scripts/cluster_pdb_train_mmcifs.py‎
Lines changed: 7 additions & 9 deletions b/‎scripts/cluster_pdb_mmcifs.py‎ renamed to ‎scripts/cluster_pdb_train_mmcifs.py‎
Lines changed: 7 additions & 9 deletions
@@ -212,7 +212,7 @@ assert sampled_atom_pos.shape == (1, (5 + 4), 3)
 
 ### PDB dataset curation
 
-To acquire the AlphaFold 3 PDB dataset, first download all first-assembly (and asymmetric unit) complexes in the Protein Data Bank (PDB), and then preprocess them with the script referenced below. The PDB can be downloaded from the RCSB: https://www.wwpdb.org/ftp/pdb-ftp-sites#rcsbpdb. The two Python scripts below (i.e., `filter_pdb_mmcifs.py` and `cluster_pdb_mmcifs.py`) assume you have downloaded the PDB in the **mmCIF file format**, placing its first-assembly and asymmetric unit mmCIF files at `data/pdb_data/unfiltered_assembly_mmcifs/` and `data/pdb_data/unfiltered_asym_mmcifs/`, respectively.
+To acquire the AlphaFold 3 PDB dataset, first download all first-assembly (and asymmetric unit) complexes in the Protein Data Bank (PDB), and then preprocess them with the scripts referenced below. The PDB can be downloaded from the RCSB: https://www.wwpdb.org/ftp/pdb-ftp-sites#rcsbpdb. The Python scripts below (i.e., `filter_pdb_{train,val}_mmcifs.py` and `cluster_pdb_{train,val}_mmcifs.py`) assume you have downloaded the PDB in the **mmCIF file format**, placing its first-assembly and asymmetric unit mmCIF files at `data/pdb_data/unfiltered_assembly_mmcifs/` and `data/pdb_data/unfiltered_asym_mmcifs/`, respectively.
 
 For reproducibility, we recommend downloading the PDB using AWS snapshots (e.g., `20240101`). To do so, refer to [AWS's documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html) to set up the AWS CLI locally. Alternatively, on the RCSB website, navigate down to "Download Protocols", and follow the download instructions depending on your location.
 
@@ -263,25 +263,27 @@ find data/ccd_data/ -type f -name "*.gz" -exec gzip -d {} \;
 
 ### PDB dataset filtering
 
-Then run the following with `pdb_assembly_dir`, `pdb_asym_dir`, `ccd_dir`, and `mmcif_output_dir` replaced with the locations of your local copies of the first-assembly PDB, asymmetric unit PDB, CCD, and your desired dataset output directory (i.e., `./data/pdb_data/unfiltered_assembly_mmcifs/`, `./data/pdb_data/unfiltered_asym_mmcifs/`, `./data/ccd_data/`, and `./data/pdb_data/mmcifs/`).
+Then run the following with `pdb_assembly_dir`, `pdb_asym_dir`, `ccd_dir`, and `mmcif_output_dir` replaced with the locations of your local copies of the first-assembly PDB, asymmetric unit PDB, CCD, and your desired dataset output directory (i.e., `./data/pdb_data/unfiltered_assembly_mmcifs/`, `./data/pdb_data/unfiltered_asym_mmcifs/`, `./data/ccd_data/`, and `./data/pdb_data/{train,val}_mmcifs/`):
 ```bash
-python scripts/filter_pdb_mmcifs.py --mmcif_assembly_dir <pdb_assembly_dir> --mmcif_asym_dir <pdb_asym_dir> --ccd_dir <ccd_dir> --output_dir <mmcif_output_dir>
+python scripts/filter_pdb_train_mmcifs.py --mmcif_assembly_dir <pdb_assembly_dir> --mmcif_asym_dir <pdb_asym_dir> --ccd_dir <ccd_dir> --output_dir <mmcif_output_dir>
+python scripts/filter_pdb_val_mmcifs.py --mmcif_assembly_dir <pdb_assembly_dir> --mmcif_asym_dir <pdb_asym_dir> --output_dir <mmcif_output_dir>
 ```
 
-See the script for more options. Each first-assembly mmCIF that successfully passes
+See the scripts for more options. Each first-assembly mmCIF that successfully passes
 all processing steps will be written to `mmcif_output_dir` within a subdirectory
 named according to the mmCIF's second and third PDB ID characters (e.g. `5c`).
 
 ### PDB dataset clustering
 
-Next, run the following with `mmcif_dir` and `clustering_output_dir` replaced, respectively, with your local output directory created using the dataset filtering script above and with your desired clustering output directory (i.e., `./data/pdb_data/mmcifs/` and `./data/pdb_data/data_caches/clusterings/`):
+Next, run the following with `mmcif_dir` and `{train,val}_clustering_output_dir` replaced, respectively, with your local output directory created using the dataset filtering scripts above and with your desired clustering output directories (i.e., `./data/pdb_data/{train,val}_mmcifs/` and `./data/pdb_data/data_caches/{train,val}_clusterings/`):
 ```bash
-python scripts/cluster_pdb_mmcifs.py --mmcif_dir <mmcif_dir> --output_dir <clustering_output_dir> --clustering_filtered_pdb_dataset
+python scripts/cluster_pdb_train_mmcifs.py --mmcif_dir <mmcif_dir> --output_dir <train_clustering_output_dir> --clustering_filtered_pdb_dataset
+python scripts/cluster_pdb_val_mmcifs.py --mmcif_dir <mmcif_dir> --reference_clustering_dir <train_clustering_output_dir> --output_dir <val_clustering_output_dir> --clustering_filtered_pdb_dataset
 ```
 
-**Note**: The `--clustering_filtered_pdb_dataset` flag is recommended when clustering the filtered PDB dataset as curated using the script above, as this flag will enable faster runtimes in this context (since filtering leaves each chain's residue IDs 1-based). However, this flag must **not** be provided when clustering other (i.e., non-PDB) datasets of mmCIF files. Otherwise, interface clustering may be performed incorrectly, as these datasets' mmCIF files may not use strict 1-based residue indexing for each chain.
+**Note**: The `--clustering_filtered_pdb_dataset` flag is recommended when clustering the filtered PDB dataset as curated using the scripts above, as this flag will enable faster runtimes in this context (since filtering leaves each chain's residue IDs 1-based). However, this flag must **not** be provided when clustering other (i.e., non-PDB) datasets of mmCIF files. Otherwise, interface clustering may be performed incorrectly, as these datasets' mmCIF files may not use strict 1-based residue indexing for each chain.
 
-**Note**: One can also download preprocessed (i.e., filtered) mmCIF files (~20GB, comprising 148k complexes) and chain/interface clustering files (~1GB) for the PDB's `20240101` AWS snapshot via a [shared OneDrive folder](https://mailmissouri-my.sharepoint.com/:f:/g/personal/acmwhb_umsystem_edu/EqU8tjUmmKxJr-FAlq4tzaIBi2TIBtmw5Vl3k_kmgNlepA?e=mzlyv6).
+**Note**: One can instead download preprocessed (i.e., filtered) mmCIF (`train`/`val`) files (~20GB, comprising 148k complexes) and chain/interface clustering (`train`/`val`) files (~1GB) for the PDB's `20240101` AWS snapshot via a [shared OneDrive folder](https://mailmissouri-my.sharepoint.com/:f:/g/personal/acmwhb_umsystem_edu/EqU8tjUmmKxJr-FAlq4tzaIBi2TIBtmw5Vl3k_kmgNlepA?e=mzlyv6). Each of these `tar` archives should be extracted within the `data/pdb_data/` directory.
 
 ## Contributing
 
 
@@ -1,7 +1,7 @@
 # %% [markdown]
-# # Clustering AlphaFold 3 PDB Dataset
+# # Clustering AlphaFold 3 PDB Training Dataset
 #
-# For clustering AlphaFold 3's PDB dataset, we follow the clustering procedure outlined in Abramson et al (2024).
+# For clustering AlphaFold 3's PDB training dataset, we follow the clustering procedure outlined in Abramson et al. (2024).
 #
 # In order to reduce bias in the training and evaluation sets, clustering was performed on PDB chains and interfaces, as
 # follows.
@@ -147,7 +147,6 @@ def convert_modified_residue_three_to_one(
         return mapped_residue, "ligand"
 
 
-@typecheck
 def parse_chain_sequences_and_interfaces_from_mmcif(
     filepath: str,
     assume_one_based_residue_ids: bool = False,
@@ -265,7 +264,6 @@ def parse_chain_sequences_and_interfaces_from_mmcif(
     return sequences, interface_chain_ids
 
 
-@typecheck
 def parse_chain_sequences_and_interfaces_from_mmcif_file(
     cif_filepath: str, assume_one_based_residue_ids: bool = False
 ) -> Tuple[str, Dict[str, str], Set[str]]:
@@ -682,24 +680,24 @@ def cluster_interfaces(
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Cluster chains and interfaces within the AlphaFold 3 PDB dataset's filtered mmCIF files."
+        description="Cluster chains and interfaces within the AlphaFold 3 PDB training dataset's filtered mmCIF files."
     )
     parser.add_argument(
         "--mmcif_dir",
         type=str,
-        default=os.path.join("data", "pdb_data", "mmcifs"),
+        default=os.path.join("data", "pdb_data", "train_mmcifs"),
         help="Path to the input directory containing (filtered) mmCIF files.",
     )
     parser.add_argument(
         "--output_dir",
         type=str,
-        default=os.path.join("data", "pdb_data", "data_caches", "clusterings"),
-        help="Path to the output FASTA file.",
+        default=os.path.join("data", "pdb_data", "data_caches", "train_clusterings"),
+        help="Path to the output clustering directory.",
     )
     parser.add_argument(
         "--clustering_filtered_pdb_dataset",
         action="store_true",
-        help="Whether the clustering is being performed on the filtered PDB dataset.",
+        help="Whether the clustering is being performed on a filtered PDB dataset.",
     )
     parser.add_argument(
         "-n",