Add MSA dataloading bug fixes (#179)

amorehead · web-flow · commit 846d7af40c94 · 2024-08-22T19:47:05.000-07:00
* Update data_pipeline.py

* Update msa_parsing.py

* Update inputs.py
diff --git a/alphafold3_pytorch/data/data_pipeline.py b/alphafold3_pytorch/data/data_pipeline.py
@@ -52,15 +52,17 @@ def make_msa_mask(features: FeatureDict) -> FeatureDict:
 @typecheck
 def make_msa_features(
     msas: Dict[str, msa_parsing.Msa | None],
-    chain_id_to_chem_types: Dict[str, List[int]],
+    chain_id_to_residue: Dict[str, Dict[str, List[int]]],
+    ligand_chemtype_index: int = 3,
     raise_missing_exception: bool = False,
 ) -> FeatureDict:
     """
     Construct a feature dictionary of MSA features.
     From: https://github.com/aqlaboratory/openfold/blob/6f63267114435f94ac0604b6d89e82ef45d94484/openfold/data/data_pipeline.py#L224
 
     :param msas: The mapping of chain IDs to lists of (optional) MSAs for each chain.
-    :param chain_id_to_chem_types: The mapping of chain IDs to residue (integer) chemical types.
+    :param chain_id_to_residue: The mapping of chain IDs to residue information.
+    :param ligand_chemtype_index: The chemical type index at or above which a residue is treated as a ligand.
     :param raise_missing_exception: Whether to raise an exception if no MSAs are provided for any chain.
     :return: The MSA feature dictionary.
     """
@@ -70,7 +72,7 @@ def make_msa_features(
     # Infer MSA metadata.
     max_alignments = 1
     for msa in msas.values():
-        if exists(msa.sequences) and exists(msa.sequences[0]):
+        if exists(msa) and exists(msa.sequences) and exists(msa.sequences[0]):
             max_alignments = max(max_alignments, len(msa.sequences) if msa else 1)
 
     # Collect MSAs.
@@ -84,10 +86,20 @@ def make_msa_features(
         species_ids = []
         seen_sequences = set()
 
-        chain_chem_types = chain_id_to_chem_types[chain_id]
-        num_res = len(chain_chem_types)
+        chain_chemtype = chain_id_to_residue[chain_id]["chemtype"]
+        chain_residue_index = chain_id_to_residue[chain_id]["residue_index"]
 
-        msa_residue_constants = get_residue_constants(msa.msa_type.replace("protein", "peptide"))
+        num_res = len(chain_chemtype)
+        assert num_res == len(chain_residue_index), (
+            f"Residue features count mismatch for chain {chain_id}: "
+            f"{num_res} != {len(chain_residue_index)}"
+        )
+
+        msa_residue_constants = (
+            get_residue_constants(msa.msa_type.replace("protein", "peptide"))
+            if exists(msa)
+            else None
+        )
 
         gap_ids = [[GAP_ID] * num_res]
         deletion_values = [[0] * num_res]
@@ -98,9 +110,11 @@ def make_msa_features(
         elif not msa:
             # Pad the MSA to the maximum number of alignments
             # if the chain does not have any associated alignments.
-            int_msa_list.append(gap_ids * max_alignments)
-            deletion_matrix_list.append(deletion_values * max_alignments)
-            species_ids_list.append(species * max_alignments)
+            int_msa_list.append(torch.tensor(gap_ids * max_alignments, dtype=torch.long))
+            deletion_matrix_list.append(
+                torch.tensor(deletion_values * max_alignments, dtype=torch.float32)
+            )
+            species_ids_list.append(np.array(species * max_alignments, dtype=object))
             continue
 
         for sequence_index, sequence in enumerate(msa.sequences):
@@ -109,15 +123,26 @@ def make_msa_features(
             seen_sequences.add(sequence)
 
             # Convert the MSA to integers while handling
-            # ligands and (unmappable) modified polymer residues.
+            # ligands and modified polymer residues.
             msa_res_types = []
             msa_deletion_values = []
 
-            polymer_res_index = 0
+            polymer_residue_index = -1
 
-            for chem_type in chain_chem_types:
-                is_ligand = chem_type == 3
-                chem_residue_constants = get_residue_constants(res_chem_index=chem_type)
+            for idx, (chemtype, residue_index) in enumerate(
+                zip(chain_chemtype, chain_residue_index)
+            ):
+                is_polymer = chemtype < ligand_chemtype_index
+                is_ligand = not is_polymer
+
+                chem_residue_constants = get_residue_constants(res_chem_index=chemtype)
+
+                # NOTE: Tokens of an atomized (modified) polymer residue are expected to share one residue index,
+                # so the polymer residue index only advances on the first token of each new polymer residue.
+                increment_index = (
+                    0 < idx < num_res and chain_residue_index[idx - 1] != residue_index
+                )
+                polymer_residue_index += 1 if is_polymer and (idx == 0 or increment_index) else 0
 
                 if is_ligand:
                     # NOTE: For ligands, we use the unknown amino acid type.
@@ -131,18 +156,20 @@ def make_msa_features(
                     if chem_residue_constants != msa_residue_constants:
                         msa_res_type = chem_residue_constants.restype_num
                     else:
-                        res = sequence[polymer_res_index]
+                        res = sequence[polymer_residue_index]
                         msa_res_type = msa_residue_constants.MSA_CHAR_TO_ID.get(
                             res, msa_residue_constants.restype_num
                         )
 
-                    msa_deletion_value = msa.deletion_matrix[sequence_index][polymer_res_index]
-
-                    polymer_res_index += 1
+                    msa_deletion_value = msa.deletion_matrix[sequence_index][polymer_residue_index]
 
                 msa_res_types.append(msa_res_type)
                 msa_deletion_values.append(msa_deletion_value)
 
+            assert polymer_residue_index + 1 == len(
+                sequence
+            ), f"Polymer residue index length mismatch for MSA chain {chain_id}: {polymer_residue_index + 1} != {len(sequence)}"
+
             int_msa.append(msa_res_types)
             deletion_matrix.append(msa_deletion_values)
 
diff --git a/alphafold3_pytorch/data/msa_parsing.py b/alphafold3_pytorch/data/msa_parsing.py
@@ -121,6 +121,7 @@ def __len__(self):
 
     def truncate(self, max_seqs: int):
         """Truncates the MSA to the first `max_seqs` sequences."""
+        max_seqs = min(len(self.sequences), max_seqs)
         return Msa(
             sequences=self.sequences[:max_seqs],
             deletion_matrix=self.deletion_matrix[:max_seqs],
@@ -130,6 +131,7 @@ def truncate(self, max_seqs: int):
 
     def random_truncate(self, max_seqs: int):
         """Truncates the MSA to a random range of `max_seqs` sequences."""
+        max_seqs = min(len(self.sequences), max_seqs)
         start = random.randint(0, len(self.sequences) - max_seqs)  # nosec
         return Msa(
             sequences=self.sequences[start : start + max_seqs],
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -2264,7 +2264,7 @@ def find_mismatched_symmetry(
 def load_msa_from_msa_dir(
     msa_dir: str | None,
     file_id: str,
-    chain_id_to_chem_types: Dict[str, List[int]],
+    chain_id_to_residue: Dict[str, Dict[str, List[int]]],
     max_msas_per_chain: int | None = None,
     randomly_truncate: bool = True,
     raise_missing_exception: bool = False,
@@ -2279,16 +2279,16 @@ def load_msa_from_msa_dir(
         return {}
 
     msas = {}
-    for chain_id in chain_id_to_chem_types:
+    for chain_id in chain_id_to_residue:
         msa_fpaths = glob.glob(os.path.join(msa_dir, f"{file_id}{chain_id}_*.a3m"))
 
         if not msa_fpaths:
             msas[chain_id] = None
             continue
 
         # NOTE: A single chain-specific MSA file contains alignments for all polymer residues in the chain,
-        # but the ligand (and some "unmappable" modified polymer residues) are not included in the MSA file
-        # and therefore must be manually inserted into the MSAs as unknown amino acid residues.
+        # but the chain's ligands are not included in the MSA file and therefore must be manually inserted
+        # into the MSAs as unknown amino acid residues.
         assert len(msa_fpaths) == 1, (
             f"{len(msa_fpaths)} MSA files found for chain {chain_id} of file {file_id}. "
             "Please ensure that one MSA file is present for each chain."
@@ -2310,7 +2310,7 @@ def load_msa_from_msa_dir(
             )
             msas[chain_id] = msa
 
-    features = make_msa_features(msas, chain_id_to_chem_types)
+    features = make_msa_features(msas, chain_id_to_residue)
     features = make_msa_mask(features)
 
     return features
@@ -2606,10 +2606,12 @@ def pdb_input_to_molecule_input(
 
     # concat for all of additional_molecule_feats
 
+    # NOTE: `Biomolecule.residue_index` is 1-based originally
+    residue_index = torch.from_numpy(biomol.residue_index) - 1
+
     additional_molecule_feats = torch.stack(
         (
-            # NOTE: `Biomolecule.residue_index` is 1-based originally
-            torch.from_numpy(biomol.residue_index) - 1,
+            residue_index,
             torch.arange(num_tokens),
             torch.from_numpy(biomol.chain_index),
             entity_ids,
@@ -2790,15 +2792,63 @@ def pdb_input_to_molecule_input(
     num_present_atoms = mol_total_atoms - num_missing_atom_indices
     assert num_present_atoms == int(biomol.atom_mask.sum())
 
+    # handle `atom_indices_for_frame` for the PAE
+
+    atom_indices_for_frame = tensor(
+        [default(indices, (-1, -1, -1)) for indices in atom_indices_for_frame]
+    )
+
+    # build offsets for all indices
+
+    # derive `atom_lens` based on `one_token_per_atom`, for ligands and modified biomolecules
+    atoms_per_molecule = tensor([mol.GetNumAtoms() for mol in molecules])
+    ones = torch.ones_like(atoms_per_molecule)
+
+    # `is_molecule_mod` can either be
+    # 1. Bool['n'], in which case it will only be used for determining `one_token_per_atom`, or
+    # 2. Bool['n num_mods'], where it will be passed to Alphafold3 for molecule modification embeds
+    is_molecule_mod = tensor(is_molecule_mod)
+    is_molecule_any_mod = False
+
+    if is_molecule_mod.ndim == 2:
+        is_molecule_any_mod = is_molecule_mod[unique_chain_residue_indices].any(dim=-1)
+    else:
+        is_molecule_any_mod = is_molecule_mod[unique_chain_residue_indices]
+
+    # get `one_token_per_atom`
+    # default to what the paper did, which is ligands and any modified biomolecule
+    is_ligand = is_molecule_types[unique_chain_residue_indices][..., IS_LIGAND_INDEX]
+    one_token_per_atom = is_ligand | is_molecule_any_mod
+
+    assert len(molecules) == len(one_token_per_atom)
+
+    # derive the number of repeats needed to expand molecule lengths to token lengths
+    token_repeats = torch.where(one_token_per_atom, atoms_per_molecule, ones)
+
+    # craft offsets for all atom indices
+    atom_indices_offsets = repeat_interleave(
+        exclusive_cumsum(atoms_per_molecule), token_repeats, dim=0
+    )
+
+    # offset only positive atom indices
+    distogram_atom_indices = offset_only_positive(distogram_atom_indices, atom_indices_offsets)
+    molecule_atom_indices = offset_only_positive(molecule_atom_indices, atom_indices_offsets)
+    atom_indices_for_frame = offset_only_positive(
+        atom_indices_for_frame, atom_indices_offsets[..., None]
+    )
+
     # retrieve multiple sequence alignments (MSAs) for each chain
     # NOTE: if they are not locally available, `Nones` will be used
     msa_chain_ids = list(dict.fromkeys(biomol.chain_id.tolist()))
-    chain_id_to_chem_types = {
-        chain_id: biomol.chemtype[biomol.chain_id == chain_id].tolist()
+    chain_id_to_residue = {
+        chain_id: {
+            "chemtype": biomol.chemtype[biomol.chain_id == chain_id].tolist(),
+            "residue_index": residue_index[biomol.chain_id == chain_id].tolist(),
+        }
         for chain_id in msa_chain_ids
     }
     msa_features = load_msa_from_msa_dir(
-        i.msa_dir, file_id, chain_id_to_chem_types, max_msas_per_chain=i.max_msas_per_chain
+        i.msa_dir, file_id, chain_id_to_residue, max_msas_per_chain=i.max_msas_per_chain
     )
 
     msa = msa_features.get("msa")
@@ -2817,6 +2867,10 @@ def pdb_input_to_molecule_input(
     num_msas = len(msa) if exists(msa) else 1
 
     if exists(msa):
+        assert (
+            msa.shape[-1] == num_tokens
+        ), f"The number of tokens in the MSA ({msa.shape[-1]}) does not match the number of tokens in the biomolecule ({num_tokens}). "
+
         has_deletion = torch.clip(msa_features["deletion_matrix"], 0.0, 1.0)
         deletion_value = torch.atan(msa_features["deletion_matrix"] / 3.0) * (2.0 / torch.pi)
 
@@ -2883,51 +2937,6 @@ def pdb_input_to_molecule_input(
         is_resolved_label = ((resolution >= 0.1) & (resolution <= 3.0)).item()
         resolved_labels = torch.full((num_atoms,), is_resolved_label, dtype=torch.long)
 
-    # handle `atom_indices_for_frame` for the PAE
-
-    atom_indices_for_frame = tensor(
-        [default(indices, (-1, -1, -1)) for indices in atom_indices_for_frame]
-    )
-
-    # build offsets for all indices
-
-    # derive `atom_lens` based on `one_token_per_atom`, for ligands and modified biomolecules
-    atoms_per_molecule = tensor([mol.GetNumAtoms() for mol in molecules])
-    ones = torch.ones_like(atoms_per_molecule)
-
-    # `is_molecule_mod` can either be
-    # 1. Bool['n'], in which case it will only be used for determining `one_token_per_atom`, or
-    # 2. Bool['n num_mods'], where it will be passed to Alphafold3 for molecule modification embeds
-    is_molecule_mod = tensor(is_molecule_mod)
-    is_molecule_any_mod = False
-
-    if is_molecule_mod.ndim == 2:
-        is_molecule_any_mod = is_molecule_mod[unique_chain_residue_indices].any(dim=-1)
-    else:
-        is_molecule_any_mod = is_molecule_mod[unique_chain_residue_indices]
-
-    # get `one_token_per_atom`
-    # default to what the paper did, which is ligands and any modified biomolecule
-    is_ligand = is_molecule_types[unique_chain_residue_indices][..., IS_LIGAND_INDEX]
-    one_token_per_atom = is_ligand | is_molecule_any_mod
-
-    assert len(molecules) == len(one_token_per_atom)
-
-    # derive the number of repeats needed to expand molecule lengths to token lengths
-    token_repeats = torch.where(one_token_per_atom, atoms_per_molecule, ones)
-
-    # craft offsets for all atom indices
-    atom_indices_offsets = repeat_interleave(
-        exclusive_cumsum(atoms_per_molecule), token_repeats, dim=0
-    )
-
-    # offset only positive atom indices
-    distogram_atom_indices = offset_only_positive(distogram_atom_indices, atom_indices_offsets)
-    molecule_atom_indices = offset_only_positive(molecule_atom_indices, atom_indices_offsets)
-    atom_indices_for_frame = offset_only_positive(
-        atom_indices_for_frame, atom_indices_offsets[..., None]
-    )
-
     # create molecule input
 
     molecule_input = MoleculeInput(