Add support for plotting pLDDTs in output mmCIFs for visualization (#178)

amorehead · web-flow · commit e3c313ccd0f3 · 2024-08-22T11:57:33.000-07:00
* Update mmcif_writing.py

* Update alphafold3.py

* Update alphafold3.py

* Update alphafold3.py
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -4286,7 +4286,8 @@ def _protein_structure_from_feature(
 
     return builder.get_structure()
 
-ScoredSample = Tuple[int, Float["b m 3"], Float[" b"], Float[" b"]] # type: ignore
+Sample = Tuple[Float["b m 3"], Float["b pde n n"], Float["b m"], Float["b dist n n"]]
+ScoredSample = Tuple[int, Float["b m 3"], Float["b m"], Float[" b"], Float[" b"]]
 
 class ScoreDetails(NamedTuple):
     best_gpde_index: int
@@ -4799,27 +4800,22 @@ def compute_unresolved_rasa(
     def compute_model_selection_score(
         self,
         batch: BatchedAtomInput,
-        samples: List[Tuple[
-            Float["b m 3"],
-            Float["b pde n n"],
-            Float["b dist n n"]
-        ]],
+        samples: List[Sample],
         is_fine_tuning: bool = None,
         return_details: bool = False,
         return_unweighted_scores: bool = False,
         compute_rasa: bool = False,
         unresolved_cid: List[int] | None = None,
-        unresolved_residue_mask: Bool["b n"] | None = None,  
+        unresolved_residue_mask: Bool["b n"] | None = None,
         missing_chain_index: int = -1,
     ) -> Float[" b"] | ScoreDetails:
-
         """Compute the model selection score for an input batch and corresponding (sampled) atom
         positions.
 
         :param batch: A batch of `AtomInput` data.
         :param samples: A list of sampled atom positions along with their predicted distance errors and labels.
         :param is_fine_tuning: is fine tuning
-        :param return_top_model: return the top-ranked sample
+        :param return_details: return the top model and its score
         :param return_unweighted_scores: return the unweighted scores (i.e., lDDT)
         :param compute_rasa: compute the relative solvent accessible surface area (RASA) for unresolved proteins
         :param unresolved_cid: unresolved chain ids
@@ -4861,7 +4857,7 @@ def compute_model_selection_score(
         scored_samples: List[ScoredSample] = []
 
         for sample_idx, sample in enumerate(samples):
-            atom_pos_pred, pde_logits, dist_logits = sample
+            atom_pos_pred, pde_logits, plddt, dist_logits = sample
 
             weighted_lddt = self.compute_weighted_lddt(
                 atom_pos_pred,
@@ -4886,50 +4882,51 @@ def compute_model_selection_score(
                 tok_repr_atm_mask,
             )
 
-            scored_samples.append((sample_idx, atom_pos_pred, weighted_lddt, gpde))
+            scored_samples.append((sample_idx, atom_pos_pred, plddt, weighted_lddt, gpde))
 
         # quick collate
 
         *_, all_weighted_lddt, all_gpde = zip(*scored_samples)
 
-        # rank by batch-averaged gPDE
+        # rank by batch-averaged minimum gPDE
 
-        best_gpde_index = torch.stack(all_gpde).mean(dim = -1).argmax().item()
+        best_gpde_index = torch.stack(all_gpde).mean(dim=-1).argmin().item()
 
-        # rank by batch-averaged lDDT
+        # rank by batch-averaged maximum lDDT
 
-        best_lddt_index = torch.stack(all_weighted_lddt).mean(dim = -1).argmax().item()
+        best_lddt_index = torch.stack(all_weighted_lddt).mean(dim=-1).argmax().item()
 
         # some weighted score
 
         model_selection_score = (
-            scored_samples[best_gpde_index][-2] +
-            scored_samples[best_lddt_index][-2]
+            scored_samples[best_gpde_index][-2] + scored_samples[best_lddt_index][-2]
         ) / 2
 
         if not return_details:
             return model_selection_score
 
         score_details = ScoreDetails(
-            best_gpde_index = best_gpde_index,
-            best_lddt_index = best_lddt_index,
-            score = model_selection_score,
-            scored_samples = scored_samples
+            best_gpde_index=best_gpde_index,
+            best_lddt_index=best_lddt_index,
+            score=model_selection_score,
+            scored_samples=scored_samples,
         )
 
         return score_details
 
     @typecheck
     def forward(
-        self,
-        alphafolds: Tuple[Alphafold3],
-        batched_atom_inputs: BatchedAtomInput,
-        **kwargs
+        self, alphafolds: Tuple[Alphafold3], batched_atom_inputs: BatchedAtomInput, **kwargs
     ) -> Float[" b"] | ScoreDetails:
+        """Make model selections by computing the model selection score.
 
-        """
-        give this a tuple of all the Alphafolds and a batch of atomic inputs
-        it will select the best one by the model selection score by returning the index of the Tuple
+        NOTE: Give this function a tuple of `Alphafold3` modules and a batch of atomic inputs, and it will
+        select the best module via the model selection score by returning the index of the corresponding tuple.
+
+        :param alphafolds: Tuple of `Alphafold3` modules
+        :param batched_atom_inputs: A batch of `AtomInput` data
+        :param kwargs: Additional keyword arguments
+        :return: Model selection score
         """
 
         samples = []
@@ -4940,19 +4937,15 @@ def forward(
 
                 pred_atom_pos, logits = alphafold(
                     **batched_atom_inputs.model_forward_dict(),
-                    return_loss = False,
-                    return_confidence_head_logits = True,
-                    return_distogram_head_logits = True
+                    return_loss=False,
+                    return_confidence_head_logits=True,
+                    return_distogram_head_logits=True,
                 )
+                plddt = self.compute_confidence_score.compute_plddt(logits.plddt)
 
-                samples.append((pred_atom_pos, logits.pde, logits.distance))
-
+                samples.append((pred_atom_pos, logits.pde, plddt, logits.distance))
 
-        scores = self.compute_model_selection_score(
-            batched_atom_inputs,
-            samples = samples,
-            **kwargs
-        )
+        scores = self.compute_model_selection_score(batched_atom_inputs, samples=samples, **kwargs)
 
         return scores
 
@@ -6083,11 +6076,11 @@ def forward(
                 is_nucleotide = is_rna | is_dna
                 is_polymer = is_protein | is_rna | is_dna
 
-                is_any_nucleotide_pair = einx.logical_and(
-                    '... i, ... j -> ... i j', torch.ones_like(is_nucleotide), is_nucleotide
+                is_any_nucleotide_pair = repeat(
+                    is_nucleotide, '... j -> ... i j', i=is_nucleotide.shape[-1]
                 )
-                is_any_polymer_pair = einx.logical_and(
-                    '... i, ... j -> ... i j', torch.ones_like(is_polymer), is_polymer
+                is_any_polymer_pair = repeat(
+                    is_polymer, '... j -> ... i j', i=is_polymer.shape[-1]
                 )
 
                 inclusion_radius = torch.where(
@@ -6098,10 +6091,8 @@ def forward(
 
                 is_token_center_atom = torch.zeros_like(atom_pos[..., 0], dtype=torch.bool)
                 is_token_center_atom[torch.arange(batch_size).unsqueeze(1), molecule_atom_indices] = True
-                is_any_token_center_atom_pair = einx.logical_and(
-                    '... i, ... j -> ... i j',
-                    torch.ones_like(is_token_center_atom),
-                    is_token_center_atom,
+                is_any_token_center_atom_pair = repeat(
+                    is_token_center_atom, '... j -> ... i j', i=is_token_center_atom.shape[-1]
                 )
 
                 # compute masks, avoiding self term
diff --git a/alphafold3_pytorch/data/mmcif_writing.py b/alphafold3_pytorch/data/mmcif_writing.py
@@ -2,10 +2,7 @@
 
 import numpy as np
 
-from alphafold3_pytorch.common.biomolecule import (
-    _from_mmcif_object,
-    to_mmcif,
-)
+from alphafold3_pytorch.common.biomolecule import _from_mmcif_object, to_mmcif
 from alphafold3_pytorch.data.data_pipeline import get_assembly
 from alphafold3_pytorch.data.mmcif_parsing import MmcifObject, parse_mmcif_object
 from alphafold3_pytorch.utils.utils import exists
@@ -27,8 +24,10 @@ def write_mmcif(
     insert_orig_atom_names: bool = True,
     insert_alphafold_mmcif_metadata: bool = True,
     sampled_atom_positions: np.ndarray | None = None,
+    b_factors: np.ndarray | None = None,
 ):
-    """Write a BioPython `Structure` object to an mmCIF file using an intermediate `Biomolecule` object."""
+    """Write a BioPython `Structure` object to an mmCIF file using an intermediate `Biomolecule`
+    object."""
     biomol = (
         _from_mmcif_object(mmcif_object)
         if "assembly" in mmcif_object.file_id
@@ -41,6 +40,12 @@ def write_mmcif(
             f"but got {sampled_atom_positions.shape}."
         )
         biomol.atom_positions[atom_mask] = sampled_atom_positions
+        if exists(b_factors):
+            assert biomol.b_factors[atom_mask].shape == b_factors.shape, (
+                f"Expected B-factors to have shape {biomol.b_factors[atom_mask].shape}, "
+                f"but got {b_factors.shape}."
+            )
+            biomol.b_factors[atom_mask] = b_factors
     unique_res_atom_names = biomol.unique_res_atom_names if insert_orig_atom_names else None
     mmcif_string = to_mmcif(
         biomol,