from functools import partial

import torch
+ from beartype.typing import Literal
from torch import tensor
from torch.nn import Module

- from beartype.typing import Literal
-
- from alphafold3_pytorch.tensor_typing import (
-     typecheck,
-     Float,
-     Int
- )
-
- from alphafold3_pytorch.common.biomolecule import (
-     get_residue_constants,
- )
-
- from alphafold3_pytorch.inputs import (
-     IS_PROTEIN,
- )
+ from alphafold3_pytorch.common.biomolecule import get_residue_constants
+ from alphafold3_pytorch.inputs import IS_PROTEIN
+ from alphafold3_pytorch.tensor_typing import Float, Int, typecheck

# functions

@@ -28,41 +17,48 @@ def join(arr, delimiter = ''):  # just redo an ugly part of python

# constants

- aa_constants = get_residue_constants(res_chem_index = IS_PROTEIN)
- restypes_index = dict(enumerate(aa_constants.restypes))
+ aa_constants = get_residue_constants(res_chem_index=IS_PROTEIN)
+ restypes = aa_constants.restypes + ["X"]

# class

+
class ESMWrapper(Module):
+     """A wrapper for the ESM model to provide PLM embeddings."""
+
    def __init__(
        self,
-         esm_name,
-         repr_layer = 33
+         esm_name: str,
+         repr_layer: int = 33,
    ):
        super().__init__()
        import esm
+
        self.repr_layer = repr_layer
        self.model, alphabet = esm.pretrained.load_model_and_alphabet_hub(esm_name)
        self.batch_converter = alphabet.get_batch_converter()

        self.embed_dim = self.model.embed_dim
-         self.register_buffer('dummy', tensor(0), persistent = False)
+         self.register_buffer("dummy", tensor(0), persistent=False)

    @torch.no_grad()
    @typecheck
    def forward(
-         self,
-         aa_ids: Int['b n']
-     ) -> Float['b n dpe']:
+         self, aa_ids: Int["b n"]  # type: ignore
+     ) -> Float["b n dpe"]:  # type: ignore
+         """Get PLM embeddings for a batch of (pseudo-)protein sequences.

+         :param aa_ids: A batch of amino acid residue indices.
+         :return: The PLM embeddings for the input sequences.
+         """
        device, repr_layer = self.dummy.device, self.repr_layer

        sequence_data = [
            (
-                 f"molecule{i}",
-                 join([restypes_index.get(i, 'X') for i in ids]),
+                 f"molecule{mol_idx}",
+                 join([(restypes[i] if 0 <= i < len(restypes) else "X") for i in ids]),
            )
-             for i, ids in enumerate(aa_ids)
+             for mol_idx, ids in enumerate(aa_ids)
        ]

        _, _, batch_tokens = self.batch_converter(sequence_data)
@@ -80,64 +76,62 @@ def forward(

        return plm_embeddings

+
class ProstT5Wrapper(Module):
+     """A wrapper for the ProstT5 model to provide PLM embeddings."""
+
    def __init__(self):
        super().__init__()
-         from transformers import T5Tokenizer, T5EncoderModel
+         from transformers import T5EncoderModel, T5Tokenizer

-         self.register_buffer('dummy', tensor(0), persistent = False)
+         self.register_buffer("dummy", tensor(0), persistent=False)

-         self.tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case = False)
+         self.tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False)
        self.model = T5EncoderModel.from_pretrained("Rostlab/ProstT5")
        self.embed_dim = 1024

    @torch.no_grad()
    @typecheck
    def forward(
-         self,
-         aa_ids: Int['b n']
-     ) -> Float['b n dpe']:
+         self, aa_ids: Int["b n"]  # type: ignore
+     ) -> Float["b n dpe"]:  # type: ignore
+         """Get PLM embeddings for a batch of (pseudo-)protein sequences.

+         :param aa_ids: A batch of amino acid residue indices.
+         :return: The PLM embeddings for the input sequences.
+         """
        device, seq_len = self.dummy.device, aa_ids.shape[-1]

        str_sequences = [
-             join([restypes_index.get(i, 'X') for i in ids])
-             for i, ids in enumerate(aa_ids)
+             join([(restypes[i] if 0 <= i < len(restypes) else "X") for i in ids]) for ids in aa_ids
        ]

        # following the readme at https://github.com/mheinzinger/ProstT5

-         str_sequences = [join(list(re.sub(r"[UZOB]", "X", str_seq)), ' ') for str_seq in str_sequences]
+         str_sequences = [
+             join(list(re.sub(r"[UZOB]", "X", str_seq)), " ") for str_seq in str_sequences
+         ]

        # encode to ids

        inputs = self.tokenizer.batch_encode_plus(
-             str_sequences,
-             add_special_tokens = True,
-             padding = "longest",
-             return_tensors = 'pt'
+             str_sequences, add_special_tokens=True, padding="longest", return_tensors="pt"
        ).to(device)

        # forward through plm

-         embeddings = self.model(
-             inputs.input_ids,
-             attention_mask = inputs.attention_mask
-         )
+         embeddings = self.model(inputs.input_ids, attention_mask=inputs.attention_mask)

        # remove prefix

-         plm_embedding = embeddings.last_hidden_state[:, 1:(seq_len + 1)]
+         plm_embedding = embeddings.last_hidden_state[:, 1 : (seq_len + 1)]
        return plm_embedding

+
# PLM embedding type and registry

PLMRegistry = dict(
-     esm2_t33_650M_UR50D = partial(ESMWrapper, 'esm2_t33_650M_UR50D'),
-     prostT5 = ProstT5Wrapper
+     esm2_t33_650M_UR50D=partial(ESMWrapper, "esm2_t33_650M_UR50D"), prostT5=ProstT5Wrapper
)

- PLMEmbedding = Literal[
-     "esm2_t33_650M_UR50D",
-     "prostT5"
- ]
+ PLMEmbedding = Literal["esm2_t33_650M_UR50D", "prostT5"]
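
A minimal usage sketch of the registry shown above, not part of the diff itself: it assumes this module is importable as alphafold3_pytorch.plm, that the fair-esm dependency and its pretrained weights can be fetched, and the names plm and aa_ids are illustrative only.

    import torch
    from alphafold3_pytorch.plm import PLMRegistry  # assumed import path

    # look up the ESM2 wrapper by its registry key and instantiate it
    # (this downloads the pretrained checkpoint on first use)
    plm = PLMRegistry["esm2_t33_650M_UR50D"]()

    # a toy batch of residue indices of shape (batch, seq_len);
    # indices 0-19 map to the standard amino acid restypes
    aa_ids = torch.randint(0, 20, (2, 16))

    # forward runs under torch.no_grad() and returns per-residue embeddings
    # of shape (batch, seq_len, plm.embed_dim)
    embeddings = plm(aa_ids)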