Skip to content

Commit 3ef22b3

Browse files
committed
Full unmasked (WT-relative scoring)
1 parent 7c0243f commit 3ef22b3

File tree

3 files changed

+61
-159
lines changed

3 files changed

+61
-159
lines changed

pypef/plm/esm_lora_tune.py

Lines changed: 36 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
from __future__ import annotations
1818

1919
import logging
20-
from time import sleep
20+
21+
from pypef.plm.prosst_lora_tune import get_logits_from_full_seqs
2122
logger = logging.getLogger('pypef.llm.esm_lora_tune')
2223

2324
import torch
@@ -142,35 +143,37 @@ def esm_infer(xs, attention_mask, model, device: str | None = None, verbose=Fals
142143
return torch.flatten(y_preds_total)
143144

144145

145-
def esm_unmasked_reconstruction_score(
146+
def esm_unmasked_wt_score(
146147
tokenized_sequences,
147148
attention_mask,
149+
wt_input_ids,
148150
model,
149151
train: bool = False,
150152
device=None,
151-
**kws
153+
**kwargs
152154
):
153155
if device is None:
154156
device = get_device()
157+
wt_input_ids = wt_input_ids.unsqueeze(0)
155158
attention_masks = torch.Tensor(np.full(
156-
shape=np.shape(tokenized_sequences), fill_value=attention_mask)).to(torch.int64)
159+
shape=np.shape(wt_input_ids), fill_value=attention_mask)).to(torch.int64)
157160
if train:
158-
with torch.no_grad():
159-
outputs = model(tokenized_sequences.to(device), attention_masks.to(device),
160-
output_hidden_states=False)
161+
outputs = model(wt_input_ids.to(device), attention_masks.to(device),
162+
output_hidden_states=False)
161163
else:
162-
outputs = model(tokenized_sequences.to(device), attention_masks.to(device),
164+
with torch.no_grad():
165+
outputs = model(wt_input_ids.to(device), attention_masks.to(device),
163166
output_hidden_states=False)
164167
logits = outputs.logits
165-
token_probs = torch.log_softmax(logits, dim=-1)
166-
for i_s, sequence in enumerate(tokenized_sequences):
167-
for i_aa, aa in enumerate(sequence):
168+
token_probs = torch.log_softmax(logits, dim=-1).squeeze(0)
169+
for i_s, tokenized_seq in enumerate(tokenized_sequences):
170+
for i_aa, aa in enumerate(tokenized_seq):
168171
# alternative: use Tensor.index_select() function
169172
if i_aa == 0:
170-
seq_log_probs = token_probs[i_s, i_aa, aa].reshape(1)
173+
seq_log_probs = token_probs[i_aa, aa].reshape(1)
171174
else:
172175
seq_log_probs = torch.cat(
173-
(seq_log_probs, token_probs[i_s, i_aa, aa].reshape(1)), 0)
176+
(seq_log_probs, token_probs[i_aa, aa].reshape(1)), 0)
174177
if i_s == 0:
175178
log_probs = torch.sum(torch.Tensor(seq_log_probs)).reshape(1)
176179
else:
@@ -179,124 +182,6 @@ def esm_unmasked_reconstruction_score(
179182
return log_probs
180183

181184

182-
def esm_masked_pll(
183-
input_ids: torch.Tensor, # (B, L)
184-
attention_mask: torch.Tensor, # (B, L)
185-
model,
186-
mask_token_id: int,
187-
device: str | None = None,
188-
verbose: bool = False,
189-
):
190-
"""
191-
Compute true pseudo-log-likelihood (PLL) for an MLM (ESM).
192-
193-
Returns:
194-
pll_scores: torch.Tensor of shape (B,)
195-
"""
196-
if device is None:
197-
device = next(model.parameters()).device
198-
199-
input_ids = input_ids.to(device)
200-
attention_mask = attention_mask.to(device)
201-
202-
B, L = input_ids.shape
203-
pll_scores = torch.zeros(B, device=device)
204-
205-
model.eval()
206-
207-
for pos in tqdm(
208-
range(L),
209-
desc="ESM masked PLL",
210-
disable=not verbose
211-
):
212-
# Skip padding positions (position padding for all sequences in the batch)
213-
if attention_mask[:, pos].sum() == 0:
214-
continue
215-
216-
# Clone and mask position `pos`
217-
masked_input_ids = input_ids.clone()
218-
masked_input_ids[:, pos] = mask_token_id
219-
220-
with torch.no_grad():
221-
outputs = model(
222-
input_ids=masked_input_ids,
223-
attention_mask=attention_mask,
224-
)
225-
226-
logits = outputs.logits # (B, L, V)
227-
228-
# Log-probabilities at masked position
229-
log_probs = F.log_softmax(logits[:, pos, :], dim=-1)
230-
231-
# True tokens at this position
232-
true_tokens = input_ids[:, pos]
233-
234-
# Gather log-prob of the true token
235-
token_log_probs = log_probs.gather(
236-
dim=1,
237-
index=true_tokens.unsqueeze(1)
238-
).squeeze(1)
239-
240-
# Only count non-padding
241-
pll_scores += token_log_probs * attention_mask[:, pos]
242-
243-
return pll_scores
244-
245-
246-
def esm_infer_masked_pll(
247-
xs,
248-
attention_mask,
249-
model,
250-
mask_token_id,
251-
batch_size: int = 4,
252-
device: str | None = None,
253-
verbose: bool = False,
254-
):
255-
if device is None:
256-
device = get_device()
257-
258-
model = model.to(device)
259-
model.eval()
260-
261-
if not isinstance(xs, torch.Tensor):
262-
xs = torch.tensor(xs, dtype=torch.long)
263-
264-
if not isinstance(attention_mask, torch.Tensor):
265-
attention_mask = torch.tensor(attention_mask, dtype=torch.long)
266-
267-
xs = xs.to(device)
268-
269-
# Expand mask to (N, L) if needed
270-
if attention_mask.dim() == 1:
271-
attention_mask = attention_mask.unsqueeze(0).expand(xs.shape[0], -1)
272-
273-
attention_mask = attention_mask.to(device)
274-
275-
pll_all = []
276-
277-
for i in tqdm(
278-
range(0, xs.shape[0], batch_size),
279-
desc="ESM PLL inference",
280-
disable=not verbose,
281-
):
282-
xs_b = xs[i:i + batch_size]
283-
am_b = attention_mask[i:i + batch_size]
284-
285-
pll_b = esm_masked_pll(
286-
input_ids=xs_b,
287-
attention_mask=am_b,
288-
model=model,
289-
mask_token_id=mask_token_id,
290-
device=device,
291-
verbose=False,
292-
)
293-
294-
pll_all.append(pll_b.cpu())
295-
296-
return torch.cat(pll_all)
297-
298-
299-
300185
def esm_mutation_only_mutation_masked_pll(
301186
tokenized_sequences: torch.Tensor, # (L,)
302187
wt_input_ids: torch.Tensor, # (L,)
@@ -306,6 +191,7 @@ def esm_mutation_only_mutation_masked_pll(
306191
train: bool = False,
307192
device: str | None = None,
308193
verbose: bool = False,
194+
**kwargs
309195
):
310196
"""
311197
Correct mutation-only pseudo-log-likelihood for ONE sequence.
@@ -335,16 +221,16 @@ def esm_mutation_only_mutation_masked_pll(
335221
masked_input_ids = tokenized_seq.clone()
336222
masked_input_ids[pos] = mask_token_id
337223
if train:
224+
outputs = model(
225+
input_ids=masked_input_ids.unsqueeze(0),
226+
attention_mask=attention_mask.unsqueeze(0),
227+
)
228+
else:
338229
with torch.no_grad():
339230
outputs = model(
340231
input_ids=masked_input_ids.unsqueeze(0),
341232
attention_mask=attention_mask.unsqueeze(0),
342233
)
343-
else:
344-
outputs = model(
345-
input_ids=masked_input_ids.unsqueeze(0),
346-
attention_mask=attention_mask.unsqueeze(0),
347-
)
348234
logits = outputs.logits # (1, L, V)
349235

350236
log_probs = F.log_softmax(logits[0, pos], dim=-1)
@@ -393,16 +279,16 @@ def esm_mutation_all_pos_masked_pll(
393279
masked_input_ids[pos] = mask_token_id
394280

395281
if train:
282+
outputs = model(
283+
input_ids=masked_input_ids.unsqueeze(0),
284+
attention_mask=attention_mask.unsqueeze(0),
285+
)
286+
else:
396287
with torch.no_grad():
397288
outputs = model(
398289
input_ids=masked_input_ids.unsqueeze(0),
399290
attention_mask=attention_mask.unsqueeze(0),
400291
)
401-
else:
402-
outputs = model(
403-
input_ids=masked_input_ids.unsqueeze(0),
404-
attention_mask=attention_mask.unsqueeze(0),
405-
)
406292
logits = outputs.logits # (1, L, V)
407293

408294
log_probs = F.log_softmax(logits[0, pos], dim=-1)
@@ -437,13 +323,16 @@ def esm_infer_pll(
437323

438324
if not isinstance(attention_mask, torch.Tensor):
439325
attention_mask = torch.tensor(attention_mask, dtype=torch.long)
440-
441-
if inference_type == 'mutation_masking':
326+
wt_structure_input_ids = None
327+
if inference_type == 'mutation-masking':
442328
inference_function = esm_mutation_only_mutation_masked_pll
443-
elif inference_type == 'full_masking':
329+
elif inference_type in ['full-masking', 'all-pos-masking']:
444330
inference_function = esm_mutation_all_pos_masked_pll
445-
elif inference_type == 'unmasked':
446-
inference_function = esm_unmasked_reconstruction_score
331+
elif inference_type in ['unmasked', 'wt-marginals']:
332+
inference_function = esm_unmasked_wt_score
333+
elif inference_type == 'prosst':
334+
wt_input_ids, wt_structure_input_ids = wt_input_ids
335+
inference_function = esm_unmasked_wt_score
447336
else:
448337
raise SystemError("Choose between 'mutation_masking', 'unmasked', and 'full_masking'")
449338

@@ -462,6 +351,7 @@ def esm_infer_pll(
462351
pll = inference_function(
463352
tokenized_sequences=torch.tensor(xs_b[i]),
464353
wt_input_ids=wt_input_ids,
354+
structure_input_ids=wt_structure_input_ids,
465355
attention_mask=attention_mask,
466356
model=model,
467357
mask_token_id=mask_token_id,

pypef/plm/prosst_lora_tune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from Bio import SeqIO, BiopythonParserWarning
2424
warnings.filterwarnings(action='ignore', category=BiopythonParserWarning)
2525

26-
from pypef.plm.esm_lora_tune import corr_loss
26+
from pypef.plm.utils import corr_loss
2727
from pypef.plm.prosst_structure.quantizer import PdbQuantizer
2828
from pypef.utils.helpers import get_device
2929
from pypef.plm.utils import load_model_and_tokenizer

tests/test_api_functions.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,6 @@ def test_plm_corr_blat_ecolx():
268268
esm_base_model = esm_base_model.to(device)
269269
x_esm, esm_attention_mask = esm_tokenize_sequences(
270270
sequences, esm_tokenizer, max_length=len(blat_ecolx_wt_seq) + 2)
271-
272271
# Tokenize WT sequence once
273272
wt_tokens, _ = esm_tokenize_sequences(
274273
[blat_ecolx_wt_seq],
@@ -282,15 +281,14 @@ def test_plm_corr_blat_ecolx():
282281
attention_mask=esm_attention_mask,
283282
model=esm_base_model,
284283
mask_token_id=esm_tokenizer.mask_token_id,
285-
inference_type='mutation_masking',
284+
inference_type='mutation-masking',
286285
batch_size=5,
287286
train=False,
288287
verbose=True
289288
)
290289
print(f'{x}: ESM1v (unsupervised performance): '
291290
f'{spearmanr(y_true, y_esm.cpu())[0]}')
292291
np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.6367826285982324, decimal=6)
293-
294292
y_esm = esm_infer_pll(
295293
xs=x_esm,
296294
wt_input_ids=wt_tokens,
@@ -304,15 +302,14 @@ def test_plm_corr_blat_ecolx():
304302
)
305303
print(f'{x}: ESM1v (unsupervised performance): '
306304
f'{spearmanr(y_true, y_esm.cpu())[0]}')
307-
np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.6381789551033011, decimal=6)
308-
305+
np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.6498987261125897, decimal=6)
309306
#y_esm = esm_infer_pll(
310307
# xs=x_esm,
311308
# wt_input_ids=wt_tokens,
312309
# attention_mask=esm_attention_mask,
313310
# model=esm_base_model,
314311
# mask_token_id=esm_tokenizer.mask_token_id,
315-
# inference_type='full_masking',
312+
# inference_type='full-masking',
316313
# batch_size=5,
317314
# train=False,
318315
# verbose=True
@@ -321,15 +318,30 @@ def test_plm_corr_blat_ecolx():
321318
# f'{spearmanr(y_true, y_esm.cpu())[0]}')
322319
#np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.6360209552304472, decimal=6)
323320

324-
input_ids, prosst_attention_mask, structure_input_ids = get_structure_quantizied(
321+
wt_input_ids, prosst_attention_mask, wt_structure_input_ids = get_structure_quantizied(
325322
pdb_blat_ecolx, prosst_tokenizer, blat_ecolx_wt_seq)
326323
x_prosst = prosst_tokenize_sequences(sequences=sequences, vocab=prosst_vocab)
327-
y_prosst = get_logits_from_full_seqs(
328-
x_prosst, prosst_base_model, input_ids, prosst_attention_mask,
329-
structure_input_ids, train=False, verbose=True
324+
#y_prosst = get_logits_from_full_seqs(
325+
# x_prosst, prosst_base_model, wt_input_ids, prosst_attention_mask,
326+
# wt_structure_input_ids, train=False, verbose=True
327+
#)
328+
#print(f'ProSST (unsupervised performance): ' # ProteinGym: ProSST: 0.760
329+
# f'{spearmanr(y_true, y_prosst.cpu())[0]:.3f}')
330+
print('wt_input_ids:',wt_input_ids)
331+
print()
332+
print('wt_structure_input_ids:', wt_structure_input_ids)
333+
print()
334+
y_prosst = esm_infer_pll(
335+
xs=x_prosst,
336+
wt_input_ids=(wt_input_ids, wt_structure_input_ids), ## TODO
337+
attention_mask=prosst_attention_mask,
338+
model=prosst_base_model,
339+
mask_token_id=prosst_tokenizer.mask_token_id,
340+
inference_type='prosst', ## TODO
341+
batch_size=5,
342+
train=False,
343+
verbose=True
330344
)
331-
print(f'ProSST (unsupervised performance): ' # ProteinGym: ProSST: 0.760
332-
f'{spearmanr(y_true, y_prosst.cpu())[0]:.3f}')
333345
# ACTUAL OLD VERSION: 0.743
334346

335347

0 commit comments

Comments (0)