
Commit ce54e9b

dev: todo: More uniform tokenization and plm inference
1 parent 3ef22b3 commit ce54e9b

File tree: 8 files changed, +70 −35 lines


.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions

@@ -38,7 +38,7 @@ jobs:
           flake8 ./pypef --count --select=E9,F63,F7,F82 --show-source --statistics
       - name: Export Pythonpath and run tests using the main script
         run: |
-          export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest ./tests/ -v -m "not pip_specific"
+          export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest ./tests/ -v -m "not (pip_specific or requires_gpu)"
       - name: Export Pythonpath and run tests using pip-installation
         run: |
           export PYTHONPATH=""
@@ -81,7 +81,7 @@ jobs:
       - name: Export Pythonpath and run tests using the main script
         shell: pwsh
         run: |
-          $env:PYTHONPATH = "${PWD};${env:PYTHONPATH}";python -m pytest .\tests\ -v -m "not pip_specific"
+          $env:PYTHONPATH = "${PWD};${env:PYTHONPATH}";python -m pytest .\tests\ -v -m "not (pip_specific or requires_gpu)"
       - name: Export Pythonpath and run tests using pip-installation
         shell: pwsh
         run: |
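Note: pytest deselects on a marker expression even for unregistered markers, but it warns about unknown marker names (and errors under --strict-markers). A minimal sketch of registering the two markers used in the filter above in a conftest.py; this commit does not show where the markers are actually declared, so placement and wording here are assumptions:

# conftest.py -- hypothetical marker registration; marker names taken from the CI filter
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "pip_specific: tests that only run against the pip installation"
    )
    config.addinivalue_line(
        "markers", "requires_gpu: tests that need a CUDA-capable GPU, skipped in CI"
    )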

pypef/plm/esm_lora_tune.py

Lines changed: 39 additions & 8 deletions

@@ -48,7 +48,7 @@ def get_esm_models(model='facebook/esm1v_t33_650M_UR90S_3'):
     return base_model, lora_model, tokenizer, optimizer
 
 
-def esm_tokenize_sequences(sequences, tokenizer, max_length, verbose=True):
+def tokenize_sequences(sequences, tokenizer, max_length, verbose=True):
     tokenized_sequences = []
     for seq in tqdm(sequences, desc='Tokenizing sequences for ESM modeling', disable=not verbose):
         encoded_sequence, attention_mask = tokenizer(
@@ -154,18 +154,49 @@ def esm_unmasked_wt_score(
 ):
     if device is None:
         device = get_device()
-    wt_input_ids = wt_input_ids.unsqueeze(0)
+    if wt_input_ids.dim() == 1:
+        wt_input_ids = wt_input_ids.unsqueeze(0)
+    structure_input_ids = kwargs.get("structure_input_ids", None)
     attention_masks = torch.Tensor(np.full(
         shape=np.shape(wt_input_ids), fill_value=attention_mask)).to(torch.int64)
     if train:
-        outputs = model(wt_input_ids.to(device), attention_masks.to(device),
-                        output_hidden_states=False)
+        if structure_input_ids is not None:
+            outputs = model(
+                input_ids=wt_input_ids.to(device),
+                attention_mask=attention_masks.to(device),
+                ss_input_ids=structure_input_ids.to(device)
+            )
+        else:
+            outputs = model(
+                wt_input_ids.to(device),
+                attention_masks.to(device),
+                output_hidden_states=False
+            )
     else:
         with torch.no_grad():
-            outputs = model(wt_input_ids.to(device), attention_masks.to(device),
-                            output_hidden_states=False)
+            if structure_input_ids is not None:
+                outputs = model(
+                    input_ids=wt_input_ids.to(device),
+                    attention_mask=attention_masks.to(device),
+                    ss_input_ids=structure_input_ids.to(device)
+                )
+            else:
+                outputs = model(
+                    wt_input_ids.to(device),
+                    attention_masks.to(device),
+                    output_hidden_states=False
+                )
+
     logits = outputs.logits
-    token_probs = torch.log_softmax(logits, dim=-1).squeeze(0)
+    logits = logits.squeeze(0)  # remove batch dim
+    #print('logits.shape:', logits.shape)
+    # Better make sure that special tokens are always removed / masked
+    # and only pure amino acid sequence tokens are present / unmasked
+    #logits = logits[1:-1]  # drop CLS/EOS
+    token_probs = torch.log_softmax(logits, dim=-1)
+    assert len(tokenized_sequences[0]) == token_probs.shape[0], f"{len(tokenized_sequences[0])} != {token_probs.shape[0]}"
+    #print('token_probs.shape:', token_probs.shape)
+
     for i_s, tokenized_seq in enumerate(tokenized_sequences):
         for i_aa, aa in enumerate(tokenized_seq):
             # alternative: use Tensor.index_select() function
@@ -417,7 +448,7 @@ def esm_train(
 def esm_setup(sequences, device: str | None = None, verbose: bool = True):
     esm_base_model, esm_lora_model, esm_tokenizer, esm_optimizer = get_esm_models()
     esm_base_model = esm_base_model.to(device)
-    x_esm, esm_attention_mask = esm_tokenize_sequences(
+    x_esm, esm_attention_mask = tokenize_sequences(
         sequences, esm_tokenizer, max_length=len(sequences[0]), verbose=verbose)
     llm_dict_esm = {
         'esm1v': {
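Note: the reworked esm_unmasked_wt_score keeps the full token axis and asserts that each tokenized sequence aligns one-to-one with token_probs before the scoring loop runs. A minimal sketch of what that loop accumulates per variant (hypothetical helper name; assumes token_probs is the (seq_len, vocab_size) log-softmax from the single wild-type forward pass, as above):

import torch

def sum_token_log_probs(token_probs: torch.Tensor, tokenized_seq) -> float:
    # token_probs: (seq_len, vocab_size), i.e. torch.log_softmax(logits, dim=-1)
    # tokenized_seq: one variant's token ids, aligned position-by-position with
    # the wild-type pass (the new assert guards exactly this alignment)
    total = 0.0
    for i_aa, aa in enumerate(tokenized_seq):
        total += token_probs[i_aa, int(aa)].item()
    return total  # higher sum: variant tokens more probable under the WT pass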

pypef/plm/inference.py

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@
 
 from pypef.utils.helpers import get_device
 from pypef.plm.utils import get_batches
-from pypef.plm.esm_lora_tune import esm_infer, esm_setup, esm_tokenize_sequences
+from pypef.plm.esm_lora_tune import esm_infer, esm_setup, tokenize_sequences
 from pypef.plm.prosst_lora_tune import prosst_setup, prosst_tokenize_sequences, prosst_infer
 
 import logging
@@ -21,7 +21,7 @@ def llm_tokenizer(llm_dict, seqs, verbose=True):
     except ValueError:
         raise SystemError("Unequal input sequence length detected!")
     if list(llm_dict.keys())[0] == 'esm1v':
-        x_llm_seqs, _attention_mask = esm_tokenize_sequences(
+        x_llm_seqs, _attention_mask = tokenize_sequences(
             seqs, tokenizer=llm_dict['esm1v']['llm_tokenizer'],
             max_length=len(seqs[0]), verbose=verbose
         )
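Note: llm_tokenizer dispatches on the single top-level key of llm_dict, so the dict handed in (as built by esm_setup or prosst_setup) needs that key to name the model family. A shape sketch under that assumption; the real dict carries further model and optimizer entries:

from pypef.plm.esm_lora_tune import get_esm_models
from pypef.plm.inference import llm_tokenizer

_, _, esm_tokenizer, _ = get_esm_models()
# Minimal dict shape consumed by llm_tokenizer; esm_setup() builds the full dict
llm_dict = {'esm1v': {'llm_tokenizer': esm_tokenizer}}
tokens = llm_tokenizer(llm_dict, seqs=['MKTAYIA', 'MRTAYIA'])  # equal-length sequences
# required; assumed to hand back the tokenized inputs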

pypef/plm/prosst_lora_tune.py

Lines changed: 7 additions & 2 deletions

@@ -30,15 +30,20 @@
 
 
 def prosst_tokenize_sequences(sequences, vocab, verbose=True):
+    print(vocab)
     sequences = np.atleast_1d(sequences).tolist()
     x_sequences = []
     for sequence in tqdm(
         sequences, desc='Tokenizing sequences for ProSST modeling',
         disable=not verbose
     ):
-        x_sequence = []
+        x_sequence = [vocab['<cls>']]
         for aa in sequence:
-            x_sequence.append(vocab[aa])
+            try:
+                x_sequence.append(vocab[aa])
+            except KeyError:
+                x_sequence.append(vocab['<unk>'])
+        x_sequence.append(vocab['<eos>'])
         x_sequences.append(x_sequence)
     return torch.Tensor(x_sequences).to(torch.int)
 
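Note: with this change every ProSST input is bracketed by <cls>/<eos> ids, and residues missing from the vocabulary fall back to <unk> instead of raising a KeyError. A toy illustration; the vocabulary ids below are invented for the example:

from pypef.plm.prosst_lora_tune import prosst_tokenize_sequences

# Made-up vocab ids, for illustration only
vocab = {'<cls>': 0, '<eos>': 2, '<unk>': 3, 'A': 5, 'C': 6}
x = prosst_tokenize_sequences(['ACX'], vocab, verbose=False)
# 'X' is not in the vocab, so it maps to '<unk>':
# tensor([[0, 5, 6, 3, 2]], dtype=torch.int32)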

scripts/ProteinGym_runs/official/benchmark_runs/pgym_cv_benchmark.py

Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@
 
 from pypef.utils.variant_data import get_mismatches
 from pypef.plm.prosst_lora_tune import prosst_setup, prosst_tokenize_sequences
-from pypef.plm.esm_lora_tune import esm_setup, esm_tokenize_sequences
+from pypef.plm.esm_lora_tune import esm_setup, tokenize_sequences
 from pypef.dca.gremlin_inference import GREMLIN, get_delta_e_statistical_model
 from pypef.hybrid.hybrid_model import DCALLMHybridModel
 
@@ -182,7 +182,7 @@ def main(cfg: DictConfig) -> None:
     elif llm == "esm1v":
         llm_kwargs = esm_setup(sequences=s_train)
         tokenizer = llm_kwargs['esm1v']['llm_tokenizer']
-        x_llm_test, _attn_masks = esm_tokenize_sequences(
+        x_llm_test, _attn_masks = tokenize_sequences(
            sequences=s_test, tokenizer=tokenizer, max_length=len(s_test[0])
        )

scripts/ProteinGym_runs/protgym_hybrid_perf_test_crossval.py

Lines changed: 2 additions & 2 deletions

@@ -22,7 +22,7 @@
 from pypef.dca.gremlin_inference import GREMLIN
 from pypef.plm.utils import get_batches, corr_loss
 from pypef.plm.esm_lora_tune import (
-    get_esm_models, esm_tokenize_sequences,
+    get_esm_models, tokenize_sequences,
     esm_train, esm_infer
 )
 from pypef.plm.prosst_lora_tune import (
@@ -151,7 +151,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
     dca_unopt_perf = spearmanr(fitnesses, y_pred_dca)[0]
     # ESM unsupervised
     try:
-        x_esm, esm_attention_mask = esm_tokenize_sequences(
+        x_esm, esm_attention_mask = tokenize_sequences(
             sequences, esm_tokenizer, max_length=len(wt_seq), verbose=False
         )
         y_esm = inference(sequences, 'esm', model=esm_base_model, verbose=False)

scripts/ProteinGym_runs/protgym_hybrid_perf_test_low_n.py

Lines changed: 2 additions & 2 deletions

@@ -24,7 +24,7 @@
 from pypef.dca.gremlin_inference import GREMLIN
 from pypef.plm.utils import get_batches
 from pypef.plm.esm_lora_tune import (
-    get_esm_models, esm_tokenize_sequences,
+    get_esm_models, tokenize_sequences,
     esm_train, esm_infer, corr_loss
 )
 from pypef.plm.prosst_lora_tune import (
@@ -143,7 +143,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
     dca_unopt_perf = spearmanr(fitnesses, y_pred_dca)[0]
 
     try:
-        x_esm, esm_attention_mask = esm_tokenize_sequences(
+        x_esm, esm_attention_mask = tokenize_sequences(
             sequences, esm_tokenizer, max_length=len(wt_seq))
         y_esm = esm_infer(
             get_batches(x_esm, dtype=float, batch_size=1),

tests/test_api_functions.py

Lines changed: 14 additions & 15 deletions

@@ -22,7 +22,7 @@
 from pypef.plm.inference import inference, llm_tokenizer
 from pypef.hybrid.hybrid_model import DCALLMHybridModel
 from pypef.plm.esm_lora_tune import (
-    get_esm_models, esm_tokenize_sequences,
+    get_esm_models, tokenize_sequences,
 )
 from pypef.plm.prosst_lora_tune import (
     get_logits_from_full_seqs, get_prosst_models, get_structure_quantizied,
@@ -266,10 +266,10 @@ def test_plm_corr_blat_ecolx():
     for x in ['facebook/esm1v_t33_650M_UR90S_3']:
         esm_base_model, _esm_lora_model, esm_tokenizer, esm_optimizer = get_esm_models(model=x)
         esm_base_model = esm_base_model.to(device)
-        x_esm, esm_attention_mask = esm_tokenize_sequences(
+        x_esm, esm_attention_mask = tokenize_sequences(
             sequences, esm_tokenizer, max_length=len(blat_ecolx_wt_seq) + 2)
         # Tokenize WT sequence once
-        wt_tokens, _ = esm_tokenize_sequences(
+        wt_tokens, _ = tokenize_sequences(
             [blat_ecolx_wt_seq],
             esm_tokenizer,
             max_length=len(blat_ecolx_wt_seq) + 2
@@ -316,21 +316,18 @@ def test_plm_corr_blat_ecolx():
         #)
         #print(f'{x}: ESM1v (unsupervised performance): '
         #      f'{spearmanr(y_true, y_esm.cpu())[0]}')
-        #np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.6360209552304472, decimal=6)
+        #np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.666666666666666, decimal=6)
 
         wt_input_ids, prosst_attention_mask, wt_structure_input_ids = get_structure_quantizied(
             pdb_blat_ecolx, prosst_tokenizer, blat_ecolx_wt_seq)
-        x_prosst = prosst_tokenize_sequences(sequences=sequences, vocab=prosst_vocab)
-        #y_prosst = get_logits_from_full_seqs(
-        #    x_prosst, prosst_base_model, wt_input_ids, prosst_attention_mask,
-        #    wt_structure_input_ids, train=False, verbose=True
-        #)
-        #print(f'ProSST (unsupervised performance): '  # ProteinGym: ProSST: 0.760
-        #      f'{spearmanr(y_true, y_prosst.cpu())[0]:.3f}')
-        print('wt_input_ids:', wt_input_ids)
-        print()
-        print('wt_structure_input_ids:', wt_structure_input_ids)
-        print()
+        x_prosst = tokenize_sequences(sequences=sequences, tokenizer=prosst_tokenizer)
+        y_prosst = get_logits_from_full_seqs(
+            x_prosst, prosst_base_model, wt_input_ids, prosst_attention_mask,
+            wt_structure_input_ids, train=False, verbose=True
+        )
+        print(f'ProSST (unsupervised performance): '  # ProteinGym: ProSST: 0.760
+              f'{spearmanr(y_true, y_prosst.cpu())[0]:.3f}')
+
         y_prosst = esm_infer_pll(
             xs=x_prosst,
             wt_input_ids=(wt_input_ids, wt_structure_input_ids),  ## TODO
@@ -342,6 +339,8 @@ def test_plm_corr_blat_ecolx():
             train=False,
             verbose=True
         )
+        print(f'ProSST (unsupervised performance): '  # ProteinGym: ProSST: 0.760
+              f'{spearmanr(y_true, y_prosst.cpu())[0]:.3f}')
         # ACTUAL OLD VERSION: 0.743
 
 