Skip to content

Commit 77d2bbf

Browse files
committed
Update tokenization so the token sequence max_length is len(wt_seq) + 2 (accounting for the added cls/eos tokens)
1 parent 7c3c2f9 commit 77d2bbf

File tree

5 files changed

+71
-36
lines changed

5 files changed

+71
-36
lines changed

pypef/plm/esm_lora_tune.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,11 +191,16 @@ def esm_train(
191191
model.train(False)
192192

193193

194-
def esm_setup(sequences, device: str | None = None, verbose: bool = True):
194+
def esm_setup(wt_seq, sequences, device: str | None = None, verbose: bool = True):
195195
esm_base_model, esm_lora_model, esm_tokenizer, esm_optimizer = get_esm_models()
196196
esm_base_model = esm_base_model.to(device)
197+
wt_tokens, _ = tokenize_sequences(
198+
[wt_seq],
199+
esm_tokenizer,
200+
max_length=len(wt_seq) + 2
201+
)
197202
x_esm, esm_attention_mask = tokenize_sequences(
198-
sequences, esm_tokenizer, max_length=len(sequences[0]), verbose=verbose)
203+
sequences, esm_tokenizer, max_length=len(wt_seq) + 2, verbose=verbose)
199204
llm_dict_esm = {
200205
'esm1v': {
201206
'llm_base_model': esm_base_model,
@@ -205,6 +210,7 @@ def esm_setup(sequences, device: str | None = None, verbose: bool = True):
205210
'llm_inference_function': esm_infer,
206211
'llm_loss_function': corr_loss,
207212
'x_llm' : x_esm,
213+
'input_ids': wt_tokens,
208214
'llm_attention_mask': esm_attention_mask,
209215
'llm_tokenizer': esm_tokenizer
210216
}

pypef/plm/inference.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -349,11 +349,12 @@ def llm_tokenizer(llm_dict, seqs, verbose=True):
349349
if list(llm_dict.keys())[0] == 'esm1v':
350350
x_llm_seqs, _attention_mask = tokenize_sequences(
351351
seqs, tokenizer=llm_dict['esm1v']['llm_tokenizer'],
352-
max_length=len(seqs[0]), verbose=verbose
352+
max_length=len(seqs[0]) + 2, verbose=verbose
353353
)
354354
elif list(llm_dict.keys())[0] == 'prosst':
355-
x_llm_seqs = prosst_simple_vocab_aa_tokenizer(
356-
seqs, vocab=llm_dict['prosst']['llm_vocab'], verbose=verbose
355+
x_llm_seqs, _attention_mask = tokenize_sequences(
356+
seqs, tokenizer=llm_dict['prosst']['llm_tokenizer'],
357+
max_length=len(seqs[0]) + 2, verbose=verbose
357358
)
358359
else:
359360
raise SystemError(f"Unknown LLM dictionary input:\n{list(llm_dict.keys())[0]}")
@@ -376,17 +377,29 @@ def inference(
376377
device = get_device()
377378
if llm == 'esm':
378379
logger.info("Zero-shot LLM inference on test set using ESM1v...")
379-
llm_dict = esm_setup(sequences, verbose=verbose)
380+
llm_dict = esm_setup(wt_seq, sequences, verbose=verbose)
380381
if model is None:
381382
model = llm_dict['esm1v']['llm_base_model']
382383
x_llm_test = llm_tokenizer(llm_dict, sequences, verbose)
383384
y_test_pred = esm_infer(#llm_dict['esm1v']['llm_inference_function'](
384-
xs=torch.tensor(get_batches(x_llm_test, batch_size=1, dtype=int)),
385+
xs=torch.from_numpy(get_batches(x_llm_test, batch_size=1, dtype=int)),
385386
attention_mask=llm_dict['esm1v']['llm_attention_mask'],
386387
model=model,
387388
device=device,
388389
verbose=verbose
389390
).cpu()
391+
y_test_pred = plm_inference(
392+
xs=x_llm_test,
393+
wt_input_ids=torch.tensor(llm_dict['esm1v']['input_ids'][0], dtype=torch.long),
394+
attention_mask=llm_dict['esm1v']['llm_attention_mask'],
395+
model=model,
396+
mask_token_id=llm_dict['esm1v']['llm_tokenizer'].mask_token_id,
397+
inference_type='unmasked',
398+
batch_size=5,
399+
train=False,
400+
verbose=True
401+
).cpu()
402+
390403
elif llm == 'prosst':
391404
logger.info("Zero-shot LLM inference on test set using ProSST...")
392405
llm_dict = prosst_setup(
@@ -395,14 +408,27 @@ def inference(
395408
if model is None:
396409
model = llm_dict['prosst']['llm_base_model']
397410
x_llm_test = llm_tokenizer(llm_dict, sequences, verbose)
398-
y_test_pred = prosst_infer(#llm_dict['prosst']['llm_inference_function'](
399-
xs=x_llm_test,
400-
model=model,
401-
input_ids=llm_dict['prosst']['input_ids'],
402-
attention_mask=llm_dict['prosst']['llm_attention_mask'],
403-
structure_input_ids=llm_dict['prosst']['structure_input_ids'],
404-
verbose=verbose,
405-
device=device
411+
#y_test_pred = prosst_infer(#llm_dict['prosst']['llm_inference_function'](
412+
# xs=x_llm_test,
413+
# model=model,
414+
# input_ids=llm_dict['prosst']['input_ids'],
415+
# attention_mask=llm_dict['prosst']['llm_attention_mask'],
416+
# structure_input_ids=llm_dict['prosst']['structure_input_ids'],
417+
# verbose=verbose,
418+
# device=device
419+
#).cpu()
420+
print('XXX:', np.shape(x_llm_test))
421+
y_test_pred = plm_inference(
422+
xs=x_llm_test,
423+
wt_input_ids=llm_dict['prosst']['input_ids'],
424+
attention_mask=llm_dict['prosst']['llm_attention_mask'],
425+
model=model,
426+
mask_token_id=llm_dict['prosst']['llm_tokenizer'].mask_token_id,
427+
inference_type='mutation-masking',
428+
wt_structure_input_ids=llm_dict['prosst']['structure_input_ids'],
429+
batch_size=5,
430+
train=False,
431+
verbose=True
406432
).cpu()
407433
else:
408434
raise RuntimeError("Unknown LLM option.")

pypef/plm/prosst_lora_tune.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pypef.plm.utils import corr_loss
2727
from pypef.plm.prosst_structure.quantizer import PdbQuantizer
2828
from pypef.utils.helpers import get_device
29+
from pypef.plm.esm_lora_tune import tokenize_sequences
2930
from pypef.plm.utils import load_model_and_tokenizer
3031

3132

@@ -37,14 +38,13 @@ def prosst_simple_vocab_aa_tokenizer(sequences, vocab, verbose=True):
3738
sequences, desc='Tokenizing sequences for ProSST modeling',
3839
disable=not verbose
3940
):
40-
#x_sequence = [vocab['<cls>']]
41-
x_sequence = []
41+
x_sequence = [vocab['<cls>']]
4242
for aa in sequence:
4343
try:
4444
x_sequence.append(vocab[aa])
4545
except KeyError:
4646
x_sequence.append(vocab['<unk>'])
47-
#x_sequence.append(vocab['<eos>'])
47+
x_sequence.append(vocab['<eos>'])
4848
x_sequences.append(x_sequence)
4949
return torch.Tensor(x_sequences).to(torch.int)
5050

@@ -80,14 +80,15 @@ def get_logits_from_full_seqs(
8080
ss_input_ids=structure_input_ids
8181
)
8282
logits = torch.log_softmax(outputs.logits[:, 1:-1], dim=-1).squeeze()
83-
for i_s, sequence in enumerate(
83+
for i_s, x_sequence in enumerate(
8484
tqdm(
8585
xs,
8686
desc=f'ProSST inference: getting sequence logits ({device.upper()})',
8787
disable=not verbose
8888
)
8989
):
90-
for i_aa, x_aa in enumerate(sequence):
90+
x_sequence = x_sequence[1:-1] # if cls, eos tokens included
91+
for i_aa, x_aa in enumerate(x_sequence):
9192
if i_aa == 0:
9293
seq_log_probs = logits[i_aa, x_aa].reshape(1)
9394
else:
@@ -297,8 +298,9 @@ def prosst_setup(wt_seq, pdb_file, sequences, device: str | None = None, verbose
297298
input_ids, prosst_attention_mask, structure_input_ids = get_structure_quantizied(
298299
pdb_file, prosst_tokenizer, wt_seq, verbose=verbose
299300
)
300-
x_llm_train_prosst = prosst_simple_vocab_aa_tokenizer(
301-
sequences=sequences, vocab=prosst_vocab, verbose=verbose
301+
x_llm_train_prosst, _attention_mask = tokenize_sequences(
302+
sequences=sequences, tokenizer=prosst_tokenizer,
303+
max_length=len(wt_seq) + 2, verbose=verbose
302304
)
303305
llm_dict_prosst = {
304306
'prosst': {

pypef/plm/utils.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def corr_loss(y_true: torch.Tensor, y_pred: torch.Tensor):
2727
def get_batches(a, dtype, batch_size=5,
2828
keep_remaining=False, verbose: bool = False):
2929
a = np.asarray(a, dtype=dtype)
30+
a_remaining = None
3031
orig_shape = np.shape(a)
3132
remaining = len(a) % batch_size
3233
if remaining != 0:
@@ -46,12 +47,13 @@ def get_batches(a, dtype, batch_size=5,
4647
if verbose:
4748
print(f'{orig_shape} -> {new_shape} (dropped {remaining})')
4849
if keep_remaining:
49-
print(f'Appending remaining to collected batches as last batch '
50-
f'(the resulting inhomogenous list shape is '
51-
f'{np.shape(a)} + {np.shape(a_remaining)} = ('
52-
f'{np.shape(a)[0] + 1}, *, {np.shape(a)[-1]}))...')
53-
a = a.tolist()
54-
a.append(a_remaining)
50+
if a_remaining is not None:
51+
print(f'Appending remaining to collected batches as last batch '
52+
f'(the resulting inhomogenous list shape is '
53+
f'{np.shape(a)} + {np.shape(a_remaining)} = ('
54+
f'{np.shape(a)[0] + 1}, *, {np.shape(a)[-1]}))...')
55+
a = a.tolist()
56+
a.append(a_remaining)
5557
return a
5658

5759

tests/test_api_functions.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434

3535
torch.manual_seed(42)
36+
# torch.use_deterministic_algorithms(True)
3637
np.random.seed(42)
3738

3839
msa_file_avgfp = os.path.abspath(os.path.join(
@@ -111,29 +112,28 @@ def test_hybrid_model_dca_llm():
111112
decimal=7
112113
)
113114
assert len(train_seqs_aneh[0]) == len(g.wt_seq)
114-
115-
y_pred_esm = inference(train_seqs_aneh, 'esm')
115+
aneh_wt_seq = get_wt_sequence(wt_seq_file_aneh)
116+
y_pred_esm = inference(train_seqs_aneh, 'esm', wt_seq=aneh_wt_seq)
116117
np.testing.assert_almost_equal(
117118
spearmanr(train_ys_aneh, y_pred_esm)[0],
118-
-0.21073416060442696,
119+
-0.713214007088901,
119120
decimal=7
120121
)
121-
aneh_wt_seq = get_wt_sequence(wt_seq_file_aneh)
122122
y_pred_prosst = inference(
123123
train_seqs_aneh, 'prosst',
124124
pdb_file=pdb_file_aneh, wt_seq=aneh_wt_seq
125125
)
126126
np.testing.assert_almost_equal(
127127
spearmanr(train_ys_aneh, y_pred_prosst)[0],
128-
-0.7425657069861902,
128+
-0.7394433335146882,
129129
decimal=7
130130
)
131131

132132
x_dca_test = g.get_scores(test_seqs_aneh, encode=True)
133133
for i, setup in enumerate([esm_setup, prosst_setup]):
134134
print(['~~~ ESM ~~~', '~~~ ProSST ~~~'][i])
135135
if setup == esm_setup:
136-
llm_dict = setup(sequences=train_seqs_aneh)
136+
llm_dict = setup(sequences=train_seqs_aneh, wt_seq=aneh_wt_seq)
137137
else: # elif setup == prosst_setup:
138138
llm_dict = setup(
139139
aneh_wt_seq, pdb_file_aneh, sequences=train_seqs_aneh)
@@ -163,7 +163,7 @@ def test_hybrid_model_dca_llm():
163163
)
164164
np.testing.assert_almost_equal(
165165
spearmanr(hm.y_ttest, hm.y_llm_ttest)[0],
166-
[-0.21761360470606333, -0.8330644449247571][i],
166+
[-0.17231040881725562, -0.8330644449247571][i],
167167
decimal=7
168168
)
169169
# Nondeterministic behavior (without setting seed), should be about ~0.7 to ~0.9,
@@ -316,7 +316,6 @@ def test_plm_corr_blat_ecolx():
316316
#print(f'{x}: ESM1v (unsupervised performance): '
317317
# f'{spearmanr(y_true, y_esm.cpu())[0]}')
318318
#np.testing.assert_almost_equal(spearmanr(y_true, y_esm.cpu())[0], 0.666666666666666, decimal=6)
319-
print(prosst_vocab)
320319
wt_input_ids, prosst_attention_mask, wt_structure_input_ids = get_structure_quantizied(
321320
pdb_blat_ecolx, prosst_tokenizer, blat_ecolx_wt_seq)
322321
x_prosst2 = prosst_simple_vocab_aa_tokenizer(sequences, prosst_vocab)

0 commit comments

Comments
 (0)