dev/fail: plm_train() seems to work for ESM (II)

niklases · niklases · commit c8d4b28e976a · 2026-02-15T16:06:09.000+01:00
diff --git a/pypef/hybrid/hybrid_model.py b/pypef/hybrid/hybrid_model.py
@@ -100,7 +100,7 @@ def __init__(
                     self.llm_loss_function = llm_model_input['prosst']['llm_loss_function']
                     self.x_train_llm = llm_model_input['prosst']['x_llm']
                     self.llm_attention_mask = llm_model_input['prosst']['llm_attention_mask']
-                    self.input_ids = llm_model_input['prosst']['input_ids']
+                    self.wt_input_ids = llm_model_input['prosst']['wt_input_ids']
                     self.structure_input_ids = llm_model_input['prosst']['structure_input_ids']
                 else:
                     raise RuntimeError("LLM input model dictionary not supported. Currently supported "
@@ -418,15 +418,15 @@ def train_llm(self):
             y_llm_ttest = self.llm_inference_function(
                 xs=self.x_llm_ttest,
                 model=self.llm_base_model,
-                input_ids=self.input_ids,
+                wt_input_ids=self.wt_input_ids,
                 attention_mask=self.llm_attention_mask,
                 structure_input_ids=self.structure_input_ids,
                 device=self.device
             )
             y_llm_ttrain = self.llm_inference_function(
                 xs=self.x_llm_ttrain,
                 model=self.llm_base_model,
-                input_ids=self.input_ids,
+                wt_input_ids=self.wt_input_ids,
                 attention_mask=self.llm_attention_mask,
                 structure_input_ids=self.structure_input_ids,
                 device=self.device
@@ -472,12 +472,12 @@ def train_llm(self):
         # void function, training model in place
         if self.llm_key == 'prosst':
             self.llm_train_function(
-                x_llm_ttrain_b, 
-                scores_ttrain_b,
+                self.x_llm_ttrain, 
+                self.y_ttrain,
                 self.llm_loss_function,
                 self.llm_model,
                 self.llm_optimizer, 
-                self.input_ids,
+                self.wt_input_ids,
                 self.llm_attention_mask,  
                 self.structure_input_ids,
                 n_epochs=50,
@@ -490,7 +490,7 @@ def train_llm(self):
             y_llm_lora_ttrain = self.llm_inference_function(
                 xs=self.x_llm_ttrain,
                 model=self.llm_model,
-                input_ids=self.input_ids,
+                input_ids=self.wt_input_ids,
                 attention_mask=self.llm_attention_mask,
                 structure_input_ids=self.structure_input_ids,
                 device=self.device,
@@ -499,37 +499,47 @@ def train_llm(self):
             y_llm_lora_ttest = self.llm_inference_function(
                 xs=self.x_llm_ttest,
                 model=self.llm_model,
-                input_ids=self.input_ids,
+                input_ids=self.wt_input_ids,
                 attention_mask=self.llm_attention_mask,
                 structure_input_ids=self.structure_input_ids,
                 device=self.device,
                 verbose=self.verbose
             )
         elif self.llm_key == 'esm1v':
             # xs, attns, scores, loss_fn, model, optimizer
+            # x_sequences, 
+            # scores, 
+            # loss_fn, 
+            # model, 
+            # optimizer,
+            # input_ids, 
+            # attention_mask, 
             self.llm_train_function( 
-                x_llm_ttrain_b, 
-                self.llm_attention_mask,
-                scores_ttrain_b,
-                self.llm_loss_function,
-                self.llm_model,
-                self.llm_optimizer,  
-                n_epochs=5, 
+                x_sequences=self.x_llm_ttrain, 
+                scores=self.y_ttrain,
+                loss_fn=self.llm_loss_function,
+                model=self.llm_model,
+                optimizer=self.llm_optimizer, 
+                wt_input_ids=self.wt_input_ids,
+                attention_mask=self.llm_attention_mask,
+                n_epochs=50, 
                 device=self.device,
                 verbose=self.verbose,
                 progress_cb=self.progress_cb, 
                 abort_cb=self.abort_cb
             )
             y_llm_lora_ttrain = self.llm_inference_function(
-                xs=x_llm_ttrain_b,
+                xs=self.x_llm_ttrain,
                 model=self.llm_model,
                 attention_mask=self.llm_attention_mask,
+                wt_input_ids=self.wt_input_ids,
                 device=self.device,
                 verbose=self.verbose
             )
             y_llm_lora_ttest = self.llm_inference_function(
-                xs=x_llm_ttest_b,
+                xs=self.x_llm_ttest,
                 model=self.llm_model,
+                wt_input_ids=self.wt_input_ids,
                 attention_mask=self.llm_attention_mask,
                 device=self.device,
                 verbose=self.verbose
@@ -630,36 +640,44 @@ def hybrid_prediction(
         
         else:
             if self.llm_key == 'prosst':
+                #    xs,
+                #wt_input_ids,
+                #attention_mask,
+                #model,
                 y_llm = self.llm_inference_function(
-                    x_llm, 
-                    self.llm_base_model, 
-                    self.input_ids,
-                    self.llm_attention_mask, 
-                    self.structure_input_ids,
+                    xs=x_llm, 
+                    wt_input_ids=self.wt_input_ids,
+                    attention_mask=self.llm_attention_mask, 
+                    model=self.llm_base_model, 
+                    wt_structure_input_ids=self.structure_input_ids,
                     verbose=verbose,
                     device=self.device).detach().cpu().numpy()
                 y_llm_lora = self.llm_inference_function(
-                    x_llm, 
-                    self.llm_model, 
-                    self.input_ids,
-                    self.llm_attention_mask, 
-                    self.structure_input_ids,
+                    xs=x_llm, 
+                    wt_input_ids=self.wt_input_ids,
+                    attention_mask=self.llm_attention_mask, 
+                    model=self.llm_model, 
+                    wt_structure_input_ids=self.structure_input_ids,
                     verbose=verbose,
                     device=self.device).detach().cpu().numpy()
             elif self.llm_key == 'esm1v':
                 x_llm_b = torch.from_numpy(get_batches(x_llm, batch_size=1, dtype=int))
                 y_llm = self.llm_inference_function(
-                    x_llm_b, 
-                    self.llm_attention_mask,
-                    self.llm_base_model, 
+                    xs=x_llm, 
+                    wt_input_ids=self.wt_input_ids,
+                    attention_mask=self.llm_attention_mask, 
+                    model=self.llm_base_model, 
                     verbose=verbose,
-                    device=self.device).detach().cpu().numpy()
+                    device=self.device
+                ).detach().cpu().numpy()
                 y_llm_lora = self.llm_inference_function(
-                    x_llm_b, 
-                    self.llm_attention_mask,
-                    self.llm_model, 
+                    xs=x_llm, 
+                    wt_input_ids=self.wt_input_ids,
+                    attention_mask=self.llm_attention_mask, 
+                    model=self.llm_model, 
                     verbose=verbose,
-                    device=self.device).detach().cpu().numpy()
+                    device=self.device
+                ).detach().cpu().numpy()
             if np.any(np.isnan(y_llm)) or np.any(np.isnan(y_llm_lora)):
                 logger.warning(
                     f"LLM predictions contains NaN's... replacing NaN's with "
diff --git a/pypef/plm/esm_lora_tune.py b/pypef/plm/esm_lora_tune.py
@@ -45,18 +45,6 @@ def get_esm_models(model='facebook/esm1v_t33_650M_UR90S_3'):
     return base_model, lora_model, tokenizer, optimizer
 
 
-def tokenize_sequences(sequences, tokenizer, max_length, verbose=True):
-    tokenized_sequences = []
-    for seq in tqdm(sequences, desc='Tokenizing sequences', disable=not verbose):
-        encoded_sequence, attention_mask = tokenizer(
-            seq, 
-            padding='max_length', 
-            truncation=True,  # False for not uniform length distribution (truncation) 
-            max_length=max_length
-        ).values()
-        tokenized_sequences.append(encoded_sequence)
-    return tokenized_sequences, attention_mask
-
 
 def get_y_pred_scores(encoded_sequences, attention_masks, 
                       model, device: str | None = None):
diff --git a/pypef/plm/inference.py b/pypef/plm/inference.py
@@ -13,9 +13,10 @@
 from tqdm import tqdm
 from Bio import SeqIO
 
+from pypef.plm.prosst_lora_tune import get_prosst_models, get_structure_quantizied
 from pypef.utils.helpers import get_device
 from pypef.plm.utils import corr_loss, get_batches
-from pypef.plm.esm_lora_tune import get_esm_models, tokenize_sequences
+from pypef.plm.esm_lora_tune import get_esm_models
 
 
 import logging
@@ -55,6 +56,7 @@ def unmasked_wt_score(
         verbose: bool = False,
         **model_kwargs
     ):
+    #print('unmasked_wt_score() tokenized_sequences.shape', tokenized_sequences.shape)
     if device is None:
         device = get_device()
     if wt_input_ids.dim() == 1:
@@ -322,17 +324,21 @@ def plm_inference(
 
     scores = []
     if batch_size is None:
-        xs_b = xs
+        xs_b = torch.atleast_2d(xs)
     else:
-        xs_b = get_batches(xs, dtype=int, batch_size=batch_size, keep_remaining=True, verbose=True)
+        logger.info(f"Splitting tokenized sequences into batches...")
+        xs_b = torch.from_numpy(get_batches(xs, dtype=int, batch_size=batch_size, keep_remaining=True, verbose=True))
     desc = f"Inference: {inference_type} batch (size={batch_size}) processing ({device.upper()})'"
+    #print(desc, "xs_b.shape", xs_b.shape)
 
     kwargs = {}
     if mask_token_id is not None:
         kwargs["mask_token_id"] = mask_token_id
 
     if wt_structure_input_ids is not None:
         kwargs["ss_input_ids"] = wt_structure_input_ids.to(device)
+    
+    #print('xs_b.shape', xs_b.shape, 'xs_b[0]', xs_b[0])
 
     pbar = tqdm(
         range(len(xs_b)),
@@ -342,7 +348,7 @@ def plm_inference(
 
     for i in pbar:
         pll = inference_function(
-            tokenized_sequences=torch.tensor(xs_b[i]),
+            tokenized_sequences=xs_b[i],
             wt_input_ids=wt_input_ids,
             attention_mask=attention_mask,
             model=model,
@@ -361,7 +367,7 @@ def plm_train(
         loss_fn, 
         model, 
         optimizer,
-        input_ids, 
+        wt_input_ids, 
         attention_mask, 
         batch_size: int = 5,
         n_epochs=50, 
@@ -382,14 +388,19 @@ def plm_train(
         torch.manual_seed(seed)
     if device is None:
         device = get_device()
-    logger.info(f"ProSST training using {device.upper()} device "
-                f"(N_Train={len(torch.flatten(score_batches))})...")
-    x_sequences_batched = get_batches(x_sequences, dtype=int, batch_size=batch_size, 
-                                      keep_remaining=False, verbose=True)
+    print(f"Model training using {device.upper()} device "
+          f"(N_Train={len(scores)})...")
+    scores_batched = torch.from_numpy(
+        get_batches(scores, dtype=float, batch_size=batch_size,
+                    keep_remaining=False, verbose=True)
+    )
+    x_sequences_batched = torch.from_numpy(
+        get_batches(x_sequences, dtype=int, batch_size=batch_size, 
+                    keep_remaining=False, verbose=True)
+    )
     x_sequences_batched = x_sequences_batched.to(device)
-    score_batches = get_batches(scores, dtype=float, batch_size=batch_size, 
-                                keep_remaining=False, verbose=True)
-    score_batches = score_batches.to(device)
+    #print('x_sequences_batched.shape:', x_sequences_batched.shape)
+    scores_batched = scores_batched.to(device)
     pbar_epochs = tqdm(range(1, n_epochs + 1), disable=not verbose)
     epoch_spearman_1 = -1.0
     did_not_improve_counter = 0
@@ -404,16 +415,20 @@ def plm_train(
         model.train()
         y_preds_detached = []
         pbar_batches = tqdm(
-            zip(x_sequences_batched, score_batches),
+            zip(x_sequences_batched, scores_batched),
             total=len(x_sequences), leave=False, disable=not verbose
         )
         for batch, (seqs_b, scores_b) in enumerate(pbar_batches):
             if abort_cb and abort_cb():
                 return
+            if seqs_b.dim() == 2:
+                seqs_b = seqs_b.unsqueeze(0)  # e.g., (5, 400)  -> (1, 5 400)
             y_preds_b = plm_inference(
-                seqs_b, model, input_ids, attention_mask,
-                train=True, verbose=False
+                xs=seqs_b, 
+                wt_input_ids=wt_input_ids, attention_mask=attention_mask,
+                model=model, train=True, batch_size=None, verbose=False
             )
+            #print('y_preds_b.shape', y_preds_b.shape, y_preds_b)
             y_preds_detached.append(y_preds_b.detach().cpu().numpy().flatten())
             loss = loss_fn(scores_b, y_preds_b) / n_batch_grad_accumulations
             if progress_cb:
@@ -428,12 +443,12 @@ def plm_train(
                 f"sequence: {(batch + 1) * len(seqs_b):>5d}/{len(x_sequences) * len(seqs_b)}] "
                 f"({device.upper()})"
             )
-        epoch_spearman_2 = spearmanr(score_batches.cpu().numpy().flatten(),
+        epoch_spearman_2 = spearmanr(scores_batched.cpu().numpy().flatten(),
                                      np.array(y_preds_detached).flatten())[0]
         if epoch_spearman_2 == np.nan:
             raise SystemError(
                 f"No correlation between Y_true and Y_pred could be computed...\n"
-                f"Y_true: {score_batches.cpu().numpy().flatten()}, "
+                f"Y_true: {scores_batched.cpu().numpy().flatten()}, "
                 f"Y_pred: {np.array(y_preds_detached)}"
             )
         if epoch_spearman_2 > epoch_spearman_1 or epoch == 0:
@@ -444,7 +459,7 @@ def plm_train(
             best_model_epoch = epoch
             best_model_perf = epoch_spearman_2
             best_model = (
-                f"model_saves/Epoch{epoch}-Ntrain{len(score_batches.cpu().numpy().flatten())}"
+                f"model_saves/Epoch{epoch}-Ntrain{len(scores_batched.cpu().numpy().flatten())}"
                 f"-SpearCorr{epoch_spearman_2:.3f}.pt"
             )
             checkpoint(model, best_model)
@@ -456,7 +471,7 @@ def plm_train(
                 logger.info(f'\nEarly stop at epoch {epoch}...')
                 break
         loss_total = loss_fn(
-            torch.flatten(score_batches).to('cpu'),
+            torch.flatten(scores_batched).to('cpu'),
             torch.flatten(torch.Tensor(np.array(y_preds_detached).flatten()))
         )
         pbar_epochs.set_description(
@@ -586,6 +601,18 @@ def inference(
     return y_test_pred
 
 
+def tokenize_sequences(sequences, tokenizer, max_length, verbose=True):
+    tokenized_sequences = []
+    for seq in tqdm(sequences, desc='Tokenizing sequences', disable=not verbose):
+        encoded_sequence, attention_mask = tokenizer(
+            seq, 
+            padding='max_length', 
+            truncation=True,  # False for not uniform length distribution (truncation) 
+            max_length=max_length
+        ).values()
+        tokenized_sequences.append(encoded_sequence)
+    return tokenized_sequences, attention_mask
+
 
 def esm_setup(wt_seq, sequences, device: str | None = None, verbose: bool = True):
     esm_base_model, esm_lora_model, esm_tokenizer, esm_optimizer = get_esm_models()
@@ -655,7 +682,7 @@ def prosst_setup(wt_seq, pdb_file, sequences, device: str | None = None, verbose
             'x_llm' : x_llm_train_prosst,
             'llm_attention_mask': prosst_attention_mask,
             'llm_vocab': prosst_vocab,
-            'input_ids': input_ids,
+            'wt_input_ids': input_ids,
             'structure_input_ids': structure_input_ids,
             'llm_tokenizer': prosst_tokenizer
         }
diff --git a/pypef/plm/prosst_lora_tune.py b/pypef/plm/prosst_lora_tune.py
@@ -20,15 +20,12 @@
 from scipy.stats import spearmanr
 from tqdm import tqdm
 from peft import LoraConfig, get_peft_model
-from Bio import SeqIO, BiopythonParserWarning
+from Bio import BiopythonParserWarning
 warnings.filterwarnings(action='ignore', category=BiopythonParserWarning)
 
-from pypef.plm.utils import corr_loss
 from pypef.plm.prosst_structure.quantizer import PdbQuantizer
 from pypef.utils.helpers import get_device
-from pypef.plm.esm_lora_tune import tokenize_sequences
 from pypef.plm.utils import load_model_and_tokenizer
-from pypef.plm.inference import plm_inference
 
 
 def prosst_simple_vocab_aa_tokenizer(sequences, vocab, verbose=True):
diff --git a/tests/test_api_functions.py b/tests/test_api_functions.py