Skip to content

Commit 03b88f8

Browse files
committed
Update gremlin (dev): implement msa_start & msa_end
for trimming MSAs if needed
1 parent 40a7848 commit 03b88f8

File tree

5 files changed

+70
-19
lines changed

5 files changed

+70
-19
lines changed

pypef/dca/gremlin_inference.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@
4949
from tqdm import tqdm
5050
import torch
5151

52+
from pypef.llm.utils import get_batches
53+
from pypef.utils.variant_data import get_mismatches
54+
5255

5356
class GREMLIN:
5457
"""
@@ -65,6 +68,8 @@ def __init__(
6568
eff_cutoff=0.8,
6669
opt_iter=100,
6770
max_msa_seqs: int | None = 10000,
71+
msa_start: None | int = None,
72+
msa_end: None | int = None,
6873
seqs: list[str] | np.ndarray[str] | None =None,
6974
device: str | None = None
7075
):
@@ -98,12 +103,20 @@ def __init__(
98103
else:
99104
self.max_msa_seqs = max_msa_seqs
100105
self.states = len(self.char_alphabet)
106+
self.msa_start = msa_start
107+
if msa_end == 0:
108+
msa_end = None
109+
self.msa_end = msa_end
101110
logger.info('Loading MSA...')
102111
if seqs is None:
103112
self.seqs, self.seq_ids = self.get_sequences_from_msa(alignment)
104113
else:
105114
self.seqs = seqs
106115
self.seq_ids = np.array([n for n in range(len(self.seqs))])
116+
self.first_msa_seq = self.seqs[0]
117+
if self.msa_start is not None or self.msa_end is not None:
118+
logger.info(f'Trimmed sequence length.. first sequence is printed here as '
119+
f'example (Length: {len(self.first_msa_seq)}): {self.first_msa_seq}')
107120
logger.info(f'Found {len(self.seqs)} sequences in the MSA...')
108121
self.msa_ori = self.get_msa_ori()
109122
logger.info(f'MSA shape: {np.shape(self.msa_ori)}')
@@ -153,7 +166,14 @@ def get_sequences_from_msa(self, msa_file: str):
153166
with open(msa_file, 'r') as fh:
154167
alignment = AlignIO.read(fh, "fasta")
155168
for record in alignment:
156-
sequences.append(str(record.seq))
169+
seq = str(record.seq)
170+
if self.msa_start is not None and self.msa_end is not None:
171+
seq = seq[self.msa_start:self.msa_end]
172+
elif self.msa_start is not None:
173+
seq = seq[self.msa_start:]
174+
elif self.msa_end is not None:
175+
seq = seq[:self.msa_end]
176+
sequences.append(seq)
157177
seq_ids.append(str(record.id))
158178
assert len(sequences) == len(seq_ids), f"{len(sequences)}, {len(seq_ids)}"
159179
return np.array(sequences), np.array(seq_ids)
@@ -353,14 +373,14 @@ def run_optimization(self):
353373

354374
self.mt_v, self.vt_v = torch.zeros_like(self.v), torch.zeros_like(self.v)
355375
self.mt_w, self.vt_w = torch.zeros_like(self.w), torch.zeros_like(self.w)
356-
logger.info(f'Initial loss: {self._loss()}')
376+
logger.info(f'Initial loss: {self._loss():.5f}')
357377
for i in range(self.opt_iter):
358378
self.opt_adam_step()
359379
try:
360380
if (i + 1) % int(self.opt_iter / 10) == 0:
361-
logger.info(f'Loss step {i + 1}: {self._loss()}')
381+
logger.info(f'Loss step {i + 1}: {self._loss():.5f}')
362382
except ZeroDivisionError:
363-
logger.info(f'Loss step {i + 1}: {self._loss()}')
383+
logger.info(f'Loss step {i + 1}: {self._loss():.5f}')
364384

365385
self.v = self.v.detach().cpu().numpy()
366386
self.w = self.w.detach().cpu().numpy()
@@ -416,7 +436,20 @@ def get_scores(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.
416436
if v_idx is None:
417437
v_idx = self.v_idx
418438
seqs_int = self.seq2int(seqs)
419-
439+
wt_seq_len = len(self.wt_seq)
440+
#if np.shape(seqs_int)[1] != wt_seq_len:
441+
# raise RuntimeError(
442+
# f"Input sequence shape (length: {np.shape(seqs_int)[1]}) does not match GREMLIN "
443+
# f"MSA shape (common sequence length: {wt_seq_len}) inferred from the MSA."
444+
# )
445+
# Check nums of mutations to MSA first/WT sequence and gives warning if too apart from MSA seq
446+
for i, seq in enumerate(seqs):
447+
n_mismatches, mismatches = get_mismatches(self.wt_seq, seq)
448+
if n_mismatches / wt_seq_len > 0.05:
449+
logger.warning(
450+
f"Sequence {mismatches} contains more than 5% sequence mismatches to the "
451+
f"first MSA/\"WT\" sequence. Effect predictions will likely be incorrect!"
452+
)
420453
try:
421454
if seqs_int.shape[-1] != len(v_idx): # The input sequence length ({seqs_int.shape[-1]})
422455
# does not match the common gap-trimmed MSA sequence length (len(v_idx)
@@ -471,8 +504,16 @@ def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None):
471504
Wrapper function for encoding input sequences using the self.get_scores
472505
function with encode set to True.
473506
"""
474-
xs = self.get_scores(seqs, v, w, v_idx, encode=True)
475-
return xs
507+
xs = []
508+
sequences_batched = get_batches(
509+
seqs, batch_size=1000, dtype=str,
510+
keep_remaining=True, verbose=True
511+
)
512+
sequences_batched = np.atleast_2d(sequences_batched)
513+
514+
for seq_batch in sequences_batched:
515+
xs.append(self.get_scores(seq_batch, v, w, v_idx, encode=True))
516+
return xs[0]
476517

477518
@staticmethod
478519
def normalize(apc_mat):
@@ -691,7 +732,7 @@ def save_corr_csv(gremlin: GREMLIN, min_distance: int = 0, sort_by: str = 'apc')
691732
)
692733
df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv", sep=',')
693734
logger.info(f"Saved coevolution CSV data as "
694-
f"{os.path.abspath(f'coevolution_{sort_by}_sorted.csv')}")
735+
f"{os.path.abspath(f'coevolution_{sort_by}_sorted.csv')}")
695736

696737

697738
def plot_predicted_ssm(gremlin: GREMLIN):

pypef/hybrid/hybrid_model.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ def train_llm(self):
447447
self.input_ids,
448448
self.llm_attention_mask,
449449
self.structure_input_ids,
450-
n_epochs=50,
450+
n_epochs=50,
451451
device=self.device,
452452
verbose=self.verbose
453453
)
@@ -641,7 +641,6 @@ def ls_ts_performance(self):
641641
return spearman_r, reg, beta_1, beta_2
642642

643643

644-
645644
"""
646645
###########################################################################################
647646
# Below: Some helper functions that call or are dependent on the DCALLMHybridModel class. #

pypef/llm/prosst_lora_tune.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def load_model(model, filename):
128128
def prosst_train(
129129
x_sequence_batches, score_batches, loss_fn, model, optimizer,
130130
input_ids, attention_mask, structure_input_ids,
131-
n_epochs=3, device: str | None = None, seed: int | None = None,
131+
n_epochs=50, device: str | None = None, seed: int | None = None,
132132
early_stop: int = 50, verbose: bool = True):
133133
if seed is not None:
134134
torch.manual_seed(seed)
@@ -139,7 +139,7 @@ def prosst_train(
139139
x_sequence_batches = x_sequence_batches.to(device)
140140
score_batches = score_batches.to(device)
141141
pbar_epochs = tqdm(range(1, n_epochs + 1), disable=not verbose)
142-
epoch_spearman_1 = 0.0
142+
epoch_spearman_1 = -1.0
143143
did_not_improve_counter = 0
144144
best_model = None
145145
best_model_epoch = np.nan
@@ -177,7 +177,7 @@ def prosst_train(
177177
f"Y_true: {score_batches.cpu().numpy().flatten()}, "
178178
f"Y_pred: {np.array(y_preds_detached)}"
179179
)
180-
if epoch_spearman_2 > epoch_spearman_1:
180+
if epoch_spearman_2 > epoch_spearman_1 or epoch == 0:
181181
if best_model is not None:
182182
if os.path.isfile(best_model):
183183
os.remove(best_model)

pypef/llm/utils.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,8 @@ def get_batches(a, dtype, batch_size=5,
3535
a_remaining = a[-remaining:]
3636
else:
3737
logger.info(f"Batch size greater than or equal to total array length: "
38-
f"returning full array (of shape: {np.shape(a)})...")
38+
f"returning full array (of shape: {np.shape(a)})...")
3939
if keep_remaining:
40-
return list(a)
41-
else:
4240
return a
4341
if len(orig_shape) == 2:
4442
a = a.reshape(np.shape(a)[0] // batch_size, batch_size, np.shape(a)[1])
@@ -47,10 +45,9 @@ def get_batches(a, dtype, batch_size=5,
4745
new_shape = np.shape(a)
4846
if verbose:
4947
logger.info(f'{orig_shape} -> {new_shape} (dropped {remaining})')
50-
if keep_remaining: # Returning a list
51-
a = list(a)
48+
if keep_remaining:
5249
logger.info('Adding dropped back to batches as last batch...')
53-
a.append(a_remaining)
50+
a = np.append(a, a_remaining)
5451
return a
5552
if keep_numpy:
5653
return a

pypef/utils/variant_data.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os
66
import numpy as np
77
import pandas as pd
8+
import warnings
89

910
import logging
1011
logger = logging.getLogger('pypef.utils.variant_data')
@@ -462,3 +463,16 @@ def read_csv_and_shift_pos_ints(
462463
data = np.array([new_col, column_2]).T
463464
new_df = pd.DataFrame(data, columns=['variant', 'fitness'])
464465
new_df.to_csv(infile[:-4] + '_new' + infile[-4:], sep=';', index=False)
466+
467+
468+
def get_mismatches(seq_a: str, seq_b: str) -> tuple[int, str]:
    """Count position-wise mismatches between two equal-length sequences.

    Each mismatch is encoded as ``{aa_a}{pos}{aa_b}`` with a 1-based
    position, e.g. comparing "AC" with "AG" yields ``(1, "C2G")``.

    Parameters
    ----------
    seq_a : str
        Reference sequence (e.g. the first MSA/"WT" sequence).
    seq_b : str
        Sequence to compare against ``seq_a``; must have the same length.

    Returns
    -------
    tuple[int, str]
        Number of mismatching positions and a comma-joined mismatch string
        (empty string if the sequences are identical).

    Raises
    ------
    RuntimeError
        If the two sequences differ in length.
    """
    if len(seq_a) != len(seq_b):
        # Mismatched lengths make position-wise comparison meaningless.
        logger.warning("Sequence lengths do not match!")
        raise RuntimeError(f"{seq_a}\n{len(seq_a)}\n{seq_b}\n{len(seq_b)}")
    # Collect mismatches in a list and join once (avoids quadratic str +=).
    mismatches = [
        f"{aa}{pos}{seq_b[pos - 1]}"
        for pos, aa in enumerate(seq_a, start=1)
        if aa != seq_b[pos - 1]
    ]
    return len(mismatches), ",".join(mismatches)

0 commit comments

Comments
 (0)