Skip to content

Commit 2158ac6

Browse files
committed
Getting GREMLIN sequence encodings in batches
of size 1000
1 parent 43ccba5 commit 2158ac6

File tree

4 files changed

+22
-7
lines changed

4 files changed

+22
-7
lines changed

pypef/dca/gremlin_inference.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -428,10 +428,9 @@ def get_scores(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.
428428
seqs_int = self.seq2int(seqs)
429429

430430
try:
431-
if seqs_int.shape[-1] != len(v_idx):
432-
#logger.info(f'The input sequence length ({seqs_int.shape[-1]}) does not match the common gap-trimmed MSA sequence length ({len(v_idx)})!')
433-
seqs_int = seqs_int[..., v_idx]
434-
#logger.info(f'Updated shape: ({seqs_int.shape[-1]}) matches common MSA sequence length ({len(v_idx)}) now')
431+
if seqs_int.shape[-1] != len(v_idx): # The input sequence length ({seqs_int.shape[-1]})
432+
# does not match the common gap-trimmed MSA sequence length (len(v_idx)
433+
seqs_int = seqs_int[..., v_idx] # Shape matches common MSA sequence length (len(v_idx)) now
435434
except IndexError:
436435
raise SystemError(
437436
"The loaded GREMLIN parameter model does not match the input model "

pypef/llm/esm_lora_tune.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,19 +103,25 @@ def corr_loss(y_true: torch.Tensor, y_pred: torch.Tensor):
103103

104104

105105
def get_batches(a, dtype, batch_size=5,
                keep_numpy: bool = False, keep_remaining: bool = False,
                verbose: bool = False):
    """Split ``a`` into consecutive batches of ``batch_size`` elements.

    Parameters
    ----------
    a : array-like
        1-D or 2-D input sequence to batch.
    dtype :
        NumPy-compatible dtype for the intermediate array (and target
        dtype of the returned torch tensor).
    batch_size : int
        Number of elements per batch; a trailing remainder of
        ``len(a) % batch_size`` elements is dropped from the reshaped
        array (and optionally re-appended, see ``keep_remaining``).
    keep_numpy : bool
        If True, return the reshaped NumPy array instead of a torch tensor.
    keep_remaining : bool
        If True, return a list of batches with the dropped remainder
        appended as a final (shorter) batch, so no element is lost.
    verbose : bool
        If True, print the shape change and the number of dropped elements.

    Returns
    -------
    list of np.ndarray if ``keep_remaining``, np.ndarray if ``keep_numpy``,
    otherwise ``torch.Tensor``.
    """
    a = np.asarray(a, dtype=dtype)
    orig_shape = np.shape(a)
    remaining = len(a) % batch_size
    # Capture the tail BEFORE truncating: taking a[-remaining:] after the
    # truncation would lose the dropped elements and instead duplicate
    # items that are already part of the last full batch.
    a_remaining = a[len(a) - remaining:]
    if remaining != 0:
        a = a[:-remaining]
    if len(orig_shape) == 2:
        a = a.reshape(np.shape(a)[0] // batch_size, batch_size, np.shape(a)[1])
    else:  # elif len(orig_shape) == 1:
        a = a.reshape(np.shape(a)[0] // batch_size, batch_size)
    new_shape = np.shape(a)
    if verbose:
        print(f'{orig_shape} -> {new_shape} (dropped {remaining})')
    if keep_remaining:  # Returning a list
        batches = list(a)
        # Guard against remaining == 0: the original unconditionally
        # appended a_remaining, which raised NameError (unbound) when
        # len(a) divided evenly by batch_size.
        if remaining != 0:
            print('Adding dropped back to batches as last batch...')
            batches.append(a_remaining)
        return batches
    if keep_numpy:
        return a
    return torch.Tensor(a).to(dtype)

scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,5 @@ No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performanc
155155
165,FECA_ECOLI_Tsuboyama_2023_2D1U,1886,2,0.40408514501427906,0.3139309947296232,0.5738858975054831,0.4523259703428921,0.5052927011009388,0.6656143272779307,0.4769453507671686,0.5682842582146705,0.7580245116989018,0.6098073159278943,0.7656754659582231,nan,1886,1786,1686,886,346
156156
166,GCN4_YEAST_Staller_2018,2638,44,0.25011546899669806,-0.006027620813706041,0.22764968209696385,0.24358636362901392,0.241174939429571,nan,0.2544759868327667,0.36879030431219906,nan,0.2859991050918842,0.5125215331300973,nan,2638,2538,2438,1638,684
157157
167,GFP_AEQVI_Sarkisyan_2016,51714,15,0.6406366653494072,0.1336688728267403,0.6860965519817945,0.6422492276272843,0.6495590297325259,nan,0.6486786880849276,0.6360629034990605,nan,0.7463828216993244,0.7711781697848346,nan,51714,51614,51514,50714,8310
158+
168,GRB2_HUMAN_Faure_2021,63366,2,0.5258434005381363,0.5367412810228084,0.7216670654700138,0.5839763004949682,0.6852622116666458,0.697364904206813,0.6746840807821936,0.7103753714289707,0.7316071269759475,0.708765178129165,0.7742172713240596,0.7938850307943259,63366,63266,63166,62366,13344
159+
169,HECD1_HUMAN_Tsuboyama_2023_3DKM,5586,2,0.28623326119763287,0.2150000228393089,0.2406307470397028,0.5991137006846494,0.6520916880669738,0.7123845116845012,0.671240119874008,0.6713008248409833,0.715769538708621,0.6959516975099861,0.7840773924211748,0.7706682813598381,5586,5486,5386,4586,986

scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import gc
55
import time
66
import warnings
7+
import psutil
78
import json
9+
from tqdm import tqdm
810
import pandas as pd
911
import numpy as np
1012
import torch
@@ -60,6 +62,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
6062
get_vram()
6163
MAX_WT_SEQUENCE_LENGTH = 1000
6264
print(f"Maximum sequence length: {MAX_WT_SEQUENCE_LENGTH}")
65+
print(f"Loading LLM models into {device} device...")
6366
prosst_base_model, prosst_lora_model, prosst_tokenizer, prosst_optimizer = get_prosst_models()
6467
prosst_vocab = prosst_tokenizer.get_vocab()
6568
prosst_base_model = prosst_base_model.to(device)
@@ -89,7 +92,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
8992
# print('Continuing (TODO: requires cut of PDB input structure residues)...')
9093
# continue
9194
# Getting % usage of virtual_memory (3rd field)
92-
import psutil;print(f'RAM used: {round(psutil.virtual_memory()[3]/1E9, 3)} '
95+
print(f'RAM used: {round(psutil.virtual_memory()[3]/1E9, 3)} '
9396
f'GB ({psutil.virtual_memory()[2]} %)')
9497
variant_fitness_data = pd.read_csv(csv_path, sep=',')
9598
print('N_variant-fitness-tuples:', np.shape(variant_fitness_data)[0])
@@ -142,7 +145,12 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
142145

143146
print('GREMLIN-DCA: optimization...')
144147
gremlin = GREMLIN(alignment=msa_path, opt_iter=100, optimize=True)
145-
x_dca = gremlin.collect_encoded_sequences(sequences)
148+
sequences_batched = get_batches(sequences, batch_size=1000,
149+
dtype=str, keep_remaining=True, verbose=True)
150+
x_dca = []
151+
for seq_b in tqdm(sequences_batched, desc="Getting GREMLIN sequence encodings"):
152+
for x in gremlin.collect_encoded_sequences(seq_b):
153+
x_dca.append(x)
146154
x_wt = gremlin.x_wt
147155
y_pred_dca = get_delta_e_statistical_model(x_dca, x_wt)
148156
print(f'DCA (unsupervised performance): {spearmanr(fitnesses, y_pred_dca)[0]:.3f}')

0 commit comments

Comments
 (0)