Skip to content

Commit 4bca8f1

Browse files
committed
Update model loading process to work offline if models are available
1 parent cfd03dd commit 4bca8f1

File tree

4 files changed

+79
-53
lines changed

4 files changed

+79
-53
lines changed

pypef/llm/esm_lora_tune.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,17 @@
2828
from peft import LoraConfig, get_peft_model
2929
from transformers import logging as hf_logging
3030
hf_logging.set_verbosity_error()
31-
from transformers import EsmForMaskedLM, EsmTokenizer
3231

3332
from pypef.utils.helpers import get_device
34-
from pypef.llm.utils import corr_loss
33+
from pypef.llm.utils import corr_loss, load_model_and_tokenizer
3534

3635

3736
def get_esm_models():
38-
base_model = EsmForMaskedLM.from_pretrained(f'facebook/esm1v_t33_650M_UR90S_3')
39-
tokenizer = EsmTokenizer.from_pretrained(f'facebook/esm1v_t33_650M_UR90S_3')
37+
base_model, tokenizer = load_model_and_tokenizer(
38+
f'facebook/esm1v_t33_650M_UR90S_3'
39+
# Just sticking to AutoModelForMaskedLM and AutoTokenizer
40+
# instead of EsmForMaskedLM and EsmTokenizer
41+
)
4042
peft_config = LoraConfig(r=8, target_modules=["query", "value"])
4143
lora_model = get_peft_model(base_model, peft_config)
4244
optimizer = torch.optim.Adam(lora_model.parameters(), lr=0.01)

pypef/llm/prosst_lora_tune.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import numpy as np
2222
from scipy.stats import spearmanr
2323
from tqdm import tqdm
24-
from transformers import AutoModelForMaskedLM, AutoTokenizer
2524
from peft import LoraConfig, get_peft_model
2625
from Bio import SeqIO, BiopythonParserWarning
2726
warnings.filterwarnings(action='ignore', category=BiopythonParserWarning)

pypef/llm/utils.py

Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import torch
66
import os
77
import platform
8-
from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
98
from transformers import AutoModelForMaskedLM, AutoTokenizer
109
from transformers.utils import logging as ts_logging
1110
ts_logging.set_verbosity_error()
@@ -64,42 +63,83 @@ def get_default_cache_dir():
6463
"""
6564
system = platform.system()
6665
if system == "Windows":
67-
return os.path.join(os.environ.get("USERPROFILE", ""), ".cache", "huggingface", "transformers")
66+
return os.path.join(
67+
os.environ.get("USERPROFILE", ""), ".cache",
68+
"huggingface", "transformers"
69+
)
6870
elif system == "Darwin":
6971
return os.path.expanduser("~/.cache/huggingface/transformers")
7072
else: # Assume Linux or other Unix-like systems
7173
return os.path.expanduser("~/.cache/huggingface/transformers")
7274

7375

74-
def is_model_cached(repo_id: str, cache_dir: str) -> bool:
76+
def is_model_cached(repo_id: str, cache_dir: str):
7577
"""
7678
Check if the required model and tokenizer files are cached locally.
7779
"""
78-
79-
filepath = try_to_load_from_cache(repo_id=repo_id, filename='model.safetensors', cache_dir=cache_dir)
80-
if isinstance(filepath, str):
81-
return True # file is cached
82-
elif filepath is _CACHED_NO_EXIST:
83-
return False # non-existence of file is cached
80+
snapshot_dir = None
81+
if os.path.isdir(cache_dir):
82+
ref_file = os.path.join(
83+
cache_dir, f'models--{repo_id.replace("/", '--')}', 'refs', 'main'
84+
)
85+
if os.path.isfile(ref_file):
86+
with open(ref_file, 'r') as fh:
87+
t = fh.readlines()
88+
ref = t[0].strip()
89+
else:
90+
return False, snapshot_dir
91+
snapshot_dir = os.path.join(
92+
cache_dir, f'models--{repo_id.replace("/", '--')}', 'snapshots', ref
93+
)
94+
if os.path.isdir(snapshot_dir):
95+
return True, snapshot_dir
96+
else:
97+
return False, None
8498
else:
85-
return False # file is not cached and not in non-existence cache
99+
return False, snapshot_dir
86100

87101

88-
def load_model_and_tokenizer(model_name, cache_dir: str | os.PathLike | None = None):
102+
def load_model_and_tokenizer(
103+
model_name: str,
104+
cache_dir: str | os.PathLike | None = None,
105+
model_loader=None,
106+
tokenizer_loader=None
107+
):
89108
"""
90109
Load the model and tokenizer from cache directory. Downloads to cache if not present.
91110
"""
92111
if cache_dir is None:
93112
cache_dir = get_default_cache_dir()
94-
if is_model_cached(model_name, cache_dir):
95-
logger.info(f"Loading model and tokenizer from cache {cache_dir}...")
113+
if model_loader is None:
114+
model_loader = AutoModelForMaskedLM
115+
if tokenizer_loader is None:
116+
tokenizer_loader = AutoTokenizer
117+
exists, exists_at = is_model_cached(model_name, cache_dir)
118+
if exists:
119+
try:
120+
logger.info(f"Loading model and tokenizer from cache {exists_at}...")
121+
model = model_loader.from_pretrained(
122+
exists_at, trust_remote_code=True
123+
)
124+
tokenizer = tokenizer_loader.from_pretrained(
125+
exists_at, trust_remote_code=True
126+
)
127+
except OSError as e:
128+
logger.info(f"Faced error \"{e}\": Trying to load with regular cache load path...")
129+
model = model_loader.from_pretrained(
130+
model_name, cache_dir=cache_dir, trust_remote_code=True
131+
)
132+
tokenizer = tokenizer_loader.from_pretrained(
133+
model_name, cache_dir=cache_dir, trust_remote_code=True
134+
)
96135
else:
97136
logger.info(f"Did not find model and tokenizer in cache directory, downloading model "
98-
f"and tokenizer from the internet and storing in cache {cache_dir}...")
99-
model = AutoModelForMaskedLM.from_pretrained(
100-
model_name, cache_dir=cache_dir, trust_remote_code=True
101-
)
102-
tokenizer = AutoTokenizer.from_pretrained(
103-
model_name, cache_dir=cache_dir, trust_remote_code=True
104-
)
137+
f"and tokenizer from the internet and storing in cache {cache_dir}...")
138+
model = model_loader.from_pretrained(
139+
model_name, cache_dir=cache_dir, trust_remote_code=True
140+
)
141+
tokenizer = tokenizer_loader.from_pretrained(
142+
model_name, cache_dir=cache_dir, trust_remote_code=True
143+
)
144+
logger.info("Model and tokenizer loaded successfully...")
105145
return model, tokenizer

scripts/ProteinGym_runs/protgym_hybrid_perf_test_crossval.py

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
5050
get_vram()
5151
MAX_WT_SEQUENCE_LENGTH = 600 # TODO: 1000
5252
MAX_VARIANT_FITNESS_PAIRS = 5000
53+
N_CV = 5
5354
print(f"Maximum sequence length: {MAX_WT_SEQUENCE_LENGTH}")
5455
print(f"Loading LLM models into {device} device...")
5556
prosst_base_model, prosst_lora_model, prosst_tokenizer, prosst_optimizer = get_prosst_models()
@@ -160,11 +161,6 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
160161
x_esm, esm_attention_mask = esm_tokenize_sequences(
161162
sequences, esm_tokenizer, max_length=len(wt_seq), verbose=False
162163
)
163-
#y_esm = esm_infer(
164-
# get_batches(x_esm, dtype=float, batch_size=1),
165-
# esm_attention_mask,
166-
# esm_base_model
167-
#)
168164
y_esm = inference(sequences, 'esm', model=esm_base_model, verbose=False)
169165
print(f'ESM1v (unsupervised performance): '
170166
f'{spearmanr(fitnesses, y_esm.cpu())[0]:.3f}')
@@ -177,10 +173,6 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
177173
pdb, prosst_tokenizer, wt_seq, verbose=False
178174
)
179175
x_prosst = prosst_tokenize_sequences(sequences=sequences, vocab=prosst_vocab, verbose=False)
180-
#y_prosst = get_logits_from_full_seqs(
181-
# x_prosst, prosst_base_model, input_ids, prosst_attention_mask,
182-
# structure_input_ids, train=False
183-
#)
184176
y_prosst = inference(sequences, 'prosst', pdb_file=pdb, wt_seq=wt_seq, model=prosst_base_model, verbose=False)
185177
print(f'ProSST (unsupervised performance): '
186178
f'{spearmanr(fitnesses, y_prosst.cpu())[0]:.3f}')
@@ -192,8 +184,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
192184
print('Both LLM\'s had RunTimeErrors, skipping dataset...')
193185
continue
194186

195-
ns_y_test = [len(variants)]
196-
ds = DatasetSplitter(df_or_csv_file=csv_path, n_cv=5, mutation_separator=mut_sep)
187+
ds = DatasetSplitter(df_or_csv_file=csv_path, n_cv=N_CV, mutation_separator=mut_sep)
197188
ds.plot_distributions()
198189
if max_muts >= 2: # Only using random cross-validation splits
199190
print("Only performing random splits as data contains multi-substituted variants...")
@@ -202,17 +193,20 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
202193
print("Only single substituted variants found, performing random, modulo, and continuous data splits...")
203194
target_split_indices = ds.get_all_split_indices()
204195
temp_results = {}
205-
# TODO: Get correct indices for full df for multi-muts using DatasetSplitter!
196+
for c in ["Random", "Modulo", "Continuous"]:
197+
temp_results.update({c: {}})
198+
for s in range(N_CV):
199+
temp_results[c].update({f'Split {s}': {}})
200+
for m in ['DCA', 'ESM1v', 'ProSST', 'DCA hybrid', 'DCA+ESM1v hybrid', 'DCA+ProSST hybrid']:
201+
# Prefill with NaN's
202+
temp_results[c][f'Split {s}'].update({m: np.nan})
206203
for i_category, (train_indices, test_indices) in enumerate(target_split_indices):
207204
category = ["Random", "Modulo", "Continuous"][i_category]
208205
print(f'Category: {category}')
209-
temp_results.update({category: {}})
210206
for i_split, (train_i, test_i) in enumerate(zip(
211207
train_indices, test_indices
212208
)):
213209
print(f' Split: {i_split + 1}')
214-
print(test_i)
215-
temp_results[category].update({f'Split {i_split}': {}})
216210
try:
217211
_train_sequences, test_sequences = np.asarray(sequences)[train_i], np.asarray(sequences)[test_i]
218212
x_dca_train, x_dca_test = np.asarray(x_dca)[train_i], np.asarray(x_dca)[test_i]
@@ -224,14 +218,10 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
224218
esm_lora_model_2 = copy.deepcopy(esm_lora_model)
225219
esm_optimizer = torch.optim.Adam(esm_lora_model_2.parameters(), lr=0.0001)
226220
train_size, test_size = len(train_i), len(test_i)
227-
#get_vram()
228221
except ValueError as e:
229222
print(f"Only {len(fitnesses)} variant-fitness pairs in total, "
230223
f"cannot split the data in N_Train = {train_size} and N_Test "
231224
f"(N_Total - N_Train) [Excepted error: {e}].")
232-
for m in ['DCA', 'ESM1v', 'ProSST', 'DCA hybrid', 'DCA+ESM1v hybrid', 'DCA+ProSST hybrid']:
233-
temp_results[category][f'Split {i_split}'].update({m: np.nan})
234-
ns_y_test.append(np.nan)
235225
continue
236226
(
237227
x_dca_train,
@@ -276,9 +266,6 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
276266
f"in N_Train = {len(y_train)} and N_Test = {len(y_test)} "
277267
f"results in N_Test <= 50 variants - not getting "
278268
f"performance for N_Train = {len(y_train)}...")
279-
ns_y_test.append(np.nan)
280-
for m in ['DCA', 'ESM1v', 'ProSST', 'DCA hybrid', 'DCA+ESM1v hybrid', 'DCA+ProSST hybrid']:
281-
temp_results[category][f'Split {i_split}'].update({m: np.nan})
282269
continue
283270

284271
y_test_pred_dca = get_delta_e_statistical_model(x_dca_test, x_wt)
@@ -313,10 +300,8 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
313300
print(f' {m_str} (split {i_split + 1}) performance: {spearmanr(y_test, y_test_pred)[0]:.3f} '
314301
f'(train size={train_size}, test_size={test_size})')
315302
temp_results[category][f'Split {i_split}'].update({m_str: spearmanr(y_test, y_test_pred)[0]})
316-
except RuntimeError as e: # modeling_prosst.py, line 920, in forward
317-
# or UnboundLocalError in prosst_lora_tune.py, line 167
318-
temp_results[category][f'Split {i_split}'].update({m_str: np.nan})
319-
ns_y_test.append(len(y_test_pred))
303+
except RuntimeError as e: # modeling_prosst.py in forward
304+
continue
320305
del prosst_lora_model_2
321306
del esm_lora_model_2
322307
torch.cuda.empty_cache()
@@ -358,7 +343,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
358343
f'{int(dt)}\n')
359344

360345

361-
def plot_csv_data(csv, plot_name):
346+
def plot_csv_data(csv):
362347
plt.figure(figsize=(24, 12))
363348
sns.set_style("whitegrid")
364349
df = pd.read_csv(csv, sep=',')
@@ -487,4 +472,4 @@ def plot_csv_data(csv, plot_name):
487472
):
488473
fh2.write(line)
489474

490-
plot_csv_data(csv=clean_out_results_csv, plot_name='mut_performance')
475+
plot_csv_data(csv=clean_out_results_csv)

0 commit comments

Comments
 (0)