Update launch.json and restructure hybrid modeling a bit

niklases · niklases · commit 1a018c49fd56 · 2025-04-14T19:28:03.000+02:00
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -123,6 +123,25 @@
         },
 
         {
+            "name": "Python: PyPEF hybrid LS-TS GREMLIN-DCA-ESM1v avGFP",
+            "type": "debugpy",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                //"-m", "GREMLIN",   // optional, not required  
+                "--ls", "LS.fasl",
+                "--ts", "TS.fasl", 
+                "--params", "GREMLIN",
+                "--llm", "esm"
+            ]
+        },
+
+        { // Test on test set
             "name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA avGFP",
             "type": "debugpy",
             "request": "launch",
@@ -139,6 +158,24 @@
             ]
         },
 
+        { // Test on test set: Hybrid DCA-LLM ESM1v
+            "name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA-ESM1v avGFP",
+            "type": "debugpy",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                //"-m", "GREMLIN",   // optional, not required  
+                "--ts", "TS.fasl", 
+                "--params", "GREMLIN",
+                "--llm", "esm"
+            ]
+        },
+
         {
             "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP",
             "type": "debugpy",
diff --git a/pypef/hybrid/hybrid_model.py b/pypef/hybrid/hybrid_model.py
@@ -1187,16 +1187,17 @@ def performance_ls_ts(
                     f"variants: {len(test_sequences)}. Remaining: {len(test_variants)} "
                     f"(after removing substitutions at gap positions)."
                     )
-
+        print('LLM:', llm)
         if llm == 'esm':
             llm_dict = esm_setup(train_sequences)
+            print('XX', llm_dict)
             x_llm_test = esm_tokenize_sequences(
-                test_sequences, llm_dict['llm_tokenizer'], max_length=len(test_sequences[0])
+                test_sequences, llm_dict['esm1v']['llm_tokenizer'], max_length=len(test_sequences[0])
             )
         elif llm == 'prosst':
             llm_dict = prosst_setup(wt_seq, pdb_file, sequences=train_sequences)
             x_llm_test = prosst_tokenize_sequences(
-                test_sequences, llm_dict['llm_tokenizer'], max_length=len(test_sequences[0])
+                test_sequences, llm_dict['prosst']['llm_tokenizer'], max_length=len(test_sequences[0])
             )
         else:
             llm_dict = None
@@ -1211,29 +1212,27 @@ def performance_ls_ts(
             x_wt=x_wt
         )
         model_name = f'HYBRID{model_type.lower()}{llm.lower()}'
-
         y_test_pred = hybrid_model.hybrid_prediction(np.array(x_test), x_llm_test)
-
         print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)}')
-
         save_model_to_dict_pickle(hybrid_model, model_name)
 
     elif ts_fasta is not None and model_pickle_file is not None and params_file is not None:
         print(f'Taking model from saved model (Pickle file): {model_pickle_file}...')
-
         model, model_type = get_model_and_type(model_pickle_file)
-
         if model_type != 'Hybrid':  # same as below in next elif
             x_test, test_variants, test_sequences, y_test, x_wt, *_ = plmc_or_gremlin_encoding(
                 test_variants, test_sequences, y_test, model_pickle_file, substitution_sep, threads, False)
             y_test_pred = get_delta_e_statistical_model(x_test, x_wt)
         else:  # Hybrid model input requires params from plmc or GREMLIN model
-            #beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor
             x_test, test_variants, test_sequences, y_test, *_ = plmc_or_gremlin_encoding(
                 test_variants, test_sequences, y_test, params_file,
                 substitution_sep, threads, False
             )
-            y_test_pred = model.hybrid_prediction(x_test)
+            if model.llm_model_input is not None:
+                if list(model.llm_model_input.keys())[0] == 'esm1v':
+                    pass
+            else:
+                y_test_pred = model.hybrid_prediction(x_test)
 
     elif ts_fasta is not None and model_pickle_file is None:  # no LS provided --> statistical modeling / no ML
         print(f'No learning set provided, falling back to statistical DCA model: '
diff --git a/pypef/hybrid/hybrid_run.py b/pypef/hybrid/hybrid_run.py
@@ -53,6 +53,7 @@ def run_pypef_hybrid_modeling(arguments):
             threads=threads,
             params_file=arguments['--params'],
             model_pickle_file=arguments['--model'],
+            llm=arguments['--llm'],
             substitution_sep=arguments['--mutation_sep'],
             label=arguments['--label']
         )
diff --git a/pypef/main.py b/pypef/main.py
@@ -143,7 +143,9 @@
     pypef hybrid 
         [--ts TEST_SET] [--ps PREDICTION_SET]
         [--model MODEL] [--params PARAM_FILE]
-        [--ls LEARNING_SET] [--label] [--threads THREADS]
+        [--ls LEARNING_SET] [--label] 
+        [--llm LLM]
+        [--threads THREADS]
     pypef hybrid --model MODEL --params PARAM_FILE
         [--ts TEST_SET] [--label]
         [--ps PREDICTION_SET] [--pmult] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
@@ -206,6 +208,7 @@
                                     (line trimming) [default: 0.5].
   --label                           Label the plot instances [default: False].
   -l --ls LEARNING_SET              Input learning set in .fasta format.
+  --llm LLM                         LLM model to use for hybrid modeling next to DCA (options are 'ESM1v' and 'ProSST').
   -m --model MODEL                  Model (pickle file) for plotting of validation or for
                                     performing predictions.
   --msa MSA_FILE                    Multiple sequence alignment (MSA) in FASTA or A2M format for
@@ -298,7 +301,7 @@
 from schema import Schema, SchemaError, Optional, Or, Use
 
 from pypef.ml.ml_run import run_pypef_pure_ml
-from pypef.dca.dca_run import run_pypef_hybrid_modeling
+from pypef.hybrid.hybrid_run import run_pypef_hybrid_modeling
 from pypef.utils.utils_run import run_pypef_utils
 
 
@@ -330,6 +333,7 @@
     Optional('--inter_gap'): Use(float),
     Optional('--intra_gap'): Use(float),
     Optional('--label'): bool,
+    Optional('--llm'): Or(None, str),
     Optional('--ls'): Or(None, str),
     Optional('--model'): Or(None, str),
     Optional('--msa'): Or(None, str),
diff --git a/scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results.csv b/scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results.csv
@@ -1 +1,11 @@
 No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performance_ESM1v,Untrained_Performance_ProSST,Hybrid_DCA_Trained_Performance_100,Hybrid_DCA_ESM1v_Trained_Performance_100,Hybrid_DCA_ProSST_Trained_Performance_100,Hybrid_DCA_Trained_Performance_200,Hybrid_DCA_ESM1v_Trained_Performance_200,Hybrid_DCA_ProSST_Trained_Performance_200,Hybrid_DCA_Trained_Performance_1000,Hybrid_DCA_ESM1v_Trained_Performance_1000,Hybrid_DCA_ProSST_Trained_Performance_1000,N_Y_test,N_Y_test_100,N_Y_test_200,N_Y_test_1000,Time_in_s
+1,A0A140D2T1_ZIKV_Sourisseau_2019,9576,1,0.3961484373234954,0.025321007234227765,nan,0.39625513159145775,0.3945199199006544,nan,0.3625266358769935,0.3796601052244235,nan,0.42589911431961464,0.44013401721412304,nan,9576,9476,9376,8576,3450
+2,A0A192B1T2_9HIV1_Haddox_2018,12577,1,0.5142308485827871,0.4818033871398063,0.4480413345511842,0.5146527813514364,0.5435129382224032,0.5924128662504711,0.5467805611967641,0.5628488925632386,0.6078673337402053,0.6135023014022313,0.6678319327395137,0.6655014702779698,12577,12477,12377,11577,9738
+3,A0A1I9GEU1_NEIME_Kennouche_2019,922,1,0.05411293279402172,0.07207428613031318,0.047303886240045084,0.04436729926547371,0.04427441555019729,0.08790597435423003,0.06540297172246635,0.0725659812731205,0.03614327174785718,nan,nan,nan,922,822,722,nan,256
+4,A0A247D711_LISMN_Stadelmann_2021,1653,1,0.4714160727397353,0.1922849713838631,0.4683286023339054,0.4842922488159595,0.47869162933481824,0.5869795493921092,0.531315014757049,0.5133715949185581,0.6219219229364262,0.6323348215864005,0.6248817967566451,0.6894997810047412,1653,1553,1453,653,526
+5,A0A2Z5U3Z0_9INFA_Doud_2016,10715,1,0.1391166205539817,0.48300176044919707,0.47710377008999405,0.13967082235859982,0.49375101649184666,0.49294188359007274,0.27802529906449985,0.5248994314331253,0.5642866911518036,0.5196491031680416,0.6504876227143324,0.6457733718778265,10715,10615,10515,9715,5328
+6,A0A2Z5U3Z0_9INFA_Wu_2014,2350,1,0.3648647777916376,0.43310916448256903,0.36612159772291447,0.3671848245849248,0.45217249523558617,0.4603122688694703,0.3689452266258967,0.48361188758319923,0.42902907236054805,0.3967658185607553,0.49496694985733,0.49448058180987625,2350,2250,2150,1350,2488
+7,A4D664_9INFA_Soh_2019,14421,1,-0.10916121986038664,0.04568083925978014,0.2959612431218879,-0.10951341634667816,0.16922806390981557,0.3449684112889143,-0.02816637449647003,0.2659695893301885,0.3762411589983513,0.4045659532993644,0.4649682439579048,0.5442379418082474,14421,14321,14221,13421,8969
+8,A4GRB6_PSEAI_Chen_2020,5004,1,0.6681056494435768,0.543247747835155,0.647351733217166,0.6105347017831526,0.6007582102931497,0.7414608515883568,0.7053638375627392,0.6894205591814954,0.7677593077079674,0.7245479703656822,0.8059450782290521,0.8107707930340956,5004,4904,4804,4004,1751
+9,AACC1_PSEAI_Dandage_2018,1801,1,0.3180712414525488,0.45793953382550573,0.36853292069676097,0.3174612456170627,0.45937287821213785,0.4310318712519802,0.3521756690003161,0.49206057023996924,0.4617532190070763,0.4488626249244256,0.5505813182399806,0.534569265039648,1801,1701,1601,801,869
+10,ACE2_HUMAN_Chan_2020,2223,1,0.24320754065919856,0.1855938942334426,0.2613054581997969,0.2985805551410494,0.24485718938989934,0.353286631689331,0.4023145866828806,0.3372015473315942,0.4700770240532049,0.5643356952550576,0.6012504479478733,0.610115781454495,2223,2123,2023,1223,3995
diff --git a/scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results_clean.csv b/scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results_clean.csv
@@ -0,0 +1,11 @@
+No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performance_ESM1v,Untrained_Performance_ProSST,Hybrid_DCA_Trained_Performance_100,Hybrid_DCA_ESM1v_Trained_Performance_100,Hybrid_DCA_ProSST_Trained_Performance_100,Hybrid_DCA_Trained_Performance_200,Hybrid_DCA_ESM1v_Trained_Performance_200,Hybrid_DCA_ProSST_Trained_Performance_200,Hybrid_DCA_Trained_Performance_1000,Hybrid_DCA_ESM1v_Trained_Performance_1000,Hybrid_DCA_ProSST_Trained_Performance_1000,N_Y_test,N_Y_test_100,N_Y_test_200,N_Y_test_1000,Time_in_s
+1,A0A140D2T1_ZIKV_Sourisseau_2019,9576,1,0.3961484373234954,0.025321007234227765,nan,0.39625513159145775,0.3945199199006544,nan,0.3625266358769935,0.3796601052244235,nan,0.42589911431961464,0.44013401721412304,nan,9576,9476,9376,8576,3450
+2,A0A192B1T2_9HIV1_Haddox_2018,12577,1,0.5142308485827871,0.4818033871398063,0.4480413345511842,0.5146527813514364,0.5435129382224032,0.5924128662504711,0.5467805611967641,0.5628488925632386,0.6078673337402053,0.6135023014022313,0.6678319327395137,0.6655014702779698,12577,12477,12377,11577,9738
+3,A0A1I9GEU1_NEIME_Kennouche_2019,922,1,0.05411293279402172,0.07207428613031318,0.047303886240045084,0.04436729926547371,0.04427441555019729,0.08790597435423003,0.06540297172246635,0.0725659812731205,0.03614327174785718,nan,nan,nan,922,822,722,nan,256
+4,A0A247D711_LISMN_Stadelmann_2021,1653,1,0.4714160727397353,0.1922849713838631,0.4683286023339054,0.4842922488159595,0.47869162933481824,0.5869795493921092,0.531315014757049,0.5133715949185581,0.6219219229364262,0.6323348215864005,0.6248817967566451,0.6894997810047412,1653,1553,1453,653,526
+5,A0A2Z5U3Z0_9INFA_Doud_2016,10715,1,0.1391166205539817,0.48300176044919707,0.47710377008999405,0.13967082235859982,0.49375101649184666,0.49294188359007274,0.27802529906449985,0.5248994314331253,0.5642866911518036,0.5196491031680416,0.6504876227143324,0.6457733718778265,10715,10615,10515,9715,5328
+6,A0A2Z5U3Z0_9INFA_Wu_2014,2350,1,0.3648647777916376,0.43310916448256903,0.36612159772291447,0.3671848245849248,0.45217249523558617,0.4603122688694703,0.3689452266258967,0.48361188758319923,0.42902907236054805,0.3967658185607553,0.49496694985733,0.49448058180987625,2350,2250,2150,1350,2488
+7,A4D664_9INFA_Soh_2019,14421,1,-0.10916121986038664,0.04568083925978014,0.2959612431218879,-0.10951341634667816,0.16922806390981557,0.3449684112889143,-0.02816637449647003,0.2659695893301885,0.3762411589983513,0.4045659532993644,0.4649682439579048,0.5442379418082474,14421,14321,14221,13421,8969
+8,A4GRB6_PSEAI_Chen_2020,5004,1,0.6681056494435768,0.543247747835155,0.647351733217166,0.6105347017831526,0.6007582102931497,0.7414608515883568,0.7053638375627392,0.6894205591814954,0.7677593077079674,0.7245479703656822,0.8059450782290521,0.8107707930340956,5004,4904,4804,4004,1751
+9,AACC1_PSEAI_Dandage_2018,1801,1,0.3180712414525488,0.45793953382550573,0.36853292069676097,0.3174612456170627,0.45937287821213785,0.4310318712519802,0.3521756690003161,0.49206057023996924,0.4617532190070763,0.4488626249244256,0.5505813182399806,0.534569265039648,1801,1701,1601,801,869
+10,ACE2_HUMAN_Chan_2020,2223,1,0.24320754065919856,0.1855938942334426,0.2613054581997969,0.2985805551410494,0.24485718938989934,0.353286631689331,0.4023145866828806,0.3372015473315942,0.4700770240532049,0.5643356952550576,0.6012504479478733,0.610115781454495,2223,2123,2023,1223,3995

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,7 @@ def run_pypef_hybrid_modeling(arguments):`
`53`	`53`	`threads=threads,`
`54`	`54`	`params_file=arguments['--params'],`
`55`	`55`	`model_pickle_file=arguments['--model'],`
	`56`	`+ llm=arguments['--llm'],`
`56`	`57`	`substitution_sep=arguments['--mutation_sep'],`
`57`	`58`	`label=arguments['--label']`
`58`	`59`	`)`