Skip to content

Commit 1a018c4

Browse files
committed
Update launch.json and restructure hybrid modeling a bit
1 parent d8a47e7 commit 1a018c4

File tree

6 files changed

+74
-12
lines changed

6 files changed

+74
-12
lines changed

.vscode/launch.json

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,25 @@
123123
},
124124

125125
{
126+
"name": "Python: PyPEF hybrid LS-TS GREMLIN-DCA-ESM1v avGFP",
127+
"type": "debugpy",
128+
"request": "launch",
129+
"env": {"PYTHONPATH": "${workspaceFolder}"},
130+
"program": "${workspaceFolder}/pypef/main.py",
131+
"console": "integratedTerminal",
132+
"justMyCode": true,
133+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
134+
"args": [
135+
"hybrid",
136+
//"-m", "GREMLIN", // optional, not required
137+
"--ls", "LS.fasl",
138+
"--ts", "TS.fasl",
139+
"--params", "GREMLIN",
140+
"--llm", "esm"
141+
]
142+
},
143+
144+
{ // Test on test set
126145
"name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA avGFP",
127146
"type": "debugpy",
128147
"request": "launch",
@@ -139,6 +158,24 @@
139158
]
140159
},
141160

161+
{ // Test on test set: Hybrid DCA-LLM ESM1v
162+
"name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA-ESM1v avGFP",
163+
"type": "debugpy",
164+
"request": "launch",
165+
"env": {"PYTHONPATH": "${workspaceFolder}"},
166+
"program": "${workspaceFolder}/pypef/main.py",
167+
"console": "integratedTerminal",
168+
"justMyCode": true,
169+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
170+
"args": [
171+
"hybrid",
172+
//"-m", "GREMLIN", // optional, not required
173+
"--ts", "TS.fasl",
174+
"--params", "GREMLIN",
175+
"--llm", "esm"
176+
]
177+
},
178+
142179
{
143180
"name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP",
144181
"type": "debugpy",

pypef/hybrid/hybrid_model.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,16 +1187,17 @@ def performance_ls_ts(
11871187
f"variants: {len(test_sequences)}. Remaining: {len(test_variants)} "
11881188
f"(after removing substitutions at gap positions)."
11891189
)
1190-
1190+
print('LLM:', llm)
11911191
if llm == 'esm':
11921192
llm_dict = esm_setup(train_sequences)
1193+
print('XX', llm_dict)
11931194
x_llm_test = esm_tokenize_sequences(
1194-
test_sequences, llm_dict['llm_tokenizer'], max_length=len(test_sequences[0])
1195+
test_sequences, llm_dict['esm1v']['llm_tokenizer'], max_length=len(test_sequences[0])
11951196
)
11961197
elif llm == 'prosst':
11971198
llm_dict = prosst_setup(wt_seq, pdb_file, sequences=train_sequences)
11981199
x_llm_test = prosst_tokenize_sequences(
1199-
test_sequences, llm_dict['llm_tokenizer'], max_length=len(test_sequences[0])
1200+
test_sequences, llm_dict['prosst']['llm_tokenizer'], max_length=len(test_sequences[0])
12001201
)
12011202
else:
12021203
llm_dict = None
@@ -1211,29 +1212,27 @@ def performance_ls_ts(
12111212
x_wt=x_wt
12121213
)
12131214
model_name = f'HYBRID{model_type.lower()}{llm.lower()}'
1214-
12151215
y_test_pred = hybrid_model.hybrid_prediction(np.array(x_test), x_llm_test)
1216-
12171216
print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)}')
1218-
12191217
save_model_to_dict_pickle(hybrid_model, model_name)
12201218

12211219
elif ts_fasta is not None and model_pickle_file is not None and params_file is not None:
12221220
print(f'Taking model from saved model (Pickle file): {model_pickle_file}...')
1223-
12241221
model, model_type = get_model_and_type(model_pickle_file)
1225-
12261222
if model_type != 'Hybrid': # same as below in next elif
12271223
x_test, test_variants, test_sequences, y_test, x_wt, *_ = plmc_or_gremlin_encoding(
12281224
test_variants, test_sequences, y_test, model_pickle_file, substitution_sep, threads, False)
12291225
y_test_pred = get_delta_e_statistical_model(x_test, x_wt)
12301226
else: # Hybrid model input requires params from plmc or GREMLIN model
1231-
#beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor
12321227
x_test, test_variants, test_sequences, y_test, *_ = plmc_or_gremlin_encoding(
12331228
test_variants, test_sequences, y_test, params_file,
12341229
substitution_sep, threads, False
12351230
)
1236-
y_test_pred = model.hybrid_prediction(x_test)
1231+
if model.llm_model_input is not None:
1232+
if list(model.llm_model_input.keys())[0] == 'esm1v':
1233+
pass
1234+
else:
1235+
y_test_pred = model.hybrid_prediction(x_test)
12371236

12381237
elif ts_fasta is not None and model_pickle_file is None: # no LS provided --> statistical modeling / no ML
12391238
print(f'No learning set provided, falling back to statistical DCA model: '
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def run_pypef_hybrid_modeling(arguments):
5353
threads=threads,
5454
params_file=arguments['--params'],
5555
model_pickle_file=arguments['--model'],
56+
llm=arguments['--llm'],
5657
substitution_sep=arguments['--mutation_sep'],
5758
label=arguments['--label']
5859
)

pypef/main.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,9 @@
143143
pypef hybrid
144144
[--ts TEST_SET] [--ps PREDICTION_SET]
145145
[--model MODEL] [--params PARAM_FILE]
146-
[--ls LEARNING_SET] [--label] [--threads THREADS]
146+
[--ls LEARNING_SET] [--label]
147+
[--llm LLM]
148+
[--threads THREADS]
147149
pypef hybrid --model MODEL --params PARAM_FILE
148150
[--ts TEST_SET] [--label]
149151
[--ps PREDICTION_SET] [--pmult] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
@@ -206,6 +208,7 @@
206208
(line trimming) [default: 0.5].
207209
--label Label the plot instances [default: False].
208210
-l --ls LEARNING_SET Input learning set in .fasta format.
211+
--llm LLM Language model to use for hybrid modeling next to DCA (options are 'esm' for ESM1v and 'prosst' for ProSST).
209212
-m --model MODEL Model (pickle file) for plotting of validation or for
210213
performing predictions.
211214
--msa MSA_FILE Multiple sequence alignment (MSA) in FASTA or A2M format for
@@ -298,7 +301,7 @@
298301
from schema import Schema, SchemaError, Optional, Or, Use
299302

300303
from pypef.ml.ml_run import run_pypef_pure_ml
301-
from pypef.dca.dca_run import run_pypef_hybrid_modeling
304+
from pypef.hybrid.hybrid_run import run_pypef_hybrid_modeling
302305
from pypef.utils.utils_run import run_pypef_utils
303306

304307

@@ -330,6 +333,7 @@
330333
Optional('--inter_gap'): Use(float),
331334
Optional('--intra_gap'): Use(float),
332335
Optional('--label'): bool,
336+
Optional('--llm'): Or(None, str),
333337
Optional('--ls'): Or(None, str),
334338
Optional('--model'): Or(None, str),
335339
Optional('--msa'): Or(None, str),
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,11 @@
11
No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performance_ESM1v,Untrained_Performance_ProSST,Hybrid_DCA_Trained_Performance_100,Hybrid_DCA_ESM1v_Trained_Performance_100,Hybrid_DCA_ProSST_Trained_Performance_100,Hybrid_DCA_Trained_Performance_200,Hybrid_DCA_ESM1v_Trained_Performance_200,Hybrid_DCA_ProSST_Trained_Performance_200,Hybrid_DCA_Trained_Performance_1000,Hybrid_DCA_ESM1v_Trained_Performance_1000,Hybrid_DCA_ProSST_Trained_Performance_1000,N_Y_test,N_Y_test_100,N_Y_test_200,N_Y_test_1000,Time_in_s
2+
1,A0A140D2T1_ZIKV_Sourisseau_2019,9576,1,0.3961484373234954,0.025321007234227765,nan,0.39625513159145775,0.3945199199006544,nan,0.3625266358769935,0.3796601052244235,nan,0.42589911431961464,0.44013401721412304,nan,9576,9476,9376,8576,3450
3+
2,A0A192B1T2_9HIV1_Haddox_2018,12577,1,0.5142308485827871,0.4818033871398063,0.4480413345511842,0.5146527813514364,0.5435129382224032,0.5924128662504711,0.5467805611967641,0.5628488925632386,0.6078673337402053,0.6135023014022313,0.6678319327395137,0.6655014702779698,12577,12477,12377,11577,9738
4+
3,A0A1I9GEU1_NEIME_Kennouche_2019,922,1,0.05411293279402172,0.07207428613031318,0.047303886240045084,0.04436729926547371,0.04427441555019729,0.08790597435423003,0.06540297172246635,0.0725659812731205,0.03614327174785718,nan,nan,nan,922,822,722,nan,256
5+
4,A0A247D711_LISMN_Stadelmann_2021,1653,1,0.4714160727397353,0.1922849713838631,0.4683286023339054,0.4842922488159595,0.47869162933481824,0.5869795493921092,0.531315014757049,0.5133715949185581,0.6219219229364262,0.6323348215864005,0.6248817967566451,0.6894997810047412,1653,1553,1453,653,526
6+
5,A0A2Z5U3Z0_9INFA_Doud_2016,10715,1,0.1391166205539817,0.48300176044919707,0.47710377008999405,0.13967082235859982,0.49375101649184666,0.49294188359007274,0.27802529906449985,0.5248994314331253,0.5642866911518036,0.5196491031680416,0.6504876227143324,0.6457733718778265,10715,10615,10515,9715,5328
7+
6,A0A2Z5U3Z0_9INFA_Wu_2014,2350,1,0.3648647777916376,0.43310916448256903,0.36612159772291447,0.3671848245849248,0.45217249523558617,0.4603122688694703,0.3689452266258967,0.48361188758319923,0.42902907236054805,0.3967658185607553,0.49496694985733,0.49448058180987625,2350,2250,2150,1350,2488
8+
7,A4D664_9INFA_Soh_2019,14421,1,-0.10916121986038664,0.04568083925978014,0.2959612431218879,-0.10951341634667816,0.16922806390981557,0.3449684112889143,-0.02816637449647003,0.2659695893301885,0.3762411589983513,0.4045659532993644,0.4649682439579048,0.5442379418082474,14421,14321,14221,13421,8969
9+
8,A4GRB6_PSEAI_Chen_2020,5004,1,0.6681056494435768,0.543247747835155,0.647351733217166,0.6105347017831526,0.6007582102931497,0.7414608515883568,0.7053638375627392,0.6894205591814954,0.7677593077079674,0.7245479703656822,0.8059450782290521,0.8107707930340956,5004,4904,4804,4004,1751
10+
9,AACC1_PSEAI_Dandage_2018,1801,1,0.3180712414525488,0.45793953382550573,0.36853292069676097,0.3174612456170627,0.45937287821213785,0.4310318712519802,0.3521756690003161,0.49206057023996924,0.4617532190070763,0.4488626249244256,0.5505813182399806,0.534569265039648,1801,1701,1601,801,869
11+
10,ACE2_HUMAN_Chan_2020,2223,1,0.24320754065919856,0.1855938942334426,0.2613054581997969,0.2985805551410494,0.24485718938989934,0.353286631689331,0.4023145866828806,0.3372015473315942,0.4700770240532049,0.5643356952550576,0.6012504479478733,0.610115781454495,2223,2123,2023,1223,3995
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performance_ESM1v,Untrained_Performance_ProSST,Hybrid_DCA_Trained_Performance_100,Hybrid_DCA_ESM1v_Trained_Performance_100,Hybrid_DCA_ProSST_Trained_Performance_100,Hybrid_DCA_Trained_Performance_200,Hybrid_DCA_ESM1v_Trained_Performance_200,Hybrid_DCA_ProSST_Trained_Performance_200,Hybrid_DCA_Trained_Performance_1000,Hybrid_DCA_ESM1v_Trained_Performance_1000,Hybrid_DCA_ProSST_Trained_Performance_1000,N_Y_test,N_Y_test_100,N_Y_test_200,N_Y_test_1000,Time_in_s
2+
1,A0A140D2T1_ZIKV_Sourisseau_2019,9576,1,0.3961484373234954,0.025321007234227765,nan,0.39625513159145775,0.3945199199006544,nan,0.3625266358769935,0.3796601052244235,nan,0.42589911431961464,0.44013401721412304,nan,9576,9476,9376,8576,3450
3+
2,A0A192B1T2_9HIV1_Haddox_2018,12577,1,0.5142308485827871,0.4818033871398063,0.4480413345511842,0.5146527813514364,0.5435129382224032,0.5924128662504711,0.5467805611967641,0.5628488925632386,0.6078673337402053,0.6135023014022313,0.6678319327395137,0.6655014702779698,12577,12477,12377,11577,9738
4+
3,A0A1I9GEU1_NEIME_Kennouche_2019,922,1,0.05411293279402172,0.07207428613031318,0.047303886240045084,0.04436729926547371,0.04427441555019729,0.08790597435423003,0.06540297172246635,0.0725659812731205,0.03614327174785718,nan,nan,nan,922,822,722,nan,256
5+
4,A0A247D711_LISMN_Stadelmann_2021,1653,1,0.4714160727397353,0.1922849713838631,0.4683286023339054,0.4842922488159595,0.47869162933481824,0.5869795493921092,0.531315014757049,0.5133715949185581,0.6219219229364262,0.6323348215864005,0.6248817967566451,0.6894997810047412,1653,1553,1453,653,526
6+
5,A0A2Z5U3Z0_9INFA_Doud_2016,10715,1,0.1391166205539817,0.48300176044919707,0.47710377008999405,0.13967082235859982,0.49375101649184666,0.49294188359007274,0.27802529906449985,0.5248994314331253,0.5642866911518036,0.5196491031680416,0.6504876227143324,0.6457733718778265,10715,10615,10515,9715,5328
7+
6,A0A2Z5U3Z0_9INFA_Wu_2014,2350,1,0.3648647777916376,0.43310916448256903,0.36612159772291447,0.3671848245849248,0.45217249523558617,0.4603122688694703,0.3689452266258967,0.48361188758319923,0.42902907236054805,0.3967658185607553,0.49496694985733,0.49448058180987625,2350,2250,2150,1350,2488
8+
7,A4D664_9INFA_Soh_2019,14421,1,-0.10916121986038664,0.04568083925978014,0.2959612431218879,-0.10951341634667816,0.16922806390981557,0.3449684112889143,-0.02816637449647003,0.2659695893301885,0.3762411589983513,0.4045659532993644,0.4649682439579048,0.5442379418082474,14421,14321,14221,13421,8969
9+
8,A4GRB6_PSEAI_Chen_2020,5004,1,0.6681056494435768,0.543247747835155,0.647351733217166,0.6105347017831526,0.6007582102931497,0.7414608515883568,0.7053638375627392,0.6894205591814954,0.7677593077079674,0.7245479703656822,0.8059450782290521,0.8107707930340956,5004,4904,4804,4004,1751
10+
9,AACC1_PSEAI_Dandage_2018,1801,1,0.3180712414525488,0.45793953382550573,0.36853292069676097,0.3174612456170627,0.45937287821213785,0.4310318712519802,0.3521756690003161,0.49206057023996924,0.4617532190070763,0.4488626249244256,0.5505813182399806,0.534569265039648,1801,1701,1601,801,869
11+
10,ACE2_HUMAN_Chan_2020,2223,1,0.24320754065919856,0.1855938942334426,0.2613054581997969,0.2985805551410494,0.24485718938989934,0.353286631689331,0.4023145866828806,0.3372015473315942,0.4700770240532049,0.5643356952550576,0.6012504479478733,0.610115781454495,2223,2123,2023,1223,3995

0 commit comments

Comments
 (0)