Skip to content

Commit 9de9979

Browse files
committed
ProtGym split techniques benchmarking:
skipping long computations (for now)
1 parent f658c94 commit 9de9979

File tree

4 files changed

+33
-11
lines changed

4 files changed

+33
-11
lines changed

pypef/utils/plot.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,18 @@ def plot_y_true_vs_y_pred(
2424
Plots predicted versus true values using the hybrid model for prediction.
2525
Function called by function predict_ps.
2626
"""
27-
_prec, _acc, _bacc, rec, _f1, _mcc, _auroc, _aps = get_binarized_classification_performances(y_true, y_pred)
27+
_prec, _acc, _bacc, rec, _f1, _mcc, _auroc, _aps = (
28+
get_binarized_classification_performances(y_true, y_pred)
29+
)
2830
if hybrid:
2931
spearman_rho = stats.spearmanr(y_true, y_pred)[0]
3032
# Recall: by default, the top 10 % fittest variants are labeled positive (1) and all others negative (0)
31-
plt.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, c=y_true, vmin=min(y_true), vmax=max(y_true),
32-
label=f'Spearman\'s ' + fr'$\rho$ = {spearman_rho:.3f}' + '\n'
33-
+ r'Recall$_\mathrm{top 10 \%}$' + f' = {rec:.3f}\n'
34-
+ fr'($N$ = {len(y_true)})'
33+
plt.scatter(
34+
y_true, y_pred,
35+
marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7,
36+
c=y_true, vmin=min(y_true), vmax=max(y_true),
37+
label=f'Spearman\'s ' + fr'$\rho$ = {spearman_rho:.3f}' + '\n' +
38+
r'Recall$_\mathrm{top 10 \%}$' + f' = {rec:.3f}\n' + fr'($N$ = {len(y_true)})'
3539
)
3640
if name != '':
3741
file_name = f'DCA_Hybrid_Model_Performance_{name}.png'

pypef/utils/split.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,10 +225,10 @@ def plot_distributions(self):
225225
pos_train, counts_train = self._get_distribution(train_indices[i_split])
226226
pos_test, counts_test = self._get_distribution(test_indices[i_split])
227227
axs[i_category + 1, i_split].plot(
228-
pos_train, counts_train, marker="o", linestyle="--", markersize=3
228+
pos_train, counts_train, marker="o", linestyle="--", markersize=3, linewidth=0.5
229229
)
230230
axs[i_category + 1, i_split].plot(
231-
pos_test, counts_test, marker="o", linestyle="--", markersize=3
231+
pos_test, counts_test, marker="o", linestyle="--", markersize=3, linewidth=0.5
232232
)
233233
xticks = list(axs[i_category + 1, i_split].get_xticks())
234234
xticks = xticks[1:-1]

scripts/ProteinGym_runs/protgym_hybrid_perf_test_crossval.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
4949
print(f"Using {device.upper()} device")
5050
get_vram()
5151
MAX_WT_SEQUENCE_LENGTH = 600 # TODO: 1000
52+
MAX_VARIANT_FITNESS_PAIRS = 5000
5253
print(f"Maximum sequence length: {MAX_WT_SEQUENCE_LENGTH}")
5354
print(f"Loading LLM models into {device} device...")
5455
prosst_base_model, prosst_lora_model, prosst_tokenizer, prosst_optimizer = get_prosst_models()
@@ -90,9 +91,6 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
9091
variants = variant_fitness_data['mutant'].to_numpy()
9192
variants_orig = variants
9293
fitnesses = variant_fitness_data['DMS_score'].to_numpy()
93-
if len(fitnesses) <= 50:
94-
print('Number of available variants <= 50, skipping dataset...')
95-
continue
9694
variants_split = []
9795
for variant in variants:
9896
# Split double and higher substituted variants to multiple single substitutions
@@ -108,6 +106,16 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
108106
n_muts.append(len(variant))
109107
max_muts = max(n_muts)
110108
print(f'N max. (multiple) amino acid substitutions: {max_muts}')
109+
if len(fitnesses) <= 50 or len(fitnesses) > MAX_VARIANT_FITNESS_PAIRS:
110+
print(f'Number of available variants <= 50 or > {MAX_VARIANT_FITNESS_PAIRS}'
111+
f', skipping dataset...')
112+
with open(out_results_csv, 'a') as fh:
113+
fh.write(
114+
f'{numbers_of_datasets[i]},{dset_key},{len(variants_orig)},'
115+
f'{max_muts},{len(fitnesses)} variant fitness pairs (below 50 '
116+
f'or more than {MAX_VARIANT_FITNESS_PAIRS})\n'
117+
)
118+
continue
111119
if len(wt_seq) > MAX_WT_SEQUENCE_LENGTH:
112120
print(f'Sequence length over {MAX_WT_SEQUENCE_LENGTH}, which represents '
113121
f'a potential out-of-memory risk (when running on GPU, set '
@@ -256,11 +264,13 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
256264
}
257265
print(f' Train: {len(np.array(y_train))} --> Test: {len(np.array(y_test))}')
258266
if len(y_test) <= 50:
259-
print(f"Only {len(fitnesses)} in total, splitting the data "
267+
print(f" Only {len(fitnesses)} in total, splitting the data "
260268
f"in N_Train = {len(y_train)} and N_Test = {len(y_test)} "
261269
f"results in N_Test <= 50 variants - not getting "
262270
f"performance for N_Train = {len(y_train)}...")
263271
ns_y_test.append(np.nan)
272+
for m in ['DCA', 'ESM1v', 'ProSST', 'DCA hybrid', 'DCA+ESM1v hybrid', 'DCA+ProSST hybrid']:
273+
temp_results[category][f'Split {i_split}'].update({m: np.nan})
264274
continue
265275
#get_vram()
266276

scripts/ProteinGym_runs/results/dca_esm_and_hybrid_5cv-split_results.csv

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@ No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performanc
33
2,A0A192B1T2_9HIV1_Haddox_2018,12577,1,Sequence too long (852 > 600)
44
3,A0A1I9GEU1_NEIME_Kennouche_2019,922,1,0.05411293279402172,0.07207428613031318,0.047303886240045084,-0.04928669127689938,0.15119300689514967,0.014181585895217707,0.10675325522953774,0.043964778384818604,-0.017976073598702094,0.08404269181132104,0.12975434595379667,0.09925376908046603,0.06199366414985664,-0.09147130789468667,0.12552340113042965,0.09395907217962027,0.1562129322617565,0.014009561561887141,-0.05531775683408093,0.06278558972986106,0.28829339290381795,0.027646360948805508,0.0193850078174259,-0.04586957271677053,-0.04895025707415656,0.23004664930215152,0.07042057546299449,0.031186236066322367,0.038899388766534745,0.08461281450276825,0.2551439141421933,0.04250949671362989,-0.0387374167037556,0.057358026137344156,0.11698325109303387,0.17259716582253684,0.0680145887295762,0.1243874968349091,0.10476854830526693,0.06520981142700823,0.20010998626599621,0.10464943537661343,0.14307809422038859,0.07602858013619787,0.012248872587475458,0.18681501267439019,0.03459509676131014,0.18820764992567163,5474
55
4,A0A247D711_LISMN_Stadelmann_2021,1653,1,0.3257171454789246,0.1922849713838631,0.4683286023339054,0.425359788416616,0.4482956731636781,0.4139800340667575,0.39983529943172424,0.37945477532026367,0.6057987058523887,0.5695236533689353,0.5925738842577352,0.5631997726773084,0.5443000968386646,0.7344860026161634,0.6469789054856282,0.6511735912398791,0.6509204673814494,0.6143648037931253,0.10026340966202295,-0.050045305385736163,0.04402104411640051,-0.04977025155826621,0.1765502664331152,0.23702472477615977,0.2975087769175714,0.2425237153399762,0.36643939888658494,0.5418330566398947,0.500174535886334,0.4155296674266911,0.4742642255516514,0.42645668434508316,0.4912508608833821,0.33106297440345095,0.26049143171005745,-0.019370918896295188,0.014566511138794955,-0.15139881830894192,0.272742572212445,0.3194740996965452,0.3198733018784645,0.0954217237757609,0.20087384064978656,0.3190475140511522,0.69959970846981,0.5450796656559668,0.22032845683404303,0.33793774719352276,6466
6+
5,A0A2Z5U3Z0_9INFA_Doud_2016,10715,1,10715 variant fitness pairs (below 50 or more than 600)
7+
6,A0A2Z5U3Z0_9INFA_Wu_2014,2350,1,0.3336227599994097,0.43310916448256903,0.36612159772291447,0.3290070513276686,0.38641724656845045,0.3877587482428224,0.41259160702222375,0.33865741941032906,0.4772153619929998,0.5333046105910981,0.5011429241697434,0.5163489267928253,0.48457438985760704,0.44901503750458444,0.4916582660675098,0.4720937973653675,0.5373467951157461,0.4918593046967948,0.3112556987104785,0.27215471998623963,0.270029398489026,0.3574445293575238,0.2987785251877998,0.49142096917262035,0.4689766660462643,0.4365502665086605,0.4047390241271481,0.5207911906059824,0.43691050484875604,0.4239995733977735,0.4345264132302189,0.46001571234122884,0.4184343876283869,0.36237176276271615,0.37414189450660923,0.38745825439990883,0.326142661297214,0.04044084141013186,0.4935874210359441,0.44634025417659273,0.43629568912104005,0.45680116777782936,0.3373794617092947,0.40236730723537206,0.4302566938215456,0.5016980869955363,0.4258165702796568,0.29220831679026527,38243
8+
7,A4D664_9INFA_Soh_2019,14421,1,14421 variant fitness pairs (below 50 or more than 600)
9+
8,A4GRB6_PSEAI_Chen_2020,5004,1,5004 variant fitness pairs (below 50 or more than 600)
10+
9,AACC1_PSEAI_Dandage_2018,1801,1,0.23878540848477645,0.45793953382550573,0.36853292069676097,0.21468316486429614,0.34391465535278387,0.22100993695919746,0.3238861825326002,0.25719586574800135,0.5674765206123771,0.4895220384930953,0.5475734123462886,0.5877195554492447,0.5438753385442788,0.5797424281844477,0.5128714984940727,0.3798390934086425,0.445900559932304,0.5855706191148594,-0.045692892702990553,0.024998235829937304,0.23874220476296826,0.10703716354704361,0.16253107458984928,0.42846694732051427,0.4395653528089788,0.4430185802815563,0.5257232211281666,0.3388300804007799,0.0013612470133120946,0.44389471089187793,0.436567864624026,0.29230972769555674,0.3455461727970937,0.029286617427829165,0.22444406563905325,0.5654896683788994,-0.006946366995155716,-0.14050952730983762,0.27465976256453783,0.17499619347307516,0.45793053065945016,0.5180128405511337,0.40826837886452405,0.1337599250912491,0.4570158520699047,0.47516698499399346,0.4800767699420239,0.15322516202675934,10438
11+
10,ACE2_HUMAN_Chan_2020,2223,1,Sequence too long (805 > 600)
12+
11,ADRB2_HUMAN_Jones_2020,7800,1,7800 variant fitness pairs (below 50 or more than 600)
613
12,AICDA_HUMAN_Gajula_2014_3cycles,209,1,0.41950521618921593,0.4075489898558927,0.274172920796004,0.4234950859543545,0.5197827298232696,0.5175785162112894,0.37714710370192484,0.36980807913773095,0.5119618908372606,0.5819758565792303,0.39193914437137345,0.44881775281082104,0.54174105341698,0.5523479230940255,0.5383742675471141,0.3619334577549017,0.5218490269980818,0.5029704170989369,0.3408436903839512,0.31987305947716743,0.5589154208553807,0.4521327174006057,0.2997591865802987,0.34687341150685796,0.18822499565539788,0.5035837398794349,0.483422578448325,0.3379680821670497,0.3411725842633825,0.20693806208392873,0.5331290701200798,0.3521994952102276,0.2913291437144539,0.12079710278373396,0.1668764022851235,0.08830287777656197,0.2659949345972696,0.7694141813746219,0.278352893833916,-0.033156426158945845,0.3086770981507823,0.1856925099433163,0.7010837094589468,0.3650215678472993,0.409148110258411,0.21041689462742094,0.3641545150746804,0.4213111058413938,1624
714
13,AMIE_PSEAE_Wrenbeck_2017,6227,1,0.4865220422275132,0.617273943143269,0.48233745058184774,0.6247556364864019,0.5637852711251141,0.6540784859982384,0.6341444679334964,0.6197896872164523,0.7958407070023474,0.7885790075634426,0.8131564517490826,0.8051715141591055,0.8062469765412855,0.7818339428067079,0.695958728332422,0.787700554067909,0.7622956253703456,0.7658722339243652,0.43860536134608286,0.2557034176712655,0.394611746066854,0.47660382328577866,0.4834463395388783,0.6904720593579283,0.6563703356818152,0.6406390703027633,0.7050062806610575,0.7148009219870343,0.5921369335347038,0.5559248777884879,0.5964994560266164,0.5640208292157121,0.5940987459606644,0.326936290048655,0.30714604520783967,0.4021949403637349,0.2927378676795793,0.06931987542437028,0.6355811966573056,0.6492092180261322,0.5757335774899451,0.5694597657892678,0.22771513389394565,0.56699032950344,0.5909816291276658,0.45215344827145243,0.5795435678137627,0.34215819397838965,60630
815
14,ARGR_ECOLI_Tsuboyama_2023_1AOY,1287,1,0.366064086686212,0.3625682795745455,0.6029023718184582,0.5988746767803351,0.46984654078265037,0.5857284754480594,0.5310816129116218,0.4778592632644194,0.7849960816888388,0.7862343385825422,0.7852178044101772,0.8359331423156259,0.8061735189657456,0.8943522293445376,0.8790074379613787,0.8356172518324132,0.8856077787078092,0.8643095081893041,0.39724437532153123,0.5446488028034597,0.4750189203947585,0.45471122497191896,0.310318171224561,0.6613641315451368,0.7752740615150877,0.7904226574806784,0.807901884060273,0.7863140897623656,0.8109382273695493,0.7995610211612624,0.8853319278173424,0.8540341039139389,0.828082572910159,0.1880208372315011,0.5540865740355319,0.22845056692482024,-0.044527449894949396,-0.12431881367962351,0.5225792631888516,0.7739677915233598,0.7266454242453707,0.3855376347592431,0.42134233866311016,0.5635257838657134,0.8480307823833785,0.8687231115737192,0.7042320761238904,0.8343639966177417,4557
16+
15,B2L11_HUMAN_Dutta_2010_binding-Mcl-1,170,1,-0.09215236134822696,0.37716061821801294,0.2737196279944554,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,0.7022273445034504,nan,nan,nan,nan,0.39511312217194566,nan,nan,nan,nan,0.7000904977375565,nan,nan,0.12734758402045815,nan,nan,nan,nan,0.2639819004524887,nan,nan,nan,nan,0.630316742081448,nan,nan,nan,163

0 commit comments

Comments
 (0)