Skip to content

Commit 5917357

Browse files
authored
Merge pull request #6 from niklases/dev
Dev
2 parents c926041 + 3e9ddbc commit 5917357

File tree

17 files changed

+503
-115
lines changed

17 files changed

+503
-115
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,6 @@ jobs:
3434
run: |
3535
# stop the build if there are Python syntax errors or undefined names
3636
flake8 ./pypef --count --select=E9,F63,F7,F82 --show-source --statistics
37-
- name: Export Pythonpath and run CLI PyPEF version test with pytest
37+
- name: Export Pythonpath and run PyPEF API and CLI version test with pytest
3838
run: |
39-
export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest tests/cli/
39+
export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest tests/

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,3 +384,7 @@ scripts/Setup/linux/AVGFP/uref100_avgfp_jhmmer_119.a2m
384384
scripts/Setup/linux/AVGFP/avGFP.csv
385385
scripts/Setup/linux/api_encoding_train_test.py
386386
scripts/Setup/linux/apc.png
387+
datasets/ANEH/KARS160122_PLS_LOOCV_ML_Model_Performance.png
388+
datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_5-fold-CV.png
389+
datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_CV_Results.txt
390+
datasets/AVGFP/Predictions_Hybrid_TopTS.txt

.vscode/launch.json

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": "Python: PyPEF Help",
9+
"type": "python",
10+
"request": "launch",
11+
"env": {"PYTHONPATH": "${workspaceFolder}"},
12+
"program": "${workspaceFolder}/pypef/main.py",
13+
"console": "integratedTerminal",
14+
"justMyCode": true,
15+
"args": [
16+
"--help"
17+
]
18+
},
19+
20+
{
21+
"name": "Python: PyPEF MKLSTS ANEH",
22+
"type": "python",
23+
"request": "launch",
24+
"env": {"PYTHONPATH": "${workspaceFolder}"},
25+
"program": "${workspaceFolder}/pypef/main.py",
26+
"console": "integratedTerminal",
27+
"justMyCode": true,
28+
"cwd": "${workspaceFolder}/datasets/ANEH/",
29+
"args": [
30+
"mklsts",
31+
"--wt", "${workspaceFolder}/datasets/ANEH/Sequence_WT_ANEH.fasta",
32+
"--input", "${workspaceFolder}/datasets/ANEH/37_ANEH_variants.csv"
33+
]
34+
},
35+
36+
{
37+
"name": "Python: PyPEF MKLSTS avGFP",
38+
"type": "python",
39+
"request": "launch",
40+
"env": {"PYTHONPATH": "${workspaceFolder}"},
41+
"program": "${workspaceFolder}/pypef/main.py",
42+
"console": "integratedTerminal",
43+
"justMyCode": true,
44+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
45+
"args": [
46+
"mklsts",
47+
"--wt", "P42212_F64L.fasta",
48+
"--input", "avGFP.csv"
49+
]
50+
},
51+
52+
{
53+
"name": "Python: PyPEF ml -e onehot pls_loocv",
54+
"type": "python",
55+
"request": "launch",
56+
"env": {"PYTHONPATH": "${workspaceFolder}"},
57+
"program": "${workspaceFolder}/pypef/main.py",
58+
"console": "integratedTerminal",
59+
"justMyCode": true,
60+
"cwd": "${workspaceFolder}/datasets/ANEH",
61+
"args": [
62+
"ml",
63+
"-e", "onehot",
64+
"-l", "LS.fasl",
65+
"-t", "TS.fasl",
66+
"--regressor", "pls_loocv"
67+
]
68+
},
69+
70+
{ // GREMLIN zero-shot steps:
71+
// 1. $pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m --opt_iter 100
72+
// 2. $pypef hybrid -t TS.fasl --params GREMLIN
73+
// or
74+
// 2. $pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN
75+
"name": "Python: PyPEF save GREMLIN avGFP model",
76+
"type": "python",
77+
"request": "launch",
78+
"env": {"PYTHONPATH": "${workspaceFolder}"},
79+
"program": "${workspaceFolder}/pypef/main.py",
80+
"console": "integratedTerminal",
81+
"justMyCode": true,
82+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
83+
"args": [
84+
"param_inference",
85+
"--msa", "uref100_avgfp_jhmmer_119.a2m",
86+
"--opt_iter", "100"
87+
]
88+
},
89+
90+
{
91+
"name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA avGFP",
92+
"type": "python",
93+
"request": "launch",
94+
"env": {"PYTHONPATH": "${workspaceFolder}"},
95+
"program": "${workspaceFolder}/pypef/main.py",
96+
"console": "integratedTerminal",
97+
"justMyCode": true,
98+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
99+
"args": [
100+
"hybrid",
101+
//"-m", "GREMLIN", // optional, not required
102+
"--ts", "TS.fasl",
103+
"--params", "GREMLIN"
104+
]
105+
},
106+
107+
{
108+
"name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP",
109+
"type": "python",
110+
"request": "launch",
111+
"env": {"PYTHONPATH": "${workspaceFolder}"},
112+
"program": "${workspaceFolder}/pypef/main.py",
113+
"console": "integratedTerminal",
114+
"justMyCode": true,
115+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
116+
"args": [
117+
"hybrid",
118+
//"-m", "GREMLIN", // optional, not required
119+
"--ps", "TS.fasl",
120+
"--params", "GREMLIN"
121+
]
122+
},
123+
124+
{ // PLMC zero-shot steps:
125+
// 1. $pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
126+
// 2. $pypef hybrid -t TS.fasl --params PLMC
127+
"name": "Python: PyPEF save PLMC avGFP model",
128+
"type": "python",
129+
"request": "launch",
130+
"env": {"PYTHONPATH": "${workspaceFolder}"},
131+
"program": "${workspaceFolder}/pypef/main.py",
132+
"console": "integratedTerminal",
133+
"justMyCode": true,
134+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
135+
"args": [
136+
"param_inference",
137+
"--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params"
138+
]
139+
},
140+
141+
{
142+
"name": "Python: PyPEF hybrid/only-TS-zero-shot PLMC-DCA avGFP",
143+
"type": "python",
144+
"request": "launch",
145+
"env": {"PYTHONPATH": "${workspaceFolder}"},
146+
"program": "${workspaceFolder}/pypef/main.py",
147+
"console": "integratedTerminal",
148+
"justMyCode": true,
149+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
150+
"args": [
151+
"hybrid",
152+
"--ts", "TS.fasl",
153+
"--params", "PLMC",
154+
"--threads", "24"
155+
]
156+
},
157+
158+
{
159+
"name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA avGFP",
160+
"type": "python",
161+
"request": "launch",
162+
"env": {"PYTHONPATH": "${workspaceFolder}"},
163+
"program": "${workspaceFolder}/pypef/main.py",
164+
"console": "integratedTerminal",
165+
"justMyCode": true,
166+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
167+
"args": [
168+
"hybrid",
169+
//"-m", "PLMC", // optional, not required
170+
"--ps", "TS.fasl",
171+
"--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params",
172+
"--threads", "24"
173+
]
174+
},
175+
176+
{
177+
"name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA variant 2 avGFP",
178+
"type": "python",
179+
"request": "launch",
180+
"env": {"PYTHONPATH": "${workspaceFolder}"},
181+
"program": "${workspaceFolder}/pypef/main.py",
182+
"console": "integratedTerminal",
183+
"justMyCode": true,
184+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
185+
"args": [
186+
"hybrid",
187+
//"-m", "PLMC", // optional, not required
188+
"--ps", "TS.fasl",
189+
"--params", "PLMC",
190+
"--threads", "24"
191+
]
192+
},
193+
194+
{
195+
"name": "Python: PyPEF !wrong! MSA input format (STO)",
196+
"type": "python",
197+
"request": "launch",
198+
"env": {"PYTHONPATH": "${workspaceFolder}"},
199+
"program": "${workspaceFolder}/pypef/main.py",
200+
"console": "integratedTerminal",
201+
"justMyCode": true,
202+
"cwd": "${workspaceFolder}/datasets/AVGFP/",
203+
"args": [
204+
"param_inference",
205+
"--msa", "uref100_avgfp_jhmmer_119.sto"
206+
]
207+
}
208+
]
209+
}

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"markdown.extension.toc.updateOnSave": false
3+
}

README.md

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,22 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081.
1717
<sup>*§*</sup><sub>Equal contribution</sub> <br>
1818

1919
---
20+
2021
## Table of Contents
21-
- [PyPEF](#pypef)
22-
- [Installation](#installation)
23-
- [Requirements](#requirements)
24-
- [Running Examples](#examples)
25-
- [Tutorial](#tutorial)
26-
- [Encoding Technique Options](#encoding-options)
27-
- [Modeling Techniques](#modeling-techniques)
28-
- [Pure Machine Learning (ML)-based Modeling](#pure-ml)
29-
- [Hybrid Modeling](#hybrid-modeling)
30-
- [Model Hyperparameter Grids for Training](#grids)
31-
- [Setting Up the Scripts Yourself](#set-up)
32-
- [Preprocessing for DCA-based Sequence Encoding](#dca-preprocessing)
33-
- [API Usage for Sequence Encoding](#api-usage)
22+
[PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework)
23+
- [Quick Installation](#quick-installation)
24+
- [Requirements](#requirements)
25+
- [Running Examples](#running-examples)
26+
- [Tutorial](#tutorial)
27+
- [Encoding Technique Options](#encoding-technique-options)
28+
- [Modeling Techniques](#modeling-techniques)
29+
- [Pure Machine Learning (ML)-based Modeling](#pure-machine-learning-ml-based-modeling)
30+
- [Hybrid Modeling Using the MERGE Method](#hybrid-modeling-using-the-merge-method)
31+
- [Model Hyperparameter Grids for Training](#model-hyperparameter-grids-for-training)
32+
- [Setting Up the Scripts Yourself](#setting-up-the-scripts-yourself)
33+
- [Preprocessing for DCA-based Sequence Encoding](#preprocessing-for-dca-based-sequence-encoding)
34+
- [Unsupervised/zero-shot prediction](#unsupervisedzero-shot-prediction)
35+
- [API Usage for Sequence Encoding](#api-usage-for-sequence-encoding)
3436
---
3537

3638
<a name="pypef"></a>
@@ -278,7 +280,7 @@ Copy the notebook URL in your internet browser and select the Workflow_PyPEF.ipy
278280
## Modeling Techniques
279281
<a name="pure-ml"></a>
280282
### Pure Machine Learning (ML)-based Modeling
281-
Serveral linear and non-linear modeling options are available by default to construct supervised regression models based on the generated sequence features, i.e. encoded sequences.
283+
Several linear and non-linear modeling options are available by default to construct supervised regression models based on the generated sequence features, i.e. encoded sequences.
282284
Regression models are trained, i.e. model hyperparameters are optimized, by *k*-fold (by default, fivefold) cross-validation on training samples. Here, the model aims to map the encoded variant sequences that are the features (***X***) for predicting the corresponding fitness labels (***y***) such that *f(***X***)* --> ***y*** – while cross-validation and/or using a model implementing a penalty will be necessary for better model generalization behavior.
283285
Following regression options from [Scikit-learn](https://scikit-learn.org/stable/) are implemented (for optimized hyperparameters, see Model Hyperparameters section below):
284286
- [Partial Least Squares Regression (linear model)](https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html)
@@ -379,32 +381,74 @@ python3 ./pypef/main.py
379381
5. Now you can follow approaches 5.1 (using GREMLIN; implemented in TensorFlow) or 5.2 (using plmc; external parameter generation in C).
380382

381383
5.1. Running GREMLIN on the generated MSA (in FASTA or A2M format):
384+
382385
```
383386
pypef param_inference --msa ANEH_jhmmer.a2m -w WT_SEQUENCE.FASTA --opt_iter 250
384387
```
388+
385389
The pickled GREMLIN file can then be used for encoding new/test sequences:
390+
386391
```
387392
pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN
388393
```
394+
389395
Or for hybrid modeling:
396+
390397
```
391398
pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN
392399
```
393400
394401
5.2 After [installing plmc](https://github.com/debbiemarkslab/plmc#compilation), generate the evolutionary coupling file, which is used for encoding sequences. For example, set `-le` to the value output by `sto2a2m`:
402+
395403
```
396404
plmc -o ANEH_72.6.params -le 72.6 -m 100 -g -f WT_ANEH ANEH_jhmmer.a2m
397405
```
398406
399407
The output parameter (.params) file can be used for encoding sequences with the DCA-based encoding technique (`-e dca`) by providing it to PyPEF; e.g. for pure ML modeling:
408+
400409
```
401410
pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params
402411
```
412+
403413
Or for hybrid modeling:
414+
404415
```
405416
pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params
406417
```
407418
419+
<a name="zero-shot-prediction"></a>
420+
## Unsupervised/zero-shot prediction
421+
Several developed methods allow unsupervised prediction of a protein's fitness based on its sequence (and/or structure).
422+
These methods have the advantage that no initial knowledge about a protein's fitness is required for prediction, while a correlation of the predicted score and a protein's natural fitness is assumed.
423+
DCA itself is a statistical/unsupervised method based on MSA information that outperforms simpler MSA-based methods (such as (un)coupled raw MSA sequence frequencies or BLOSUM scores), e.g., see [scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb](scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb).
424+
To make zero-shot predictions using PyPEF (plmc-DCA or GREMLIN-DCA), simply do not provide a training set for hybrid modeling (no `-l` flag, only a `-t` or `-p` flag); e.g., for the avGFP data, try
425+
426+
```
427+
pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m
428+
pypef hybrid -t TS.fasl --params GREMLIN
429+
pypef hybrid -p PS.fasta --params GREMLIN
430+
```
431+
432+
using the GREMLIN parameters, or,
433+
434+
```
435+
pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
436+
pypef hybrid -t TS.fasl --params PLMC
437+
pypef hybrid -p PS.fasta --params PLMC
438+
```
439+
440+
using the plmc parameters.
441+
442+
Other well-performing zero-shot prediction methods with available source code are:
443+
444+
- ESM-1v/ESM-2 (https://github.com/facebookresearch/esm)
445+
- DeepSequence (https://github.com/debbiemarkslab/DeepSequence)
446+
- EVcouplings (plmc-DCA, https://github.com/debbiemarkslab/EVcouplings)
447+
- EVE (https://github.com/OATML/EVE)
448+
- Tranception (https://github.com/OATML-Markslab/Tranception)
449+
450+
This list is by no means complete; see the ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances (as well as for many benchmark data sets).
451+
408452
<a name="api-usage"></a>
409453
## API Usage for Sequence Encoding
410454
For script-based encoding of sequences using PyPEF and the available AAindex-, OneHot- or DCA-based techniques, the classes and corresponding functions can be imported, i.e. `OneHotEncoding`, `AAIndexEncoding`, `GREMLIN` (DCA), `PLMC` (DCA), and `DCAHybridModel`. In addition, implemented functions for CV-based tuning of regression models can be used to train and validate models, eventually deriving them to obtain performances on retained data for testing. An exemplary script and a Jupyter notebook for CV-based (low-*N*) tuning of models and using them for testing is provided at [scripts/Encoding_low_N/api_encoding_train_test.py](scripts/Encoding_low_N/api_encoding_train_test.py) and [scripts/Encoding_low_N/api_encoding_train_test.ipynb](scripts/Encoding_low_N/api_encoding_train_test.ipynb), respectively.

pypef/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@
1717
# §Equal contribution
1818

1919

20-
__version__ = '0.3.2-alpha'
20+
__version__ = '0.3.3-alpha'

pypef/dca/dca_run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def run_pypef_hybrid_modeling(arguments):
6161
label=arguments['--label']
6262
)
6363

64-
elif arguments['--params'] and arguments['--model']:
64+
elif arguments['--params'] and arguments['--model'] or arguments['--ps']:
6565
prediction_dict = {}
6666
prediction_dict.update({
6767
'drecomb': arguments['--drecomb'],

0 commit comments

Comments
 (0)