Skip to content

Commit b0ec680

Browse files
committed
New core functions, model ops, improved calibration, more repo cleanup
1 parent f1947eb commit b0ec680

34 files changed

+612
-64756
lines changed

MANIFEST.in

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
include deeplc/mods/*
1+
include deeplc/models/*
22
include deeplc/package_data/**/*
33
include deeplc/unimod/*
4-
include deeplc/aa_comp_rel.csv
54
include deeplc/baseline_performance/*

TODO.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# DeepLC 4.0 to do list
2+
3+
## Alpha 1 release
4+
5+
[x] scikit-learn-like API for calibration
6+
[x] Streamlined use of Data and DataLoader
7+
[x] Module with PyTorch-level model operations (train, predict, load, save)
8+
[x] Refactor core functions to use new model operations module
9+
10+
## Alpha 2 release
11+
12+
[ ] Add architecture module for training new models
13+
[ ] Select calibration/fine-tuning PSMs from the main psm_list by score/q-value (best-scoring PSMs first)?
14+
[ ] Add CLI commands with file I/O
15+
16+
## Beta release
17+
18+
[ ] Ensure mapping of MaxQuant modifications
19+
[ ] Update README
20+
[ ] Update documentation to reflect new structure
21+
[ ] Update examples to use new structure
22+
23+
## Stable release
24+
25+
[ ] Decent coverage of unit tests
26+
[ ] Update GUI (argparse is no longer used -> find an alternative to Gooey?)
27+
[ ] Update Streamlit app
28+
29+
## Open questions / issues
30+
31+
[ ] Should the library feature be reintroduced?
32+
[ ] Implementation into IM2Deep

config.ini

Lines changed: 0 additions & 32 deletions
This file was deleted.

deeplc/__init__.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
# __all__ = ["DeepLC"]
1+
from importlib.metadata import version
22

3-
# from importlib.metadata import version
3+
from deeplc.core import calibrate_and_predict, finetune_and_predict, predict
44

5-
# __version__ = version("deeplc")
6-
7-
8-
# from deeplc.deeplc import DeepLC
5+
__version__: str = version("deeplc")
6+
__all__: list[str] = [
7+
"predict",
8+
"calibrate_and_predict",
9+
"finetune_and_predict",
10+
]

deeplc/__main__.py

Lines changed: 2 additions & 203 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,12 @@
11
"""Main command line interface to DeepLC."""
22

3-
__author__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
4-
__credits__ = [
5-
"Robbin Bouwmeester",
6-
"Ralf Gabriels",
7-
"Prof. Lennart Martens",
8-
"Sven Degroeve",
9-
]
10-
__license__ = "Apache License, Version 2.0"
11-
__maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
12-
13-
143
import logging
15-
import os
164
import sys
17-
import warnings
18-
19-
import pandas as pd
20-
from psm_utils.io import read_file
21-
from psm_utils.io.peptide_record import peprec_to_proforma
22-
from psm_utils.psm import PSM
23-
from psm_utils.psm_list import PSMList
245

25-
from deeplc import DeepLC, __version__
26-
from deeplc._argument_parser import parse_arguments
27-
from deeplc._exceptions import DeepLCError
6+
LOGGER = logging.getLogger(__name__)
287

29-
logger = logging.getLogger(__name__)
308

31-
32-
def setup_logging(passed_level):
9+
def _setup_logging(passed_level):
3310
log_mapping = {
3411
"critical": logging.CRITICAL,
3512
"error": logging.ERROR,
@@ -51,181 +28,3 @@ def setup_logging(passed_level):
5128
datefmt="%Y-%m-%d %H:%M:%S",
5229
level=log_mapping[passed_level.lower()],
5330
)
54-
55-
56-
def main(gui=False):
57-
"""Main function for the CLI."""
58-
argu = parse_arguments(gui=gui)
59-
60-
setup_logging(argu.log_level)
61-
62-
# Reset logging levels if DEBUG (see deeplc.py)
63-
if argu.log_level.lower() == "debug":
64-
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
65-
logging.getLogger("tensorflow").setLevel(logging.DEBUG)
66-
warnings.filterwarnings("default", category=DeprecationWarning)
67-
warnings.filterwarnings("default", category=FutureWarning)
68-
warnings.filterwarnings("default", category=UserWarning)
69-
else:
70-
os.environ["KMP_WARNINGS"] = "0"
71-
72-
try:
73-
run(**vars(argu))
74-
except DeepLCError as e:
75-
logger.exception(e)
76-
sys.exit(1)
77-
78-
79-
def run(
80-
file_pred,
81-
file_cal=None,
82-
file_pred_out=None,
83-
file_model=None,
84-
pygam_calibration=True,
85-
split_cal=50,
86-
dict_divider=50,
87-
use_library=None,
88-
write_library=False,
89-
batch_num=50000,
90-
n_threads=None,
91-
transfer_learning=False,
92-
log_level="info",
93-
verbose=True,
94-
):
95-
"""Run DeepLC."""
96-
logger.info("Using DeepLC version %s", __version__)
97-
logger.debug("Using %i CPU threads", n_threads)
98-
99-
df_pred = False
100-
df_cal = False
101-
first_line_pred = ""
102-
first_line_cal = ""
103-
104-
if not file_cal and file_model != None:
105-
fm_dict = {}
106-
sel_group = ""
107-
for fm in file_model:
108-
if len(sel_group) == 0:
109-
sel_group = "_".join(fm.split("_")[:-1])
110-
fm_dict[sel_group] = fm
111-
continue
112-
m_group = "_".join(fm.split("_")[:-1])
113-
if m_group == sel_group:
114-
fm_dict[m_group] = fm
115-
file_model = fm_dict
116-
117-
with open(file_pred) as f:
118-
first_line_pred = f.readline().strip()
119-
if file_cal:
120-
with open(file_cal) as f:
121-
first_line_cal = f.readline().strip()
122-
123-
if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","):
124-
# Read input files
125-
df_pred = pd.read_csv(file_pred)
126-
if len(df_pred.columns) < 2:
127-
df_pred = pd.read_csv(file_pred, sep=" ")
128-
df_pred = df_pred.fillna("")
129-
file_pred = ""
130-
131-
list_of_psms = []
132-
for seq, mod, ident in zip(df_pred["seq"], df_pred["modifications"], df_pred.index):
133-
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident))
134-
psm_list_pred = PSMList(psm_list=list_of_psms)
135-
df_pred = None
136-
else:
137-
psm_list_pred = read_file(file_pred)
138-
if "msms" in file_pred and ".txt" in file_pred:
139-
mapper = pd.read_csv(
140-
os.path.join(
141-
os.path.dirname(os.path.realpath(__file__)),
142-
"unimod/map_mq_file.csv",
143-
),
144-
index_col=0,
145-
)["value"].to_dict()
146-
psm_list_pred.rename_modifications(mapper)
147-
148-
# Allow for calibration file to be empty (undefined), fill in if/elif if present
149-
psm_list_cal = []
150-
if (
151-
"modifications" in first_line_cal.split(",")
152-
and "seq" in first_line_cal.split(",")
153-
and file_cal
154-
):
155-
df_cal = pd.read_csv(file_cal)
156-
if len(df_cal.columns) < 2:
157-
df_cal = pd.read_csv(df_cal, sep=" ")
158-
df_cal = df_cal.fillna("")
159-
file_cal = ""
160-
161-
list_of_psms = []
162-
for seq, mod, ident, tr in zip(
163-
df_cal["seq"], df_cal["modifications"], df_cal.index, df_cal["tr"]
164-
):
165-
list_of_psms.append(
166-
PSM(
167-
peptidoform=peprec_to_proforma(seq, mod),
168-
spectrum_id=ident,
169-
retention_time=tr,
170-
)
171-
)
172-
psm_list_cal = PSMList(psm_list=list_of_psms)
173-
df_cal = None
174-
elif file_cal:
175-
psm_list_cal = read_file(file_cal)
176-
if "msms" in file_cal and ".txt" in file_cal:
177-
mapper = pd.read_csv(
178-
os.path.join(
179-
os.path.dirname(os.path.realpath(__file__)),
180-
"unimod/map_mq_file.csv",
181-
),
182-
index_col=0,
183-
)["value"].to_dict()
184-
psm_list_cal.rename_modifications(mapper)
185-
186-
# Make the DeepLC object that will handle making predictions and calibration
187-
dlc = DeepLC(
188-
path_model=file_model,
189-
cnn_model=True,
190-
split_cal=split_cal,
191-
dict_cal_divider=dict_divider,
192-
write_library=write_library,
193-
use_library=use_library,
194-
batch_num=batch_num,
195-
n_jobs=n_threads,
196-
verbose=verbose,
197-
deeplc_retrain=transfer_learning,
198-
)
199-
200-
# Calibrate the original model based on the new retention times
201-
if len(psm_list_cal) > 0:
202-
logger.info("Selecting best model and calibrating predictions...")
203-
logger.info("Initiating transfer learning?")
204-
dlc.calibrate_preds(psm_list=psm_list_cal)
205-
206-
# Make predictions; calibrated or uncalibrated
207-
logger.info("Making predictions using model: %s", dlc.model)
208-
if len(psm_list_cal) > 0:
209-
preds = dlc._make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
210-
else:
211-
preds = dlc._make_preds(
212-
seq_df=df_pred,
213-
infile=file_pred,
214-
psm_list=psm_list_pred,
215-
calibrate=False,
216-
)
217-
218-
# df_pred["predicted_tr"] = preds
219-
logger.info("Writing predictions to file: %s", file_pred_out)
220-
221-
file_pred_out = open(file_pred_out, "w")
222-
file_pred_out.write("Sequence proforma,predicted retention time\n")
223-
for psm, tr in zip(psm_list_pred, preds):
224-
file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n")
225-
file_pred_out.close()
226-
227-
logger.info("DeepLC finished!")
228-
229-
230-
if __name__ == "__main__":
231-
main()

deeplc/_architecture.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# TODO: Add architectures for training from scratch

0 commit comments

Comments
 (0)