Skip to content

Commit 167ce2d

Browse files
authored
Merge branch 'PyProphet:master' into master
2 parents a0ebf12 + 7cec084 commit 167ce2d

26 files changed

+1524
-971
lines changed

pyprophet/data_handling.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def prepare_data_table(table,
143143
decoy_name="decoy",
144144
main_score_name=None,
145145
score_columns=None,
146+
level=None
146147
):
147148
N = len(table)
148149
if not N:
@@ -219,7 +220,7 @@ def prepare_data_table(table,
219220

220221
tg_ids = table[tg_id_name]
221222

222-
if not check_for_unique_blocks(tg_ids):
223+
if not check_for_unique_blocks(tg_ids) and level != 'alignment':
223224
raise click.ClickException("" + tg_id_name + " values do not form unique blocks in input file(s).")
224225

225226
tg_map = dict()
@@ -285,6 +286,8 @@ def update_chosen_main_score_in_table(train, score_columns, use_as_main_score):
285286
train.df.insert(5, temp_col.name, temp_col)
286287
click.echo(f"Info: Updated main score column from {old_main_score_column} to {use_as_main_score}...")
287288
return train, tuple(updated_score_columns)
289+
290+
288291
class Experiment(object):
289292

290293
@profile

pyprophet/export_parquet.py

Lines changed: 52 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pandas as pd
44
from pyprophet.export import check_sqlite_table
55
from duckdb_extensions import extension_importer
6+
import re
67

78
def getPeptideProteinScoreTable(conndb, level):
89
if level == 'peptide':
@@ -31,7 +32,7 @@ def getVarColumnNames(condb, tableName):
3132

3233

3334
# NOTE: this method is currently supported only for combined output, not with IPF
34-
def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
35+
def export_to_parquet(infile, outfile, transitionLevel=False, onlyFeatures=False, noDecoys=False):
3536
'''
3637
Convert an OSW sqlite file to Parquet format
3738
@@ -66,6 +67,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
6667
CREATE INDEX IF NOT EXISTS idx_protein_protein_id ON PROTEIN (ID);
6768
CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
6869
70+
CREATE INDEX IF NOT EXISTS idx_transition_id ON TRANSITION (ID);
71+
CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_transition_id ON TRANSITION_PRECURSOR_MAPPING (TRANSITION_ID);
72+
CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_precursor_id ON TRANSITION_PRECURSOR_MAPPING (PRECURSOR_ID);
6973
'''
7074

7175
if check_sqlite_table(con, "FEATURE_MS1"):
@@ -200,19 +204,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
200204

201205
# create a list of all the columns
202206
columns_list = [col for c in columns.values() for col in c]
207+
208+
# create a list of the column expressions (the part of each entry before ' AS') for the GROUP BY clause
209+
pattern = re.compile(r"(.*)\sAS")
210+
alias_list = [ pattern.search(col).group(1) for c in columns.values() for col in c]
203211

204212
# join the list into a single string separated by a comma and a space
205213
columnsToSelect = ", ".join(columns_list)
206-
207-
join_features = "LEFT JOIN" if onlyFeatures else "FULL JOIN"
208-
209-
# First read feature data
210-
# Feature Data
211-
if not transitionLevel:
212-
feature_query = f'''
213-
SELECT {columnsToSelect}
214-
FROM FEATURE
215-
{join_features} PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
214+
aliasToSelect = ", ".join(alias_list)
215+
216+
# At feature level (not transitionLevel), concatenate the transition IDs and annotations of each group into single ';'-separated values
217+
featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else ""
218+
featureLvlSuffix = f'GROUP BY {aliasToSelect}' if not transitionLevel else ""
219+
220+
decoyExclude = "WHERE PRECURSOR.DECOY == 0" if noDecoys else ""
221+
222+
if not onlyFeatures:
223+
query = f'''
224+
SELECT {columnsToSelect},
225+
{featureLvlPrefix}
226+
FROM TRANSITION_PRECURSOR_MAPPING
227+
LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
228+
LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
229+
LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
230+
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
216231
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
217232
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
218233
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
@@ -224,48 +239,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
224239
{gene_table_joins}
225240
{pepJoin}
226241
{protJoin}
242+
{decoyExclude}
243+
{featureLvlSuffix}
227244
'''
228-
else: # is transition level
229-
230-
# merge transition and precursor level data
231-
if not onlyFeatures:
232-
feature_query = f'''
233-
SELECT {columnsToSelect}
234-
FROM TRANSITION_PRECURSOR_MAPPING
235-
LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
236-
LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
237-
LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
238-
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
239-
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
240-
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
241-
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
242-
LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
243-
LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
244-
LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
245-
LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
246-
LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
247-
{gene_table_joins}
248-
{pepJoin}
249-
{protJoin}
250-
'''
251-
else:
252-
feature_query = f'''
253-
SELECT {columnsToSelect}
254-
FROM FEATURE_TRANSITION
255-
LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
256-
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
257-
LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
258-
LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
259-
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
260-
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
261-
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
262-
LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
263-
LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
264-
LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
265-
LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
266-
LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
267-
{gene_table_joins}
268-
{pepJoin}
269-
{protJoin}
270-
'''
271-
condb.sql(feature_query).write_parquet(outfile)
245+
else:
246+
query = f'''
247+
SELECT {columnsToSelect},
248+
{featureLvlPrefix}
249+
FROM FEATURE_TRANSITION
250+
LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
251+
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
252+
LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
253+
LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
254+
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
255+
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
256+
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
257+
LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
258+
LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
259+
LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
260+
LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
261+
LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
262+
{gene_table_joins}
263+
{pepJoin}
264+
{protJoin}
265+
{decoyExclude}
266+
{featureLvlSuffix}
267+
'''
268+
condb.sql(query).write_parquet(outfile)

0 commit comments

Comments
 (0)