Skip to content

Commit 57a6e5f

Browse files
authored
Merge pull request #130 from Roestlab/fix/parquet_export_global_stats
fix: export parquet peptide global level
2 parents fe296fd + 98e2bac commit 57a6e5f

File tree

1 file changed

+15
-11
lines changed

1 file changed

+15
-11
lines changed

pyprophet/export_parquet.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ def getPeptideProteinScoreTable(conndb, level):
1717
nonGlobal.columns = [ col.upper().replace('-', '') for col in nonGlobal.columns.map('_'.join)]
1818
nonGlobal= nonGlobal.reset_index()
1919

20-
glob = conndb.sql(f"select {id}, RUN_ID, SCORE, PVALUE, QVALUE, PEP from {score_table} where context == 'global'").df()
20+
glob = conndb.sql(f"select {id}, SCORE, PVALUE, QVALUE, PEP from {score_table} where context == 'global'").df()
2121
glob.columns = [ col.upper() + '_GLOBAL' if col != id else col for col in glob.columns ]
2222

23-
return nonGlobal.merge(glob, how='outer')
23+
return glob, nonGlobal
2424

2525
def getVarColumnNames(condb, tableName):
2626
'''
@@ -139,7 +139,7 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
139139
## features
140140
columns['FEATURE'] = ['EXP_RT', 'EXP_IM', 'NORM_RT', 'DELTA_RT', 'LEFT_WIDTH', 'RIGHT_WIDTH']
141141
columns['FEATURE_MS2'] = ['FEATURE_ID', 'AREA_INTENSITY', 'TOTAL_AREA_INTENSITY', 'APEX_INTENSITY', 'TOTAL_MI'] + getVarColumnNames(con, 'FEATURE_MS2')
142-
columns['FEATURE_MS1'] = ['APEX_INTENSITY', 'VAR_MASSDEV_SCORE'] + getVarColumnNames(con, 'FEATURE_MS1')
142+
columns['FEATURE_MS1'] = ['APEX_INTENSITY', 'AREA_INTENSITY'] + getVarColumnNames(con, 'FEATURE_MS1')
143143
if hasIm:
144144
imColumns = ['EXP_IM', 'DELTA_IM']
145145
columns['FEATURE_MS2'] = columns['FEATURE_MS2'] + imColumns
@@ -150,15 +150,19 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
150150

151151
# Check for Peptide/Protein scores Context Scores
152152
if check_sqlite_table(con, "SCORE_PEPTIDE"):
153-
pepTable = getPeptideProteinScoreTable(condb, "peptide")
154-
pepJoin = 'LEFT JOIN pepTable ON pepTable.PEPTIDE_ID = PEPTIDE.ID and pepTable.RUN_ID = RUN.ID'
155-
columns['pepTable'] = list(set(pepTable.columns).difference(set(['PEPTIDE_ID', 'RUN_ID']))) # all columns except PEPTIDE_ID and RUN_ID
153+
pepTable_global, pepTable_nonGlobal = getPeptideProteinScoreTable(condb, "peptide")
154+
pepJoin = '''LEFT JOIN pepTable_nonGlobal ON pepTable_nonGlobal.PEPTIDE_ID = PEPTIDE.ID and pepTable_nonGlobal.RUN_ID = RUN.ID
155+
LEFT JOIN pepTable_global ON pepTable_global.PEPTIDE_ID = PEPTIDE.ID'''
156+
columns['pepTable_nonGlobal'] = list(set(pepTable_nonGlobal.columns).difference(set(['PEPTIDE_ID', 'RUN_ID']))) # all columns except PEPTIDE_ID and RUN_ID
157+
columns['pepTable_global'] = list(set(pepTable_global.columns).difference(set(['PEPTIDE_ID']))) # all columns except PEPTIDE_ID (the global table has no RUN_ID column)
156158

157159

158160
if check_sqlite_table(con, "SCORE_PROTEIN"):
159-
protTable = getPeptideProteinScoreTable(condb, "protein")
160-
protJoin = 'LEFT JOIN protTable ON protTable.PROTEIN_ID = PROTEIN.ID and protTable.RUN_ID = RUN.ID'
161-
columns['protTable'] = list(set(protTable.columns).difference(set(['PROTEIN_ID', 'RUN_ID']))) # all columns except PEPTIDE_ID and RUN_ID
161+
protTable_global, protTable_nonGlobal = getPeptideProteinScoreTable(condb, "protein")
162+
protJoin = '''LEFT JOIN protTable_nonGlobal ON protTable_nonGlobal.PROTEIN_ID = PROTEIN.ID and protTable_nonGlobal.RUN_ID = RUN.ID
163+
LEFT JOIN protTable_global ON protTable_global.PROTEIN_ID = PROTEIN.ID'''
164+
columns['protTable_nonGlobal'] = list(set(protTable_nonGlobal.columns).difference(set(['PROTEIN_ID', 'RUN_ID']))) # all columns except PROTEIN_ID and RUN_ID
165+
columns['protTable_global'] = list(set(protTable_global.columns).difference(set(['PROTEIN_ID']))) # all columns except PROTEIN_ID
162166

163167
## other
164168
columns['RUN'] = ['FILENAME']
@@ -181,9 +185,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
181185

182186
for table in columns.keys(): # iterate through all tables
183187
## rename pepTable and protTable to be in line with the SQL schema
184-
if table == 'pepTable':
188+
if table in ['pepTable_nonGlobal','pepTable_global']:
185189
renamed_table = 'SCORE_PEPTIDE'
186-
elif table == 'protTable':
190+
elif table in ['protTable_nonGlobal', 'protTable_global']:
187191
renamed_table = 'SCORE_PROTEIN'
188192
else:
189193
renamed_table = table

0 commit comments

Comments
 (0)