33import pandas as pd
44from pyprophet .export import check_sqlite_table
55from duckdb_extensions import extension_importer
6+ import re
67
78def getPeptideProteinScoreTable (conndb , level ):
89 if level == 'peptide' :
@@ -31,7 +32,7 @@ def getVarColumnNames(condb, tableName):
3132
3233
3334# this method is only currently supported for combined output and not with ipf
34- def export_to_parquet (infile , outfile , transitionLevel , onlyFeatures = False ):
35+ def export_to_parquet (infile , outfile , transitionLevel = False , onlyFeatures = False , noDecoys = False ):
3536 '''
3637 Convert an OSW sqlite file to Parquet format
3738
@@ -66,6 +67,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
6667 CREATE INDEX IF NOT EXISTS idx_protein_protein_id ON PROTEIN (ID);
6768 CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
6869
70+ CREATE INDEX IF NOT EXISTS idx_transition_id ON TRANSITION (ID);
71+ CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_transition_id ON TRANSITION_PRECURSOR_MAPPING (TRANSITION_ID);
72+ CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_precursor_id ON TRANSITION_PRECURSOR_MAPPING (PRECURSOR_ID);
6973 '''
7074
7175 if check_sqlite_table (con , "FEATURE_MS1" ):
@@ -200,19 +204,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
200204
201205 # create a list of all the columns
202206 columns_list = [col for c in columns .values () for col in c ]
207+
208+ # create a list of just aliases for groupby
209+ pattern = re .compile (r"(.*)\sAS" )
210+ alias_list = [ pattern .search (col ).group (1 ) for c in columns .values () for col in c ]
203211
204212 # join the list into a single string separated by a comma and a space
205213 columnsToSelect = ", " .join (columns_list )
206-
207- join_features = "LEFT JOIN" if onlyFeatures else "FULL JOIN"
208-
209- # First read feature data
210- # Feature Data
211- if not transitionLevel :
212- feature_query = f'''
213- SELECT { columnsToSelect }
214- FROM FEATURE
215- { join_features } PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
214+ aliasToSelect = ", " .join (alias_list )
215+
216+ # For feature level group important transition level data into one row separated by ';'
217+ featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else ""
218+ featureLvlSuffix = f'GROUP BY { aliasToSelect } ' if not transitionLevel else ""
219+
220+ decoyExclude = "WHERE PRECURSOR.DECOY == 0" if noDecoys else ""
221+
222+ if not onlyFeatures :
223+ query = f'''
224+ SELECT { columnsToSelect } ,
225+ { featureLvlPrefix }
226+ FROM TRANSITION_PRECURSOR_MAPPING
227+ LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
228+ LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
229+ LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
230+ LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
216231 LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
217232 LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
218233 LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
@@ -224,48 +239,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
224239 { gene_table_joins }
225240 { pepJoin }
226241 { protJoin }
242+ { decoyExclude }
243+ { featureLvlSuffix }
227244 '''
228- else : # is transition level
229-
230- # merge transition and precursor level data
231- if not onlyFeatures :
232- feature_query = f'''
233- SELECT { columnsToSelect }
234- FROM TRANSITION_PRECURSOR_MAPPING
235- LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
236- LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
237- LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
238- LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
239- LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
240- LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
241- LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
242- LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
243- LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
244- LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
245- LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
246- LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
247- { gene_table_joins }
248- { pepJoin }
249- { protJoin }
250- '''
251- else :
252- feature_query = f'''
253- SELECT { columnsToSelect }
254- FROM FEATURE_TRANSITION
255- LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
256- LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
257- LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
258- LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
259- LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
260- LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
261- LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
262- LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
263- LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
264- LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
265- LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
266- LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
267- { gene_table_joins }
268- { pepJoin }
269- { protJoin }
270- '''
271- condb .sql (feature_query ).write_parquet (outfile )
245+ else :
246+ query = f'''
247+ SELECT { columnsToSelect } ,
248+ { featureLvlPrefix }
249+ FROM FEATURE_TRANSITION
250+ LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
251+ LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
252+ LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
253+ LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
254+ LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
255+ LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
256+ LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
257+ LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
258+ LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
259+ LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
260+ LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
261+ LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
262+ { gene_table_joins }
263+ { pepJoin }
264+ { protJoin }
265+ { decoyExclude }
266+ { featureLvlSuffix }
267+ '''
268+ condb .sql (query ).write_parquet (outfile )
0 commit comments