@@ -226,56 +226,46 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
226226 return (improve_mapped_cn_df )
227227
228228
229- def map_transcriptomics (transciptomics_data , improve_id_data , entrez_data ):
229+
230+
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
    """Reshape a wide proteomics table to long form and map it to entrez / improve sample ids.

    Parameters
    ----------
    proteomics_data : pandas.DataFrame or path accepted by ``pd.read_csv``
        Wide table whose FIRST ROW holds the real header. The first cell of
        that row labels the gene-symbol column and the remaining cells are
        sample names. Every following row is one gene.
    improve_id_data : pandas.DataFrame or path accepted by ``pd.read_csv``
        Must contain the columns ``other_id`` (matched against sample names)
        and ``improve_sample_id``.
    entrez_data : pandas.DataFrame or path accepted by ``pd.read_csv``
        Must contain the columns ``other_id`` (matched against gene symbols)
        and ``entrez_id``.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, sample) pair that could be mapped on both keys,
        with columns ``entrez_id``, ``proteomics``, ``improve_sample_id``,
        ``source`` (always "Synapse") and ``study`` (always "liver").
        Rows containing any missing value are dropped.

    Notes
    -----
    The input frames are never modified; all reshaping happens on a copy.
    """
    # read in data
    if not isinstance(proteomics_data, pd.DataFrame):
        proteomics_data = pd.read_csv(proteomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # first, replace colnames with first row and delete first row.
    # Work on a copy so a DataFrame handed in by the caller keeps its own
    # column labels, and label the first column 'gene_symbol' positionally.
    header = proteomics_data.iloc[0, :].tolist()
    wide_prot_df = proteomics_data.iloc[1:].copy()
    wide_prot_df.columns = ['gene_symbol'] + header[1:]

    # melt the df so there is one sample and prot per row.
    # Naming the melted columns explicitly avoids depending on the index
    # label of the header row (which is only 0 for a default RangeIndex).
    long_prot_df = pd.melt(
        wide_prot_df,
        id_vars=['gene_symbol'],
        var_name='sample_name',
        value_name='proteomics',
    )

    # map gene names to entrez id's
    gene_map = entrez_data[['other_id', 'entrez_id']].drop_duplicates()
    mapped_proteomics_df = pd.merge(
        long_prot_df,
        gene_map,
        how='inner',
        left_on="gene_symbol",
        right_on="other_id",
    )
    mapped_proteomics_df = mapped_proteomics_df.dropna(subset=['entrez_id'])

    # map sample names to improve sample id's
    sample_map = improve_id_data[['other_id', 'improve_sample_id']].drop_duplicates()
    mapped_proteomics_df = pd.merge(
        mapped_proteomics_df,
        sample_map,
        how='inner',
        left_on="sample_name",
        right_on="other_id",
    )

    # clean up column names and data types.
    # Both merges contributed an 'other_id' column, hence the _x/_y suffixes.
    mapped_proteomics_df = mapped_proteomics_df.drop(
        columns=['gene_symbol', 'sample_name', 'other_id_x', 'other_id_y']
    )
    mapped_proteomics_df['source'] = "Synapse"
    mapped_proteomics_df['study'] = "liver"
    mapped_proteomics_df = mapped_proteomics_df.dropna()
    mapped_proteomics_df = mapped_proteomics_df.astype(
        {'entrez_id': 'int', 'improve_sample_id': 'int'}
    )
    mapped_proteomics_df = mapped_proteomics_df[
        ['entrez_id', 'proteomics', 'improve_sample_id', 'source', 'study']
    ]

    return mapped_proteomics_df
279269
280270
281271if __name__ == "__main__" :
@@ -293,6 +283,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
293283 parser .add_argument ('-M' , '--mutations' , action = 'store_true' , default = False , help = 'Generate mutations data' )
294284 parser .add_argument ('-C' , '--copy_number' , action = 'store_true' , default = False , help = 'Generate copy number data' )
295285
286+
296287 args = parser .parse_args ()
297288
298289
0 commit comments