@@ -225,7 +225,56 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
225225
226226 return (improve_mapped_cn_df )
227227
def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
    """
    Convert a raw transcriptomics counts table into a long table of TPM values
    keyed by entrez gene id and improve sample id.

    Steps performed:
      1. Load any input that is not already a DataFrame with ``pd.read_csv``.
      2. Look up a gene symbol for every id in the 'Unnamed: 0' column
         (renamed to 'stable_id') through the mygene web service.
      3. Write the symbol-labelled counts to /tmp and run ``tpmFromCounts.py``
         on them to obtain TPM values.
      4. Melt the TPM table to one (gene, sample) pair per row.
      5. Inner-join against ``entrez_data`` and ``improve_id_data`` on their
         'other_id' columns.

    Parameters
    ----------
    transciptomics_data : pd.DataFrame or path accepted by pd.read_csv
        Counts table; its 'Unnamed: 0' column holds the gene identifiers and
        the remaining columns are treated as samples. (The parameter name is
        misspelled but kept as-is so existing keyword callers keep working.)
    improve_id_data : pd.DataFrame or path accepted by pd.read_csv
        Must provide the columns 'other_id' and 'improve_sample_id'.
    entrez_data : pd.DataFrame or path accepted by pd.read_csv
        Must provide the columns 'other_id' and 'entrez_id'.

    Returns
    -------
    pd.DataFrame
        Columns: 'entrez_id', 'transcriptomics', 'improve_sample_id',
        'source', 'study'.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tpmFromCounts.py`` exits with a non-zero status.
    """
    # Imported locally so this function does not depend on a module-level import.
    import subprocess

    counts_path = "/tmp/counts_for_tpm_conversion.tsv"
    tpm_path = "/tmp/transcriptomics_tpm.tsv"
    genome_build = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz"

    # read in data
    if not isinstance(transciptomics_data, pd.DataFrame):
        transciptomics_data = pd.read_csv(transciptomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # first, convert genes, which are in ensembl id's, to gene names
    transciptomics_data = transciptomics_data.rename(columns={'Unnamed: 0': 'stable_id'})
    mg = mygene.MyGeneInfo()
    ensembl_ids = transciptomics_data['stable_id'].values
    gene_info_list = mg.getgenes(ensembl_ids, fields='symbol')
    gene_df = pd.DataFrame.from_dict(gene_info_list)
    for_tpm = pd.merge(
        transciptomics_data,
        gene_df[['query', 'symbol']],
        how='inner',
        left_on="stable_id",
        right_on="query",
    )
    # ids without a symbol cannot be mapped further, so drop them here
    for_tpm = for_tpm.dropna(subset=['symbol'])
    # from here on 'stable_id' holds the gene symbol instead of the original id
    for_tpm = for_tpm.drop(columns=['query', 'stable_id'])
    for_tpm = for_tpm.rename(columns={'symbol': 'stable_id'})
    # the separator must be a single tab: to_csv rejects multi-character delimiters
    for_tpm.to_csv(counts_path, sep='\t')

    # run tpmFromCounts.py to convert counts to tpm; check=True makes a failed
    # conversion raise instead of silently reading a missing or stale output file
    subprocess.run(
        [
            "python3", "tpmFromCounts.py",
            "--counts", counts_path,
            "--genome_build", genome_build,
            "--gene_col", "stable_id",
            "--exclude_col", "stable_id",
            "--out_file", tpm_path,
        ],
        check=True,
    )

    # melt the df so there is one sample and gene per row
    long_transcriptomics_df = pd.read_csv(tpm_path, sep='\t')
    long_transcriptomics_df = pd.melt(
        long_transcriptomics_df,
        id_vars=['stable_id'],
        value_vars=long_transcriptomics_df.columns[long_transcriptomics_df.columns != 'stable_id'],
    )
    # melt names the sample column 'variable'; it is used as the join key below
    long_transcriptomics_df = long_transcriptomics_df.rename(columns={'value': 'transcriptomics'})

    # map gene names to entrez id's
    mapped_transcriptomics_df = pd.merge(
        long_transcriptomics_df,
        entrez_data[['other_id', 'entrez_id']].drop_duplicates(),
        how='inner',
        left_on="stable_id",
        right_on="other_id",
    )
    mapped_transcriptomics_df = mapped_transcriptomics_df.dropna(subset=['entrez_id'])

    # map sample names to improve sample id's
    mapped_transcriptomics_df = pd.merge(
        mapped_transcriptomics_df,
        improve_id_data[['other_id', 'improve_sample_id']].drop_duplicates(),
        how='inner',
        left_on="variable",
        right_on="other_id",
    )

    # clean up column names and data types
    # ('other_id_x' / 'other_id_y' are the suffixed join keys from the two merges)
    mapped_transcriptomics_df = mapped_transcriptomics_df.drop(
        columns=['stable_id', 'variable', 'other_id_x', 'other_id_y']
    )
    mapped_transcriptomics_df['source'] = "Synapse"
    mapped_transcriptomics_df['study'] = "liver"
    mapped_transcriptomics_df = mapped_transcriptomics_df.dropna()
    mapped_transcriptomics_df = mapped_transcriptomics_df.astype(
        {'entrez_id': 'int', 'improve_sample_id': 'int'}
    )
    mapped_transcriptomics_df = mapped_transcriptomics_df[
        ['entrez_id', 'transcriptomics', 'improve_sample_id', 'source', 'study']
    ]

    return mapped_transcriptomics_df
229278
230279
231280def map_proteomics (proteomics_data , improve_id_data , entrez_data ):
0 commit comments