@@ -326,7 +326,82 @@ def process_datasets(args):
326326 )
327327 )
328328
329+ #-------------------------------------------------------------------
330+ # create proteomics master table
331+ #-------------------------------------------------------------------
332+
333+ proteomics = merge_master_tables (
334+ args = args ,
335+ data_sets = data_sets ,
336+ data_type = 'proteomics'
337+ )
338+
339+ ####
340+ # Imputation step:
341+ # currently we are imputing with the median over all samples
342+ # in which the protein was detected across all datasets.
343+ # The missing values are then filled with that median for each protein.
344+ ####
345+ proteomics = (
346+ proteomics
347+ # the proteomics table has to be transposed first (see below)
348+ # because .fillna does not work as expected with axis==1
349+ .T
350+ .fillna (
351+ # the filling of NAs with 'value' is not implemented for
352+ # axis==1, despite what is documented for pandas>2.0.0
353+ value = proteomics .median (axis = 1 , skipna = True ),
354+ axis = 0
355+ )
356+ .T # transpose back into original orientation
357+ )
358+ # merging the ensembl gene id & gene symbol into the proteomics
359+ # data
360+ proteomics = pd .merge (
361+ proteomics ,
362+ data_gene_names [[
363+ 'entrez_id' ,
364+ 'ensembl_gene_id' ,
365+ 'gene_symbol'
366+ ]],
367+ how = 'left' ,
368+ on = 'entrez_id' ,
369+ )
370+
371+ # moving the ensembl_gene_id & gene_symbol columns to the front of the
372+ # table (column positions 0 and 2) such that when transposing the
373+ # DataFrame they are rows 1 and 3 respectively (1-based)
374+ proteomics .insert (
375+ 1 ,
376+ 'gene_symbol' ,
377+ proteomics .pop ('gene_symbol' )
378+ )
379+ proteomics .insert (
380+ 0 ,
381+ 'ensembl_gene_id' ,
382+ proteomics .pop ('ensembl_gene_id' )
383+ )
384+
385+ proteomics = proteomics [proteomics ['entrez_id' ] != 0 ]
386+ proteomics = proteomics .fillna (0 ).T .reset_index ()
387+ for i in range (0 ,3 ):
388+ proteomics .iloc [i ,0 ] = np .nan
329389
390+ # writing the proteomics data table to 'data_out/x_data/cancer_proteomics.tsv'
391+ outfile_path = args .WORKDIR .joinpath (
392+ "data_out" ,
393+ "x_data" ,
394+ "cancer_proteomics.tsv"
395+ )
396+ (proteomics
397+ .to_csv (
398+ path_or_buf = outfile_path ,
399+ sep = '\t ' ,
400+ header = False ,
401+ index = False
402+ )
403+ )
404+
330405 #-------------------------------------------------------------------
331406 # create copynumber master table & discretized table
332407 #-------------------------------------------------------------------
@@ -869,7 +944,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
869944 for data_set in data_sets :
870945 if data_sets [data_set ].experiments is not None :
871946 if (
872- data_type in ['transcriptomics' , 'copy_number' ] and
947+ data_type in ['transcriptomics' , 'copy_number' , 'proteomics' ] and
873948 getattr (data_sets [data_set ], data_type , None ) is not None
874949 ):
875950 dfs_to_merge .append (
0 commit comments