Skip to content

Commit fb709c0

Browse files
committed
added proteomics return
1 parent 52dfa8c commit fb709c0

File tree

1 file changed

+76
-1
lines changed

1 file changed

+76
-1
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,82 @@ def process_datasets(args):
326326
)
327327
)
328328

329+
#-------------------------------------------------------------------
330+
# create proteomics master table
331+
#-------------------------------------------------------------------
332+
333+
proteomics = merge_master_tables(
334+
args=args,
335+
data_sets=data_sets,
336+
data_type='proteomics'
337+
)
338+
339+
####
340+
# Imputation step:
341+
# currently we are imputing by generating the median over all samples
342+
# in which the protein was detected across all datasets.
343+
# The missing values are then back-filled for each protein.
344+
####
345+
proteomics = (
346+
proteomics
347+
# the proteomics table has to be transposed first (see below)
348+
# due to .fillna not working as expected with axis==1
349+
.T
350+
.fillna(
351+
# the filling of NAs with 'value' is not implemented for
352+
# axis==1, despite what is documented for pandas>2.0.0
353+
value=proteomics.median(axis=1, skipna=True),
354+
axis=0
355+
)
356+
.T # transpose back into original orientation
357+
)
358+
# merging Ensembl gene id & gene symbol into the proteomics
359+
# data
360+
proteomics = pd.merge(
361+
proteomics,
362+
data_gene_names[[
363+
'entrez_id',
364+
'ensembl_gene_id',
365+
'gene_symbol'
366+
]],
367+
how='left',
368+
on='entrez_id',
369+
)
370+
371+
# moving ensembl_gene_id & gene_symbol columns to the front of the table
372+
# such that when transposing the DataFrame they are row 3 and 2
373+
# respectively
374+
proteomics.insert(
375+
1,
376+
'gene_symbol',
377+
proteomics.pop('gene_symbol')
378+
)
379+
proteomics.insert(
380+
0,
381+
'ensembl_gene_id',
382+
proteomics.pop('ensembl_gene_id')
383+
)
384+
385+
proteomics = proteomics[proteomics['entrez_id'] != 0]
386+
proteomics = proteomics.fillna(0).T.reset_index()
387+
for i in range(0,3):
388+
proteomics.iloc[i,0] = np.nan
329389

390+
# writing the proteomics datatable to '/x_data/*_proteomics.tsv'
391+
outfile_path = args.WORKDIR.joinpath(
392+
"data_out",
393+
"x_data",
394+
"cancer_proteomics.tsv"
395+
)
396+
(proteomics
397+
.to_csv(
398+
path_or_buf=outfile_path,
399+
sep='\t',
400+
header=False,
401+
index=False
402+
)
403+
)
404+
330405
#-------------------------------------------------------------------
331406
# create copynumber master table & discretized table
332407
#-------------------------------------------------------------------
@@ -869,7 +944,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
869944
for data_set in data_sets:
870945
if data_sets[data_set].experiments is not None:
871946
if (
872-
data_type in ['transcriptomics', 'copy_number'] and
947+
data_type in ['transcriptomics', 'copy_number', 'proteomics'] and
873948
getattr(data_sets[data_set], data_type, None) is not None
874949
):
875950
dfs_to_merge.append(

0 commit comments

Comments
 (0)