Skip to content

Commit d63a378

Browse files
committed
fixed merging of mastertables to account for "x" and "y" columns after merging
1 parent 41c9991 commit d63a378

File tree

1 file changed

+50
-23
lines changed

1 file changed

+50
-23
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
import argparse
3-
import functools as ft
3+
from copy import deepcopy
44
import logging
55
from os import PathLike
66
from pathlib import Path
@@ -267,11 +267,18 @@ def process_datasets(args):
267267
"x_data",
268268
"cancer_gene_expression.tsv"
269269
)
270-
merged_transcriptomics.transpose().to_csv(
271-
path_or_buf=outfile_path,
272-
sep='\t',
273-
header=False
274-
)
270+
# some ML algorithms need full matrices as input.
271+
# This fills NAs with 0s - the assumed "neutral" value for
272+
# gene expression data
273+
(merged_transcriptomics
274+
.fillna(0)
275+
.transpose()
276+
.to_csv(
277+
path_or_buf=outfile_path,
278+
sep='\t',
279+
header=False
280+
)
281+
)
275282

276283

277284
#-------------------------------------------------------------------
@@ -308,14 +315,20 @@ def process_datasets(args):
308315
"x_data",
309316
"cancer_copy_number.tsv"
310317
)
311-
merged_copy_number.transpose().to_csv(
312-
path_or_buf=outfile_path,
313-
sep='\t',
314-
header=False
315-
)
318+
(merged_copy_number
319+
.fillna(1)
320+
.transpose()
321+
.to_csv(
322+
path_or_buf=outfile_path,
323+
sep='\t',
324+
header=False
325+
)
326+
)
316327
# join the "meta data tables" like copynumber etc.
317328

318329

330+
331+
319332
def split_data_sets(
320333
args: dict,
321334
data_sets: dict,
@@ -501,21 +514,35 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
501514
data_sets[data_set].format(data_type=data_type)
502515
)
503516

504-
merged_data = ft.reduce(
505-
lambda left_df, right_df: pd.merge(
506-
left_df,
507-
right_df,
508-
on='entrez_id',
509-
how='outer',
510-
),
511-
dfs_to_merge,
512-
)
517+
merged_data = None
518+
for df in dfs_to_merge:
519+
if merged_data is None:
520+
# filling the return DF with a copy of the "first" DF
521+
merged_data = deepcopy(df)
522+
else:
523+
# merging routine
524+
# pandas.merge always creates C_x & C_y if column C is in
525+
# both the right and left DF. By defining the suffixes
526+
# we can just delete the 'y' column since C_x == C_y in our
527+
# data
528+
merged_data = merged_data.merge(
529+
df,
530+
on='entrez_id',
531+
suffixes=('', '__rm'),
532+
how='outer'
533+
)
534+
merged_data.columns = merged_data.columns.astype(str)
535+
# the "C_y" removal routine
536+
merged_data = (
537+
merged_data
538+
.loc[:, ~merged_data.columns.str.contains('__rm')]
539+
)
513540

514-
# temporary fix to values that should be int but currently aren't
515-
# in the coderdata dataset storage
541+
# Casting col and row indices back to int
542+
merged_data.columns.astype(int)
516543
if not merged_data.index.dtype == int:
517544
merged_data.index = merged_data.index.astype(int)
518-
545+
519546
return merged_data
520547

521548

0 commit comments

Comments
 (0)