|
1 | 1 |
|
2 | 2 | import argparse |
3 | | -import functools as ft |
| 3 | +from copy import deepcopy |
4 | 4 | import logging |
5 | 5 | from os import PathLike |
6 | 6 | from pathlib import Path |
@@ -267,11 +267,18 @@ def process_datasets(args): |
267 | 267 | "x_data", |
268 | 268 | "cancer_gene_expression.tsv" |
269 | 269 | ) |
270 | | - merged_transcriptomics.transpose().to_csv( |
271 | | - path_or_buf=outfile_path, |
272 | | - sep='\t', |
273 | | - header=False |
274 | | - ) |
| 270 | + # some ML algorithms need full matrices as input. |
| 271 | + # This backfills NAs with 0s - the assumed "neutral" value for |
| 272 | + # gene expression data |
| 273 | + (merged_transcriptomics |
| 274 | + .fillna(0) |
| 275 | + .transpose() |
| 276 | + .to_csv( |
| 277 | + path_or_buf=outfile_path, |
| 278 | + sep='\t', |
| 279 | + header=False |
| 280 | + ) |
| 281 | + ) |
275 | 282 |
|
276 | 283 |
|
277 | 284 | #------------------------------------------------------------------- |
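Note: a minimal sketch of what the new chained call above does, assuming merged_transcriptomics is a genes x samples frame indexed by entrez_id; the toy frame, values, and output path below are made up for illustration only:

    import pandas as pd

    # hypothetical genes x samples frame with a gap left by the outer merge
    toy = pd.DataFrame(
        {"sample_a": [1.2, None], "sample_b": [0.4, 2.1]},
        index=pd.Index([7157, 672], name="entrez_id"),
    )

    # same chain as in the hunk above: fill the missing value with the assumed
    # neutral expression of 0, flip to samples x genes, write a header-less TSV
    (toy
     .fillna(0)
     .transpose()
     .to_csv(path_or_buf="cancer_gene_expression.tsv", sep="\t", header=False))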
@@ -308,14 +315,20 @@ def process_datasets(args): |
308 | 315 | "x_data", |
309 | 316 | "cancer_copy_number.tsv" |
310 | 317 | ) |
311 | | - merged_copy_number.transpose().to_csv( |
312 | | - path_or_buf=outfile_path, |
313 | | - sep='\t', |
314 | | - header=False |
315 | | - ) |
| 318 | + (merged_copy_number |
| 319 | + .fillna(1) |
| 320 | + .transpose() |
| 321 | + .to_csv( |
| 322 | + path_or_buf=outfile_path, |
| 323 | + sep='\t', |
| 324 | + header=False |
| 325 | + ) |
| 326 | + ) |
316 | 327 | # join the "meta data tables" like copynumber etc. |
317 | 328 |
|
318 | 329 |
|
| 330 | + |
| 331 | + |
319 | 332 | def split_data_sets( |
320 | 333 | args: dict, |
321 | 334 | data_sets: dict, |
@@ -501,21 +514,35 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'): |
501 | 514 | data_sets[data_set].format(data_type=data_type) |
502 | 515 | ) |
503 | 516 |
|
504 | | - merged_data = ft.reduce( |
505 | | - lambda left_df, right_df: pd.merge( |
506 | | - left_df, |
507 | | - right_df, |
508 | | - on='entrez_id', |
509 | | - how='outer', |
510 | | - ), |
511 | | - dfs_to_merge, |
512 | | - ) |
| 517 | + merged_data = None |
| 518 | + for df in dfs_to_merge: |
| 519 | + if merged_data is None: |
| 520 | + # filling the return DF with a copy of the "first" DF |
| 521 | + merged_data = deepcopy(df) |
| 522 | + else: |
| 523 | + # merging routine |
| 524 | + # pandas.merge suffixes overlapping non-key columns, by |
| 525 | + # default as C_x & C_y. With suffixes=('', '__rm') the left |
| 526 | + # copy keeps its name and the duplicate from the right can |
| 527 | + # simply be dropped, since both hold the same values in our data |
| 528 | + merged_data = merged_data.merge( |
| 529 | + df, |
| 530 | + on='entrez_id', |
| 531 | + suffixes=('', '__rm'), |
| 532 | + how='outer' |
| 533 | + ) |
| 534 | + merged_data.columns = merged_data.columns.astype(str) |
| 535 | + # the "C_y" removal routine |
| 536 | + merged_data = ( |
| 537 | + merged_data |
| 538 | + .loc[:, ~merged_data.columns.str.contains('__rm')] |
| 539 | + ) |
513 | 540 |
|
514 | | - # temporary fix to values that should be int but currently aren't |
515 | | - # in the coderdata dataset storage |
| 541 | + # Casting col and row indices back to int |
| 542 | + merged_data.columns = merged_data.columns.astype(int) |
516 | 543 | if not merged_data.index.dtype == int: |
517 | 544 | merged_data.index = merged_data.index.astype(int) |
518 | | - |
| 545 | + |
519 | 546 | return merged_data |
520 | 547 |
|
521 | 548 |
|
|
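Note: a standalone sketch of the suffix trick used in the rewritten merge loop in merge_master_tables above, with two made-up per-dataset frames; it shows why each '__rm' column can be dropped after the outer merge (column names and values are illustrative only):

    import pandas as pd

    # two hypothetical dataset matrices that share the sample column "s1"
    left = pd.DataFrame({"entrez_id": [7157, 672], "s1": [1.0, 2.0], "s2": [3.0, 4.0]})
    right = pd.DataFrame({"entrez_id": [7157, 672], "s1": [1.0, 2.0], "s3": [5.0, 6.0]})

    # overlapping non-key columns would normally come out as s1_x / s1_y;
    # with suffixes=('', '__rm') the left copy keeps its plain name and the
    # duplicate from the right is tagged for removal
    merged = left.merge(right, on="entrez_id", suffixes=("", "__rm"), how="outer")
    # columns are now: entrez_id, s1, s2, s1__rm, s3
    merged = merged.loc[:, ~merged.columns.str.contains("__rm")]
    # columns after the drop: entrez_id, s1, s2, s3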