Skip to content

Commit 80b09c7

Browse files
authored
Merge pull request #12 from lrgr/imuse-next
iMuSE-next
2 parents a6a6180 + 2cb4a74 commit 80b09c7

19 files changed

+590
-176
lines changed

app/data/clean_data.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,13 @@ def get_parser():
1010
parser = argparse.ArgumentParser()
1111
parser.add_argument('-s', '--samples-file', type=str, required=True)
1212
parser.add_argument('-c', '--clinical-file', type=str, required=True)
13+
parser.add_argument('-g', '--genes-file', type=str, required=True)
1314
parser.add_argument('-c-sbs', '--counts-sbs-file', type=str, required=True)
1415
parser.add_argument('-c-dbs', '--counts-dbs-file', type=str, required=True)
1516
parser.add_argument('-c-indel', '--counts-indel-file', type=str, required=True)
17+
18+
parser.add_argument('-pid', '--proj-id', type=str, required=True)
19+
parser.add_argument('-g-agg', '--genes-agg-file', type=str, required=True)
1620
return parser
1721

1822
def run( args ):
@@ -50,5 +54,13 @@ def run( args ):
5054
clinical_filtered_df = clinical_filtered_df.reset_index()
5155
clinical_filtered_df.to_csv(args.clinical_file, sep='\t', index=False)
5256

57+
if args.genes_file != "None":
58+
print('* Cleaning genes data')
59+
genes_agg_df = pd.read_csv(args.genes_agg_file, sep='\t')
60+
genes_df = pd.read_csv(args.genes_file, sep='\t')
61+
genes_df["proj_id"] = args.proj_id
62+
genes_df = genes_df.groupby(["proj_id", GENE_SYMBOL]).size().reset_index(name='count')
63+
genes_agg_df = genes_agg_df.append(genes_df, ignore_index=True)
64+
genes_agg_df.to_csv(args.genes_agg_file, sep='\t', index=False)
5365

5466
if __name__ == '__main__':
    # Parse the command-line flags declared by get_parser() and hand the
    # resulting namespace to run().
    cli_args = get_parser().parse_args(sys.argv[1:])
    run(cli_args)

app/data/fetch_data.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,32 @@
11
import pandas as pd
22
import subprocess
33
import os
4+
import sys
5+
6+
# Load our modules
7+
this_file_path = os.path.abspath(os.path.dirname(__file__))
8+
sys.path.append(os.path.normpath(this_file_path + '/../'))
9+
from web_constants import GENE_SYMBOL
10+
411

512
# Base URL of the remote object store.
OBJ_STORE_URL = "https://mutation-signature-explorer.obj.umiacs.umd.edu/"

# Columns of the meta table whose (non-missing) values are file paths that
# the script entry point collects and passes to download().
FILE_COLUMNS = [
    'path_extended_SBS', 'path_counts_SBS',
    'path_extended_DBS', 'path_counts_DBS',
    'path_extended_INDEL', 'path_counts_INDEL',
    'path_clinical',
    'path_samples',
    'path_genes',
]

# Evaluate the DEBUG switch once: any non-empty value of the DEBUG
# environment variable selects the relative (local) paths below, otherwise
# the absolute paths are used.
_DEBUG = bool(os.environ.get("DEBUG", ''))

OBJ_DIR = '../../obj' if _DEBUG else '/obj'
META_FILE = './meta.tsv' if _DEBUG else '/app/data/meta.tsv'

# Aggregate per-project gene counts table, stored inside OBJ_DIR.
GENES_AGG_FILE = os.path.join(OBJ_DIR, 'genes_agg.tsv')

# Path of the cleaning script that clean_samples() runs in a subprocess.
CLEAN_DATA_PY = 'clean_data.py' if _DEBUG else '/app/data/clean_data.py'
26+
27+
def create_genes_agg_file():
    """Create (or reset) the aggregate genes table with only its header row.

    Writes an empty tab-separated table with the columns ``proj_id``,
    ``GENE_SYMBOL`` and ``count`` to ``GENES_AGG_FILE``, overwriting any
    file already at that path.
    """
    # This is the first thing the script entry point does, so the target
    # directory may not exist yet; create it rather than failing in to_csv.
    agg_dir = os.path.dirname(GENES_AGG_FILE)
    if agg_dir:
        os.makedirs(agg_dir, exist_ok=True)

    genes_agg_df = pd.DataFrame(index=[], columns=["proj_id", GENE_SYMBOL, "count"])
    genes_agg_df.to_csv(GENES_AGG_FILE, sep='\t', index=False)
1530

1631
def download(file_list):
1732
for file_path in file_list:
@@ -20,21 +35,33 @@ def download(file_list):
2035
if not os.path.isfile(local_file_path):
2136
subprocess.run(['curl', remote_file_url, '--create-dirs', '-o', local_file_path])
2237

23-
def clean_samples(samples_path, clinical_path, counts_path_sbs, counts_path_dbs, counts_path_indel):
38+
def clean_samples(proj_id, samples_path, clinical_path, counts_path_sbs, counts_path_dbs, counts_path_indel, genes_path):
    """Run the cleaning script (``CLEAN_DATA_PY``) for one project in a subprocess.

    Args:
        proj_id: Project identifier, forwarded as ``-pid``.
        samples_path: Samples file path, relative to ``OBJ_DIR``.
        clinical_path: Clinical file path, relative to ``OBJ_DIR``.
        counts_path_sbs: SBS counts file path, relative to ``OBJ_DIR``.
        counts_path_dbs: DBS counts file path, relative to ``OBJ_DIR``.
        counts_path_indel: INDEL counts file path, relative to ``OBJ_DIR``.
        genes_path: Genes file path, relative to ``OBJ_DIR``. May be missing
            (NaN/None), in which case the literal string ``"None"`` is
            passed for ``-g``.

    The exit status of the child process is not checked.
    """
    # A missing genes path is signalled to the child with the string "None".
    genes_arg = "None" if pd.isna(genes_path) else os.path.join(OBJ_DIR, genes_path)

    # Re-use the interpreter running this script instead of whatever
    # "python" resolves to on PATH; fall back to "python" only if the
    # interpreter path is unavailable.
    python_exe = sys.executable or 'python'

    subprocess.run([
        python_exe, CLEAN_DATA_PY,
        '-s', os.path.join(OBJ_DIR, samples_path),
        '-c', os.path.join(OBJ_DIR, clinical_path),
        '-g', genes_arg,
        '-c-sbs', os.path.join(OBJ_DIR, counts_path_sbs),
        '-c-dbs', os.path.join(OBJ_DIR, counts_path_dbs),
        '-c-indel', os.path.join(OBJ_DIR, counts_path_indel),
        '-g-agg', GENES_AGG_FILE,
        # argv entries must be strings; the id may come from a table cell
        # that was parsed as a number.
        '-pid', str(proj_id),
    ])
3250

3351
if __name__ == "__main__":
    # Start from an empty aggregate genes table.
    create_genes_agg_file()

    # Gather every non-missing path listed in the file columns of the meta
    # table and download them.
    file_list = []
    meta_df = pd.read_csv(META_FILE, sep='\t')
    for column_name in FILE_COLUMNS:
        file_list.extend(meta_df[column_name].dropna().tolist())
    download(file_list)

    # Clean the files of each project (one meta-table row per project).
    for _, project in meta_df.iterrows():
        clean_samples(
            project['id'],
            project['path_samples'],
            project['path_clinical'],
            project['path_counts_SBS'],
            project['path_counts_DBS'],
            project['path_counts_INDEL'],
            project['path_genes'],
        )

app/data/meta.tsv

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)