11import pandas as pd
22import subprocess
33import os
4+ import sys
5+
6+ # Load our modules
7+ this_file_path = os .path .abspath (os .path .dirname (__file__ ))
8+ sys .path .append (os .path .normpath (this_file_path + '/../' ))
9+ from web_constants import GENE_SYMBOL
10+
411
# Base URL of the remote object store that files are downloaded from.
OBJ_STORE_URL = "https://mutation-signature-explorer.obj.umiacs.umd.edu/"

# Columns of the meta file whose (non-empty) values are collected into the
# list of files to download.
FILE_COLUMNS = [
    'path_extended_SBS',
    'path_counts_SBS',
    'path_extended_DBS',
    'path_counts_DBS',
    'path_extended_INDEL',
    'path_counts_INDEL',
    'path_clinical',
    'path_samples',
    'path_genes',
]

# Any non-empty DEBUG environment variable selects the relative (local) paths;
# otherwise the absolute paths are used.
_DEBUG = bool(os.environ.get("DEBUG", ''))
if _DEBUG:
    OBJ_DIR = '../../obj'
    META_FILE = './meta.tsv'
    CLEAN_DATA_PY = 'clean_data.py'
else:
    OBJ_DIR = '/obj'
    META_FILE = '/app/data/meta.tsv'
    CLEAN_DATA_PY = '/app/data/clean_data.py'

# Gene aggregate file, stored alongside the downloaded objects.
GENES_AGG_FILE = os.path.join(OBJ_DIR, 'genes_agg.tsv')
26+
def create_genes_agg_file():
    """Write a header-only, tab-separated gene aggregate file.

    The file is written to ``GENES_AGG_FILE`` with the columns
    ``proj_id``, ``GENE_SYMBOL`` and ``count`` and no data rows. Any
    existing file at that path is overwritten.

    NOTE(review): presumably clean_data.py appends per-project rows to this
    file (it receives the path via ``-g-agg``) -- confirm against that script.
    """
    # The target directory may not exist yet: downloads (which create
    # directories via curl) only run after this function is called.
    agg_dir = os.path.dirname(GENES_AGG_FILE)
    if agg_dir:
        os.makedirs(agg_dir, exist_ok=True)

    header_only_df = pd.DataFrame(index=[], columns=["proj_id", GENE_SYMBOL, "count"])
    # The delimiter must be exactly one character (a single tab); a longer
    # separator is rejected by the CSV writer.
    header_only_df.to_csv(GENES_AGG_FILE, sep='\t', index=False)
1530
1631def download (file_list ):
1732 for file_path in file_list :
@@ -20,21 +35,33 @@ def download(file_list):
2035 if not os .path .isfile (local_file_path ):
2136 subprocess .run (['curl' , remote_file_url , '--create-dirs' , '-o' , local_file_path ])
2237
def clean_samples(proj_id, samples_path, clinical_path, counts_path_sbs, counts_path_dbs, counts_path_indel, genes_path):
    """Run the clean-data script for one project in a child Python process.

    Args:
        proj_id: Project identifier, forwarded as ``-pid``. Converted to
            ``str`` because subprocess arguments must be strings and a
            value read from a DataFrame row may be numeric.
        samples_path: Samples file path, relative to ``OBJ_DIR`` (``-s``).
        clinical_path: Clinical file path, relative to ``OBJ_DIR`` (``-c``).
        counts_path_sbs: SBS counts path, relative to ``OBJ_DIR`` (``-c-sbs``).
        counts_path_dbs: DBS counts path, relative to ``OBJ_DIR`` (``-c-dbs``).
        counts_path_indel: INDEL counts path, relative to ``OBJ_DIR``
            (``-c-indel``).
        genes_path: Genes file path, relative to ``OBJ_DIR`` (``-g``). When
            it is missing (NaN/None) the literal string ``"None"`` is passed
            instead.

    The exit status of the child process is not checked.
    """
    # Only the genes path has a placeholder for a missing value; every other
    # path is joined onto OBJ_DIR unconditionally.
    if pd.isna(genes_path):
        genes_arg = "None"
    else:
        genes_arg = os.path.join(OBJ_DIR, genes_path)

    command = [
        'python', CLEAN_DATA_PY,
        '-s', os.path.join(OBJ_DIR, samples_path),
        '-c', os.path.join(OBJ_DIR, clinical_path),
        '-g', genes_arg,
        '-c-sbs', os.path.join(OBJ_DIR, counts_path_sbs),
        '-c-dbs', os.path.join(OBJ_DIR, counts_path_dbs),
        '-c-indel', os.path.join(OBJ_DIR, counts_path_indel),
        '-g-agg', GENES_AGG_FILE,
        # subprocess rejects non-string arguments (e.g. an integer id).
        '-pid', str(proj_id),
    ]
    subprocess.run(command)
3250
if __name__ == "__main__":
    # Start from a fresh, header-only gene aggregate file.
    create_genes_agg_file()

    # The meta file is tab-separated; the separator must be a single tab,
    # otherwise pandas treats a longer string as a regular expression.
    meta_df = pd.read_csv(META_FILE, sep='\t')

    # Gather every non-missing path listed in the file columns and fetch them.
    file_list = []
    for file_column in FILE_COLUMNS:
        file_list.extend(meta_df[file_column].dropna().tolist())
    download(file_list)

    # Run the cleaning step once per row of the meta file.
    for _, row in meta_df.iterrows():
        clean_samples(
            row['id'],
            row['path_samples'],
            row['path_clinical'],
            row['path_counts_SBS'],
            row['path_counts_DBS'],
            row['path_counts_INDEL'],
            row['path_genes']
        )
0 commit comments