lrgr
diff --git a/‎app/data/clean_data.py‎
Lines changed: 12 additions & 0 deletions b/‎app/data/clean_data.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎app/data/fetch_data.py‎
Lines changed: 32 additions & 5 deletions b/‎app/data/fetch_data.py‎
Lines changed: 32 additions & 5 deletions
diff --git a/‎app/data/meta.tsv‎
Lines changed: 1 addition & 1 deletion b/‎app/data/meta.tsv‎
Lines changed: 1 addition & 1 deletion
@@ -10,9 +10,13 @@ def get_parser():
     parser = argparse.ArgumentParser()
     parser.add_argument('-s', '--samples-file', type=str, required=True)
     parser.add_argument('-c', '--clinical-file', type=str, required=True)
+    parser.add_argument('-g', '--genes-file', type=str, required=True)
     parser.add_argument('-c-sbs', '--counts-sbs-file', type=str, required=True)
     parser.add_argument('-c-dbs', '--counts-dbs-file', type=str, required=True)
     parser.add_argument('-c-indel', '--counts-indel-file', type=str, required=True)
+
+    parser.add_argument('-pid', '--proj-id', type=str, required=True)
+    parser.add_argument('-g-agg', '--genes-agg-file', type=str, required=True)
     return parser
 
 def run( args ):
@@ -50,5 +54,13 @@ def run( args ):
     clinical_filtered_df = clinical_filtered_df.reset_index()
     clinical_filtered_df.to_csv(args.clinical_file, sep='\t', index=False)
 
+    if args.genes_file != "None":
+        print('* Cleaning genes data')
+        genes_agg_df = pd.read_csv(args.genes_agg_file, sep='\t')
+        genes_df = pd.read_csv(args.genes_file, sep='\t')
+        genes_df["proj_id"] = args.proj_id
+        genes_df = genes_df.groupby(["proj_id", GENE_SYMBOL]).size().reset_index(name='count')
+        genes_agg_df = genes_agg_df.append(genes_df, ignore_index=True)
+        genes_agg_df.to_csv(args.genes_agg_file, sep='\t', index=False)
 
 if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) )
@@ -1,17 +1,32 @@
 import pandas as pd
 import subprocess
 import os
+import sys
+
+# Load our modules
+this_file_path = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.normpath(this_file_path + '/../'))
+from web_constants import GENE_SYMBOL
+
 
+# Base URL of the remote object store; presumably prefixed to each relative
+# path by download() (that part of the function is not shown) -- confirm.
 OBJ_STORE_URL = "https://mutation-signature-explorer.obj.umiacs.umd.edu/"
+# Columns of the metadata table whose non-null values are collected into the
+# list of files to download.
 FILE_COLUMNS = [
   'path_extended_SBS', 'path_counts_SBS', 
   'path_extended_DBS', 'path_counts_DBS', 
   'path_extended_INDEL', 'path_counts_INDEL', 
   'path_clinical',
-  'path_samples'
+  'path_samples',
+  'path_genes'
 ]
+# Any non-empty DEBUG environment value (even "0" or "false") selects the
+# relative development paths; unset or empty selects the absolute paths.
 OBJ_DIR = '../../obj' if bool(os.environ.get("DEBUG", '')) else '/obj'
 META_FILE = './meta.tsv' if bool(os.environ.get("DEBUG", '')) else '/app/data/meta.tsv'
+# Aggregate table of gene counts per project, stored inside OBJ_DIR.
+GENES_AGG_FILE = os.path.join(OBJ_DIR, 'genes_agg.tsv')
+
+# Location of the cleaning script that clean_samples() runs as a subprocess.
+CLEAN_DATA_PY = 'clean_data.py' if bool(os.environ.get("DEBUG", '')) else '/app/data/clean_data.py'
+
+def create_genes_agg_file():
+  """Create (or reset) the aggregate genes TSV so it holds only a header row.
+
+  Writes an empty table with the columns "proj_id", GENE_SYMBOL and "count"
+  to GENES_AGG_FILE, overwriting any existing file.
+
+  The directory holding the file is created first: DataFrame.to_csv does not
+  create missing parent directories, and this function is called before any
+  download has had a chance to create the object directory.
+  """
+  os.makedirs(os.path.dirname(GENES_AGG_FILE), exist_ok=True)
+  genes_agg_df = pd.DataFrame(index=[], columns=["proj_id", GENE_SYMBOL, "count"])
+  genes_agg_df.to_csv(GENES_AGG_FILE, sep='\t', index=False)
 
 def download(file_list):
   for file_path in file_list:
@@ -20,21 +35,33 @@ def download(file_list):
     if not os.path.isfile(local_file_path):
       subprocess.run(['curl', remote_file_url, '--create-dirs', '-o', local_file_path])
 
-def clean_samples(samples_path, clinical_path, counts_path_sbs, counts_path_dbs, counts_path_indel):
+def clean_samples(proj_id, samples_path, clinical_path, counts_path_sbs, counts_path_dbs, counts_path_indel, genes_path):
+  """Run the cleaning script on one project's files as a subprocess.
+
+  All *_path arguments are paths relative to OBJ_DIR. genes_path may be
+  missing (NaN); in that case the literal string "None" is passed for '-g',
+  which the cleaning script compares against to skip its genes step.
+
+  proj_id is converted with str() because subprocess.run only accepts
+  string-like arguments, while a value read from a numeric table column
+  would arrive here as a number and raise TypeError.
+
+  The exit status of the subprocess is not checked.
+  """
   subprocess.run([
-    'python', 'clean_data.py', 
+    'python', CLEAN_DATA_PY, 
     '-s', os.path.join(OBJ_DIR, samples_path), 
     '-c', os.path.join(OBJ_DIR, clinical_path),
+    '-g', (os.path.join(OBJ_DIR, genes_path) if not pd.isna(genes_path) else "None"),
     '-c-sbs', os.path.join(OBJ_DIR, counts_path_sbs),
     '-c-dbs', os.path.join(OBJ_DIR, counts_path_dbs),
-    '-c-indel', os.path.join(OBJ_DIR, counts_path_indel)
+    '-c-indel', os.path.join(OBJ_DIR, counts_path_indel),
+    '-g-agg', GENES_AGG_FILE,
+    '-pid', str(proj_id)
   ])
 
 if __name__ == "__main__":
+  # Reset the aggregate genes file to a header-only table before the
+  # per-project cleaning steps run.
+  create_genes_agg_file()
   file_list = []
   df = pd.read_csv(META_FILE, sep='\t')
+  # Gather every non-null path from the file columns of the metadata table.
   for file_column in FILE_COLUMNS:
     file_list += df[file_column].dropna().tolist()
   download(file_list)
+  # Run the cleaning script once per metadata row. 'path_genes' may be NaN
+  # for a row; clean_samples() turns that into the "None" sentinel.
   for index, row in df.iterrows():
-    clean_samples(row['path_samples'], row['path_clinical'], row['path_counts_SBS'], row['path_counts_DBS'], row['path_counts_INDEL'])
+    clean_samples(
+      row['id'],
+      row['path_samples'], 
+      row['path_clinical'], 
+      row['path_counts_SBS'], 
+      row['path_counts_DBS'], 
+      row['path_counts_INDEL'],
+      row['path_genes']
+    )