isb-cgc
diff --git a/.gitignore b/.gitignore
Lines changed: 6 additions & 0 deletions
diff --git a/api/Cohort.py b/api/Cohort.py
Lines changed: 62 additions & 104 deletions
diff --git a/api/data_access.py b/api/data_access.py
Lines changed: 24 additions & 23 deletions
diff --git a/api/isb_cgc_api/cohorts_create.py b/api/isb_cgc_api/cohorts_create.py
Lines changed: 12 additions & 19 deletions
diff --git a/api/isb_cgc_api/cohorts_delete.py b/api/isb_cgc_api/cohorts_delete.py
Lines changed: 0 additions & 1 deletion
diff --git a/api/isb_cgc_api/cohorts_get.py b/api/isb_cgc_api/cohorts_get.py
Lines changed: 14 additions & 14 deletions
diff --git a/api/isb_cgc_api/cohorts_googlegenomics.py b/api/isb_cgc_api/cohorts_googlegenomics.py
Lines changed: 1 addition & 1 deletion
diff --git a/api/isb_cgc_api/cohorts_preview.py b/api/isb_cgc_api/cohorts_preview.py
Lines changed: 2 additions & 2 deletions
diff --git a/api/isb_cgc_api/isb_cgc_api_helpers.py b/api/isb_cgc_api/isb_cgc_api_helpers.py
Lines changed: 10 additions & 10 deletions
@@ -2,11 +2,16 @@
 __pycache__/
 *.py[~cod]
 
+# eclipse files
+.project
+.pydevproject
+
 # C extensions
 *.so
 
 # Distribution / packaging
 .Python
+doc/
 env/
 venv/
 build/
@@ -19,6 +24,7 @@ lib64/
 parts/
 sdist/
 var/
+*_diff.txt
 *.egg-info/
 .installed.cfg
 *.egg
 
@@ -37,7 +37,7 @@
 from api.pairwise_api import PairwiseResults, PairwiseResultVector, PairwiseFilterMessage
 from api.api_helpers import sql_connection
 
-from projects.models import Study
+from projects.models import Project
 
 import sys
 
@@ -47,33 +47,33 @@
     'BMI': 'kg/m^2',
 }
 
-ISB_CGC_STUDIES = {
+ISB_CGC_PROJECTS = {
     'list': []
 }
 
 # DUPLICATE METHOD
 # Due to the way sql connections are done, it's easiest to duplicate this method and the static variable
 # it creates. The original is in Cohorts/views, and all changes will happen there first.
 #
-# Generate the ISB_CGC_STUDIES['list'] value set based on the get_isbcgc_study_set sproc
-def fetch_isbcgc_study_set():
+# Generate the ISB_CGC_PROJECTS['list'] value set based on the get_isbcgc_project_set sproc
+def fetch_isbcgc_project_set():
     try:
         cursor = None
         db = sql_connection()
-        if not ISB_CGC_STUDIES['list'] or len(ISB_CGC_STUDIES['list']) <= 0:
+        if not ISB_CGC_PROJECTS['list'] or len(ISB_CGC_PROJECTS['list']) <= 0:
             cursor = db.cursor()
-            cursor.execute("SELECT COUNT(SPECIFIC_NAME) FROM INFORMATION_SCHEMA.ROUTINES WHERE SPECIFIC_NAME = 'get_isbcgc_study_set';")
+            cursor.execute("SELECT COUNT(SPECIFIC_NAME) FROM INFORMATION_SCHEMA.ROUTINES WHERE SPECIFIC_NAME = 'get_isbcgc_project_set';")
             # Only try to fetch the study set if the sproc exists
             if cursor.fetchall()[0][0] > 0:
-                cursor.execute("CALL get_isbcgc_study_set();")
-                ISB_CGC_STUDIES['list'] = []
+                cursor.execute("CALL get_isbcgc_project_set();")
+                ISB_CGC_PROJECTS['list'] = []
                 for row in cursor.fetchall():
-                    ISB_CGC_STUDIES['list'].append(row[0])
+                    ISB_CGC_PROJECTS['list'].append(row[0])
             else:
                 # Otherwise just warn
-                logger.warn("[WARNING] Stored procedure get_isbcgc_study_set was not found!")
+                logger.warn("[WARNING] Stored procedure get_isbcgc_project_set was not found!")
 
-        return ISB_CGC_STUDIES['list']
+        return ISB_CGC_PROJECTS['list']
     except Exception as e:
         logger.error(e)
         logger.error(traceback.format_exc())
@@ -152,10 +152,11 @@ class PlotDataCohortInfo(Message):
 
 class PlotDataPoint(Message):
     sample_id = StringField(1)
-    x = StringField(2)
-    y = StringField(3)
-    c = StringField(4)
-    cohort = IntegerField(5, repeated=True)
+    case_id = StringField(2)
+    x = StringField(3)
+    y = StringField(4)
+    c = StringField(5)
+    cohort = IntegerField(6, repeated=True)
 
 
 class PlotDataTypes(Message):
@@ -392,7 +393,7 @@ def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array, logTrans
                             "[WARNING] No valid log base was supplied - log transformation will not be applied!"
                         )
 
-        vms = VectorMergeSupport('NA', 'sample_id', ['x', 'y', 'c']) # changed so that it plots per sample not patient
+        vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c']) # changed so that it plots per sample not patient
         vms.add_dict_array(x_vec, 'x', 'value')
         vms.add_dict_array(y_vec, 'y', 'value')
         vms.add_dict_array(c_vec, 'c', 'value')
@@ -496,7 +497,7 @@ def data_access_for_plot(self, request):
                     logging.error("Invalid internal feature ID '{}'".format(feature_id))
                     raise NotFoundException()
 
-            # Get the study IDs these cohorts' samples come from
+            # Get the project IDs these cohorts' samples come from
             cohort_vals = ()
             cohort_params = ""
 
@@ -509,9 +510,9 @@ def data_access_for_plot(self, request):
             db = sql_connection()
             cursor = db.cursor()
 
-            tcga_studies = fetch_isbcgc_study_set()
+            tcga_studies = fetch_isbcgc_project_set()
 
-            cursor.execute("SELECT DISTINCT study_id FROM cohorts_samples WHERE cohort_id IN ("+cohort_params+");",cohort_vals)
+            cursor.execute("SELECT DISTINCT project_id FROM cohorts_samples WHERE cohort_id IN ("+cohort_params+");",cohort_vals)
 
             # Only samples whose source studies are TCGA studies, or extended from them, should be used
             confirmed_study_ids = []
@@ -525,11 +526,11 @@ def data_access_for_plot(self, request):
                     unconfirmed_study_ids.append(row[0])
 
             if len(unconfirmed_study_ids) > 0:
-                studies = Study.objects.filter(id__in=unconfirmed_study_ids)
+                projects = Project.objects.filter(id__in=unconfirmed_study_ids)
 
-                for study in studies:
-                    if study.get_my_root_and_depth()['root'] in tcga_studies:
-                        confirmed_study_ids.append(study.id)
+                for project in projects:
+                    if project.get_my_root_and_depth()['root'] in tcga_studies:
+                        confirmed_study_ids.append(project.id)
 
             return self.get_merged_feature_vectors(x_id, y_id, c_id, cohort_id_array, logTransform, confirmed_study_ids)
         except NotFoundException as nfe:
 
@@ -32,13 +32,14 @@
 from message_classes import MetadataRangesItem
 
 from api.api_helpers import sql_connection, WHITELIST_RE
-from cohorts.models import Cohort as Django_Cohort, Cohort_Perms, Patients, Samples, Filters
+from cohorts.models import Cohort as Django_Cohort, Cohort_Perms, Samples, Filters
 from bq_data_access.cohort_bigquery import BigQueryCohortSupport
 
 logger = logging.getLogger(__name__)
 
 BASE_URL = settings.BASE_URL
 
+
 class FilterDetails(messages.Message):
     name = messages.StringField(1)
     value = messages.StringField(2)
@@ -98,10 +99,9 @@ def create(self, request):
         try:
             db = sql_connection()
             cursor = db.cursor(MySQLdb.cursors.DictCursor)
-            cursor.execute(patient_query_str, value_tuple)
-            patient_barcodes = [row['ParticipantBarcode'] for row in cursor.fetchall()]
             cursor.execute(sample_query_str, value_tuple)
-            sample_barcodes = [row['SampleBarcode'] for row in cursor.fetchall()]
+            # TODO: We need to adjust this to pull the correct project ID as well
+            sample_barcodes = [{'sample_barcode': row['sample_barcode'], 'case_barcode': row['case_barcode'], 'project_id': None,} for row in cursor.fetchall()]
 
         except (IndexError, TypeError), e:
             logger.warn(e)
@@ -128,7 +128,7 @@ def create(self, request):
             raise endpoints.BadRequestException(
                 "Your cohort's name contains invalid characters (" + match.__str__() + "); please choose another name.")
 
-        if len(patient_barcodes) == 0 or len(sample_barcodes) == 0:
+        if len(sample_barcodes) == 0:
             raise endpoints.BadRequestException(
                 "The cohort could not be saved because no samples meet the specified parameters.")
 
@@ -138,22 +138,15 @@ def create(self, request):
                                                       last_date_saved=datetime.utcnow())
         created_cohort.save()
 
-        # 2. insert patients into cohort_patients
-        patient_barcodes = list(set(patient_barcodes))
-        patient_list = [Patients(cohort=created_cohort, patient_id=patient_code) for patient_code in
-                        patient_barcodes]
-        Patients.objects.bulk_create(patient_list)
-
-        # 3. insert samples into cohort_samples
-        sample_barcodes = list(set(sample_barcodes))
-        sample_list = [Samples(cohort=created_cohort, sample_id=sample_code) for sample_code in sample_barcodes]
+        # 2. insert samples into cohort_samples
+        sample_list = [Samples(cohort=created_cohort, sample_barcode=sample['sample_barcode'], case_barcode=sample['case_barcode'], project_id=sample['project_id']) for sample in sample_barcodes]
         Samples.objects.bulk_create(sample_list)
 
-        # 4. Set permission for user to be owner
+        # 3. Set permission for user to be owner
         perm = Cohort_Perms(cohort=created_cohort, user=django_user, perm=Cohort_Perms.OWNER)
         perm.save()
 
-        # 5. Create filters applied
+        # 4. Create filters applied
         filter_data = []
         for key, value_list in query_dict.items():
             for val in value_list:
@@ -164,18 +157,18 @@ def create(self, request):
             filter_data.append(FilterDetails(name=key, value=str(val)))
             Filters.objects.create(resulting_cohort=created_cohort, name=key, value=val).save()
 
-        # 6. Store cohort to BigQuery
+        # 5. Store cohort to BigQuery
         project_id = settings.BQ_PROJECT_ID
         cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
         bcs = BigQueryCohortSupport(project_id, cohort_settings.dataset_id, cohort_settings.table_id)
-        bcs.add_cohort_with_sample_barcodes(created_cohort.id, sample_barcodes)
+        bcs.add_cohort_to_bq(created_cohort.id, sample_barcodes)
 
         request_finished.send(self)
 
         return CreatedCohort(id=str(created_cohort.id),
                              name=cohort_name,
                              last_date_saved=str(datetime.utcnow()),
                              filters=filter_data,
-                             patient_count=len(patient_barcodes),
+                             patient_count=created_cohort.case_size(),
                              sample_count=len(sample_barcodes)
                              )
@@ -37,7 +37,6 @@
 class ReturnJSON(messages.Message):
     message = messages.StringField(1)
 
-
 @ISB_CGC_Endpoints.api_class(resource_name='cohorts')
 class CohortsDeleteAPI(remote.Service):
     DELETE_RESOURCE = endpoints.ResourceContainer(cohort_id=messages.IntegerField(1, required=True))
 
@@ -114,21 +114,21 @@ def get(self, request):
             parent_id_data = CohortsGetListMessageBuilder().make_parent_id_list_from_cursor(
                 cursor.fetchall(), row)
 
-            # get list of patients in this cohort
-            patient_query_str, patient_query_tuple = CohortsGetListQueryBuilder().build_patients_query(
-                {'cohort_id': str(row['id'])})
-            cursor.execute(patient_query_str, patient_query_tuple)
-            patient_list = [str(patient_row.get('patient_id'))
-                            for patient_row in cursor.fetchall()
-                            if patient_row.get('patient_id')]
-
-            # get list of samples in this cohort
+            # get list of samples and cases in this cohort
             sample_query_str, sample_query_tuple = CohortsGetListQueryBuilder().build_samples_query(
                 {'cohort_id': str(row['id'])})
             cursor.execute(sample_query_str, sample_query_tuple)
-            sample_list = [str(sample_row.get('sample_id'))
-                           for sample_row in cursor.fetchall()
-                           if sample_row.get('sample_id')]
+            sample_list = []
+            patient_list = []
+            for s_row in cursor.fetchall():
+                sample_list.append(s_row['sample_barcode'])
+                if s_row['case_barcode']:
+                    patient_list.append(s_row['case_barcode'])
+
+            if len(sample_list) == 0:
+                sample_list = ["None"]
+            if len(patient_list) == 0:
+                patient_list = ["None"]
 
             return CohortDetails(
                 id=str(row['id']),
@@ -143,8 +143,8 @@ def get(self, request):
                 filters=filter_data,
                 patient_count=len(patient_list),
                 sample_count=len(sample_list),
-                patients=patient_list if len(patient_list) > 0 else ["None"],
-                samples=sample_list if len(sample_list) > 0 else ["None"]
+                patients=patient_list,
+                samples=sample_list
             )
 
         except (IndexError, TypeError) as e:
 
@@ -89,7 +89,7 @@ def googlegenomics(self, request):
 
         query_str = 'SELECT SampleBarcode, GG_dataset_id, GG_readgroupset_id ' \
                     'FROM metadata_data ' \
-                    'JOIN cohorts_samples ON metadata_data.SampleBarcode=cohorts_samples.sample_id ' \
+                    'JOIN cohorts_samples ON metadata_data.SampleBarcode=cohorts_samples.sample_barcode ' \
                     'WHERE cohorts_samples.cohort_id=%s ' \
                     'AND GG_dataset_id !="" AND GG_readgroupset_id !="" ' \
                     'GROUP BY SampleBarcode, GG_dataset_id, GG_readgroupset_id;'
 
@@ -71,12 +71,12 @@ def preview(self, request):
             patient_cursor = db.cursor(MySQLdb.cursors.DictCursor)
             patient_cursor.execute(patient_query_str, value_tuple)
             for row in patient_cursor.fetchall():
-                patient_barcodes.append(row['ParticipantBarcode'])
+                patient_barcodes.append(row['case_barcode'])
 
             sample_cursor = db.cursor(MySQLdb.cursors.DictCursor)
             sample_cursor.execute(sample_query_str, value_tuple)
             for row in sample_cursor.fetchall():
-                sample_barcodes.append(row['SampleBarcode'])
+                sample_barcodes.append(row['sample_barcode'])
 
         except (IndexError, TypeError), e:
             logger.warn(e)
 
@@ -150,12 +150,12 @@ def build_parent_query(self, parent_query_dict):
 
     def build_patients_query(self, patient_query_dict):
         """
-        Builds the query that selects the patient count for a particular cohort
+        Builds the query that selects the case count for a particular cohort
         :param patient_query_dict: should be {'cohort_id': str(row['id])}
         :return: patient_query_str, patient_query_tuple
         """
-        patients_query_str = 'SELECT patient_id ' \
-                             'FROM cohorts_patients '
+        patients_query_str = 'SELECT case_barcode ' \
+                             'FROM cohorts_samples '
 
         patients_query_str += ' WHERE ' + '=%s AND '.join(key for key in patient_query_dict.keys()) + '=%s '
         patient_query_tuple = tuple(value for value in patient_query_dict.values())
@@ -168,7 +168,7 @@ def build_samples_query(self, sample_query_dict):
         :param sample_query_dict: should be {'cohort_id': str(row['id])}
         :return: sample_query_str, sample_query_tuple
         """
-        samples_query_str = 'SELECT sample_id ' \
+        samples_query_str = 'SELECT sample_barcode, case_barcode ' \
                             'FROM cohorts_samples '
 
         samples_query_str += ' WHERE ' + '=%s AND '.join(key for key in sample_query_dict.keys()) + '=%s '
@@ -213,12 +213,12 @@ def build_query(self, query_dict, gte_query_dict, lte_query_dict):
         Returns patient query string,  sample query string, value tuple.
         """
 
-        patient_query_str = 'SELECT DISTINCT(IF(ParticipantBarcode="", LEFT(SampleBarcode,12), ParticipantBarcode)) ' \
-                            'AS ParticipantBarcode ' \
+        patient_query_str = 'SELECT DISTINCT(IF(case_barcode="", LEFT(sample_barcode,12), case_barcode)) ' \
+                            'AS case_barcode ' \
                             'FROM metadata_samples ' \
                             'WHERE '
 
-        sample_query_str = 'SELECT SampleBarcode ' \
+        sample_query_str = 'SELECT sample_barcode, case_barcode ' \
                            'FROM metadata_samples ' \
                            'WHERE '
         value_tuple = ()
@@ -256,7 +256,7 @@ def build_query(self, query_dict, gte_query_dict, lte_query_dict):
             sample_query_str += ' {} <=%s '.format(key)
             value_tuple += (value,)
 
-        sample_query_str += ' GROUP BY SampleBarcode'
+        sample_query_str += ' GROUP BY sample_barcode'
 
         return patient_query_str, sample_query_str, value_tuple
 
@@ -307,9 +307,9 @@ def build_query(self, platform=None, pipeline=None, limit=None, cohort_id=None,
                     'FROM metadata_data '
 
         if cohort_id is None:
-            query_str += 'WHERE SampleBarcode=%s '
+            query_str += 'WHERE sample_barcode=%s '
         else:
-            query_str += 'JOIN cohorts_samples ON metadata_data.SampleBarcode=cohorts_samples.sample_id ' \
+            query_str += 'JOIN cohorts_samples ON metadata_data.sample_barcode=cohorts_samples.sample_barcode ' \
                          'WHERE cohorts_samples.cohort_id=%s '
 
         query_str += 'AND DataFileNameKey != "" AND DataFileNameKey is not null '