Skip to content

Commit cfc5709

Browse files
authored
Merge pull request #134 from isb-cgc/prod-sp
Data Migration
2 parents bf714be + 29f7048 commit cfc5709

File tree

104 files changed

+9550
-591
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

104 files changed

+9550
-591
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,16 @@
22
__pycache__/
33
*.py[~cod]
44

5+
# eclipse files
6+
.project
7+
.pydevproject
8+
59
# C extensions
610
*.so
711

812
# Distribution / packaging
913
.Python
14+
doc/
1015
env/
1116
venv/
1217
build/
@@ -19,6 +24,7 @@ lib64/
1924
parts/
2025
sdist/
2126
var/
27+
*_diff.txt
2228
*.egg-info/
2329
.installed.cfg
2430
*.egg

api/Cohort.py

Lines changed: 62 additions & 104 deletions
Large diffs are not rendered by default.

api/data_access.py

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from api.pairwise_api import PairwiseResults, PairwiseResultVector, PairwiseFilterMessage
3838
from api.api_helpers import sql_connection
3939

40-
from projects.models import Study
40+
from projects.models import Project
4141

4242
import sys
4343

@@ -47,33 +47,33 @@
4747
'BMI': 'kg/m^2',
4848
}
4949

50-
ISB_CGC_STUDIES = {
50+
ISB_CGC_PROJECTS = {
5151
'list': []
5252
}
5353

5454
# DUPLICATE METHOD
5555
# Due to the way sql connections are done, it's easiest to duplicate this method and the static variable
5656
# it creates. The original is in Cohorts/views, and all changes will happen there first.
5757
#
58-
# Generate the ISB_CGC_STUDIES['list'] value set based on the get_isbcgc_study_set sproc
59-
def fetch_isbcgc_study_set():
58+
# Generate the ISB_CGC_PROJECTS['list'] value set based on the get_isbcgc_project_set sproc
59+
def fetch_isbcgc_project_set():
6060
try:
6161
cursor = None
6262
db = sql_connection()
63-
if not ISB_CGC_STUDIES['list'] or len(ISB_CGC_STUDIES['list']) <= 0:
63+
if not ISB_CGC_PROJECTS['list'] or len(ISB_CGC_PROJECTS['list']) <= 0:
6464
cursor = db.cursor()
65-
cursor.execute("SELECT COUNT(SPECIFIC_NAME) FROM INFORMATION_SCHEMA.ROUTINES WHERE SPECIFIC_NAME = 'get_isbcgc_study_set';")
65+
cursor.execute("SELECT COUNT(SPECIFIC_NAME) FROM INFORMATION_SCHEMA.ROUTINES WHERE SPECIFIC_NAME = 'get_isbcgc_project_set';")
6666
# Only try to fetch the project set if the stored procedure exists
6767
if cursor.fetchall()[0][0] > 0:
68-
cursor.execute("CALL get_isbcgc_study_set();")
69-
ISB_CGC_STUDIES['list'] = []
68+
cursor.execute("CALL get_isbcgc_project_set();")
69+
ISB_CGC_PROJECTS['list'] = []
7070
for row in cursor.fetchall():
71-
ISB_CGC_STUDIES['list'].append(row[0])
71+
ISB_CGC_PROJECTS['list'].append(row[0])
7272
else:
7373
# Otherwise just warn
74-
logger.warn("[WARNING] Stored procedure get_isbcgc_study_set was not found!")
74+
logger.warn("[WARNING] Stored procedure get_isbcgc_project_set was not found!")
7575

76-
return ISB_CGC_STUDIES['list']
76+
return ISB_CGC_PROJECTS['list']
7777
except Exception as e:
7878
logger.error(e)
7979
logger.error(traceback.format_exc())
@@ -152,10 +152,11 @@ class PlotDataCohortInfo(Message):
152152

153153
class PlotDataPoint(Message):
154154
sample_id = StringField(1)
155-
x = StringField(2)
156-
y = StringField(3)
157-
c = StringField(4)
158-
cohort = IntegerField(5, repeated=True)
155+
case_id = StringField(2)
156+
x = StringField(3)
157+
y = StringField(4)
158+
c = StringField(5)
159+
cohort = IntegerField(6, repeated=True)
159160

160161

161162
class PlotDataTypes(Message):
@@ -392,7 +393,7 @@ def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array, logTrans
392393
"[WARNING] No valid log base was supplied - log transformation will not be applied!"
393394
)
394395

395-
vms = VectorMergeSupport('NA', 'sample_id', ['x', 'y', 'c']) # changed so that it plots per sample not patient
396+
vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c']) # changed so that it plots per sample not patient
396397
vms.add_dict_array(x_vec, 'x', 'value')
397398
vms.add_dict_array(y_vec, 'y', 'value')
398399
vms.add_dict_array(c_vec, 'c', 'value')
@@ -496,7 +497,7 @@ def data_access_for_plot(self, request):
496497
logging.error("Invalid internal feature ID '{}'".format(feature_id))
497498
raise NotFoundException()
498499

499-
# Get the study IDs these cohorts' samples come from
500+
# Get the project IDs these cohorts' samples come from
500501
cohort_vals = ()
501502
cohort_params = ""
502503

@@ -509,9 +510,9 @@ def data_access_for_plot(self, request):
509510
db = sql_connection()
510511
cursor = db.cursor()
511512

512-
tcga_studies = fetch_isbcgc_study_set()
513+
tcga_studies = fetch_isbcgc_project_set()
513514

514-
cursor.execute("SELECT DISTINCT study_id FROM cohorts_samples WHERE cohort_id IN ("+cohort_params+");",cohort_vals)
515+
cursor.execute("SELECT DISTINCT project_id FROM cohorts_samples WHERE cohort_id IN ("+cohort_params+");",cohort_vals)
515516

516517
# Only samples whose source studies are TCGA studies, or extended from them, should be used
517518
confirmed_study_ids = []
@@ -525,11 +526,11 @@ def data_access_for_plot(self, request):
525526
unconfirmed_study_ids.append(row[0])
526527

527528
if len(unconfirmed_study_ids) > 0:
528-
studies = Study.objects.filter(id__in=unconfirmed_study_ids)
529+
projects = Project.objects.filter(id__in=unconfirmed_study_ids)
529530

530-
for study in studies:
531-
if study.get_my_root_and_depth()['root'] in tcga_studies:
532-
confirmed_study_ids.append(study.id)
531+
for project in projects:
532+
if project.get_my_root_and_depth()['root'] in tcga_studies:
533+
confirmed_study_ids.append(project.id)
533534

534535
return self.get_merged_feature_vectors(x_id, y_id, c_id, cohort_id_array, logTransform, confirmed_study_ids)
535536
except NotFoundException as nfe:

api/isb_cgc_api/cohorts_create.py

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@
3232
from message_classes import MetadataRangesItem
3333

3434
from api.api_helpers import sql_connection, WHITELIST_RE
35-
from cohorts.models import Cohort as Django_Cohort, Cohort_Perms, Patients, Samples, Filters
35+
from cohorts.models import Cohort as Django_Cohort, Cohort_Perms, Samples, Filters
3636
from bq_data_access.cohort_bigquery import BigQueryCohortSupport
3737

3838
logger = logging.getLogger(__name__)
3939

4040
BASE_URL = settings.BASE_URL
4141

42+
4243
class FilterDetails(messages.Message):
4344
name = messages.StringField(1)
4445
value = messages.StringField(2)
@@ -98,10 +99,9 @@ def create(self, request):
9899
try:
99100
db = sql_connection()
100101
cursor = db.cursor(MySQLdb.cursors.DictCursor)
101-
cursor.execute(patient_query_str, value_tuple)
102-
patient_barcodes = [row['ParticipantBarcode'] for row in cursor.fetchall()]
103102
cursor.execute(sample_query_str, value_tuple)
104-
sample_barcodes = [row['SampleBarcode'] for row in cursor.fetchall()]
103+
# TODO: We need to adjust this to pull the correct project ID as well
104+
sample_barcodes = [{'sample_barcode': row['sample_barcode'], 'case_barcode': row['case_barcode'], 'project_id': None,} for row in cursor.fetchall()]
105105

106106
except (IndexError, TypeError), e:
107107
logger.warn(e)
@@ -128,7 +128,7 @@ def create(self, request):
128128
raise endpoints.BadRequestException(
129129
"Your cohort's name contains invalid characters (" + match.__str__() + "); please choose another name.")
130130

131-
if len(patient_barcodes) == 0 or len(sample_barcodes) == 0:
131+
if len(sample_barcodes) == 0:
132132
raise endpoints.BadRequestException(
133133
"The cohort could not be saved because no samples meet the specified parameters.")
134134

@@ -138,22 +138,15 @@ def create(self, request):
138138
last_date_saved=datetime.utcnow())
139139
created_cohort.save()
140140

141-
# 2. insert patients into cohort_patients
142-
patient_barcodes = list(set(patient_barcodes))
143-
patient_list = [Patients(cohort=created_cohort, patient_id=patient_code) for patient_code in
144-
patient_barcodes]
145-
Patients.objects.bulk_create(patient_list)
146-
147-
# 3. insert samples into cohort_samples
148-
sample_barcodes = list(set(sample_barcodes))
149-
sample_list = [Samples(cohort=created_cohort, sample_id=sample_code) for sample_code in sample_barcodes]
141+
# 2. insert samples into cohort_samples
142+
sample_list = [Samples(cohort=created_cohort, sample_barcode=sample['sample_barcode'], case_barcode=sample['case_barcode'], project_id=sample['project_id']) for sample in sample_barcodes]
150143
Samples.objects.bulk_create(sample_list)
151144

152-
# 4. Set permission for user to be owner
145+
# 3. Set permission for user to be owner
153146
perm = Cohort_Perms(cohort=created_cohort, user=django_user, perm=Cohort_Perms.OWNER)
154147
perm.save()
155148

156-
# 5. Create filters applied
149+
# 4. Create filters applied
157150
filter_data = []
158151
for key, value_list in query_dict.items():
159152
for val in value_list:
@@ -164,18 +157,18 @@ def create(self, request):
164157
filter_data.append(FilterDetails(name=key, value=str(val)))
165158
Filters.objects.create(resulting_cohort=created_cohort, name=key, value=val).save()
166159

167-
# 6. Store cohort to BigQuery
160+
# 5. Store cohort to BigQuery
168161
project_id = settings.BQ_PROJECT_ID
169162
cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
170163
bcs = BigQueryCohortSupport(project_id, cohort_settings.dataset_id, cohort_settings.table_id)
171-
bcs.add_cohort_with_sample_barcodes(created_cohort.id, sample_barcodes)
164+
bcs.add_cohort_to_bq(created_cohort.id, sample_barcodes)
172165

173166
request_finished.send(self)
174167

175168
return CreatedCohort(id=str(created_cohort.id),
176169
name=cohort_name,
177170
last_date_saved=str(datetime.utcnow()),
178171
filters=filter_data,
179-
patient_count=len(patient_barcodes),
172+
patient_count=created_cohort.case_size(),
180173
sample_count=len(sample_barcodes)
181174
)

api/isb_cgc_api/cohorts_delete.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
class ReturnJSON(messages.Message):
3838
message = messages.StringField(1)
3939

40-
4140
@ISB_CGC_Endpoints.api_class(resource_name='cohorts')
4241
class CohortsDeleteAPI(remote.Service):
4342
DELETE_RESOURCE = endpoints.ResourceContainer(cohort_id=messages.IntegerField(1, required=True))

api/isb_cgc_api/cohorts_get.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -114,21 +114,21 @@ def get(self, request):
114114
parent_id_data = CohortsGetListMessageBuilder().make_parent_id_list_from_cursor(
115115
cursor.fetchall(), row)
116116

117-
# get list of patients in this cohort
118-
patient_query_str, patient_query_tuple = CohortsGetListQueryBuilder().build_patients_query(
119-
{'cohort_id': str(row['id'])})
120-
cursor.execute(patient_query_str, patient_query_tuple)
121-
patient_list = [str(patient_row.get('patient_id'))
122-
for patient_row in cursor.fetchall()
123-
if patient_row.get('patient_id')]
124-
125-
# get list of samples in this cohort
117+
# get list of samples and cases in this cohort
126118
sample_query_str, sample_query_tuple = CohortsGetListQueryBuilder().build_samples_query(
127119
{'cohort_id': str(row['id'])})
128120
cursor.execute(sample_query_str, sample_query_tuple)
129-
sample_list = [str(sample_row.get('sample_id'))
130-
for sample_row in cursor.fetchall()
131-
if sample_row.get('sample_id')]
121+
sample_list = []
122+
patient_list = []
123+
for s_row in cursor.fetchall():
124+
sample_list.append(s_row['sample_barcode'])
125+
if s_row['case_barcode']:
126+
patient_list.append(s_row['case_barcode'])
127+
128+
if len(sample_list) == 0:
129+
sample_list = ["None"]
130+
if len(patient_list) == 0:
131+
patient_list = ["None"]
132132

133133
return CohortDetails(
134134
id=str(row['id']),
@@ -143,8 +143,8 @@ def get(self, request):
143143
filters=filter_data,
144144
patient_count=len(patient_list),
145145
sample_count=len(sample_list),
146-
patients=patient_list if len(patient_list) > 0 else ["None"],
147-
samples=sample_list if len(sample_list) > 0 else ["None"]
146+
patients=patient_list,
147+
samples=sample_list
148148
)
149149

150150
except (IndexError, TypeError) as e:

api/isb_cgc_api/cohorts_googlegenomics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def googlegenomics(self, request):
8989

9090
query_str = 'SELECT SampleBarcode, GG_dataset_id, GG_readgroupset_id ' \
9191
'FROM metadata_data ' \
92-
'JOIN cohorts_samples ON metadata_data.SampleBarcode=cohorts_samples.sample_id ' \
92+
'JOIN cohorts_samples ON metadata_data.SampleBarcode=cohorts_samples.sample_barcode ' \
9393
'WHERE cohorts_samples.cohort_id=%s ' \
9494
'AND GG_dataset_id !="" AND GG_readgroupset_id !="" ' \
9595
'GROUP BY SampleBarcode, GG_dataset_id, GG_readgroupset_id;'

api/isb_cgc_api/cohorts_preview.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,12 @@ def preview(self, request):
7171
patient_cursor = db.cursor(MySQLdb.cursors.DictCursor)
7272
patient_cursor.execute(patient_query_str, value_tuple)
7373
for row in patient_cursor.fetchall():
74-
patient_barcodes.append(row['ParticipantBarcode'])
74+
patient_barcodes.append(row['case_barcode'])
7575

7676
sample_cursor = db.cursor(MySQLdb.cursors.DictCursor)
7777
sample_cursor.execute(sample_query_str, value_tuple)
7878
for row in sample_cursor.fetchall():
79-
sample_barcodes.append(row['SampleBarcode'])
79+
sample_barcodes.append(row['sample_barcode'])
8080

8181
except (IndexError, TypeError), e:
8282
logger.warn(e)

api/isb_cgc_api/isb_cgc_api_helpers.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -150,12 +150,12 @@ def build_parent_query(self, parent_query_dict):
150150

151151
def build_patients_query(self, patient_query_dict):
152152
"""
153-
Builds the query that selects the patient count for a particular cohort
153+
Builds the query that selects the case barcodes for a particular cohort
154154
:param patient_query_dict: should be {'cohort_id': str(row['id'])}
155155
:return: patient_query_str, patient_query_tuple
156156
"""
157-
patients_query_str = 'SELECT patient_id ' \
158-
'FROM cohorts_patients '
157+
patients_query_str = 'SELECT case_barcode ' \
158+
'FROM cohorts_samples '
159159

160160
patients_query_str += ' WHERE ' + '=%s AND '.join(key for key in patient_query_dict.keys()) + '=%s '
161161
patient_query_tuple = tuple(value for value in patient_query_dict.values())
@@ -168,7 +168,7 @@ def build_samples_query(self, sample_query_dict):
168168
:param sample_query_dict: should be {'cohort_id': str(row['id'])}
169169
:return: sample_query_str, sample_query_tuple
170170
"""
171-
samples_query_str = 'SELECT sample_id ' \
171+
samples_query_str = 'SELECT sample_barcode, case_barcode ' \
172172
'FROM cohorts_samples '
173173

174174
samples_query_str += ' WHERE ' + '=%s AND '.join(key for key in sample_query_dict.keys()) + '=%s '
@@ -213,12 +213,12 @@ def build_query(self, query_dict, gte_query_dict, lte_query_dict):
213213
Returns patient query string, sample query string, value tuple.
214214
"""
215215

216-
patient_query_str = 'SELECT DISTINCT(IF(ParticipantBarcode="", LEFT(SampleBarcode,12), ParticipantBarcode)) ' \
217-
'AS ParticipantBarcode ' \
216+
patient_query_str = 'SELECT DISTINCT(IF(case_barcode="", LEFT(sample_barcode,12), case_barcode)) ' \
217+
'AS case_barcode ' \
218218
'FROM metadata_samples ' \
219219
'WHERE '
220220

221-
sample_query_str = 'SELECT SampleBarcode ' \
221+
sample_query_str = 'SELECT sample_barcode, case_barcode ' \
222222
'FROM metadata_samples ' \
223223
'WHERE '
224224
value_tuple = ()
@@ -256,7 +256,7 @@ def build_query(self, query_dict, gte_query_dict, lte_query_dict):
256256
sample_query_str += ' {} <=%s '.format(key)
257257
value_tuple += (value,)
258258

259-
sample_query_str += ' GROUP BY SampleBarcode'
259+
sample_query_str += ' GROUP BY sample_barcode'
260260

261261
return patient_query_str, sample_query_str, value_tuple
262262

@@ -307,9 +307,9 @@ def build_query(self, platform=None, pipeline=None, limit=None, cohort_id=None,
307307
'FROM metadata_data '
308308

309309
if cohort_id is None:
310-
query_str += 'WHERE SampleBarcode=%s '
310+
query_str += 'WHERE sample_barcode=%s '
311311
else:
312-
query_str += 'JOIN cohorts_samples ON metadata_data.SampleBarcode=cohorts_samples.sample_id ' \
312+
query_str += 'JOIN cohorts_samples ON metadata_data.sample_barcode=cohorts_samples.sample_barcode ' \
313313
'WHERE cohorts_samples.cohort_id=%s '
314314

315315
query_str += 'AND DataFileNameKey != "" AND DataFileNameKey is not null '

0 commit comments

Comments
 (0)