Skip to content

Commit 393a6eb

Browse files
authored
Merge pull request #418 from ImagingDataCommons/idc-prod-sp
Release
2 parents 85bcde6 + 610b920 commit 393a6eb

21 files changed

+623
-148
lines changed

scripts/etl.py

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
###
2+
# Copyright 2015-2020, Institute for Systems Biology
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
###
16+
17+
from __future__ import print_function
18+
19+
from builtins import str
20+
from builtins import object
21+
import datetime
22+
import logging
23+
import traceback
24+
import os
25+
import re
26+
from csv import reader as csv_reader
27+
import csv
28+
from argparse import ArgumentParser
29+
import sys
30+
import time
31+
from copy import deepcopy
32+
from itertools import combinations, product
33+
34+
from idc import secret_settings, settings
35+
36+
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "idc.settings")
37+
38+
import django
39+
django.setup()
40+
41+
from idc_collections.models import Program, Collection, Attribute, Attribute_Ranges, \
42+
Attribute_Display_Values, DataSource, DataSourceJoin, DataVersion, DataSetType, \
43+
Attribute_Set_Type, Attribute_Display_Category, ImagingDataCommonsVersion
44+
45+
from django.contrib.auth.models import User
46+
idc_superuser = User.objects.get(username="idc")
47+
48+
logger = logging.getLogger('main_logger')
49+
50+
BQ_PROJ_DATASET = 'idc-dev-etl.idc_tcia_views_mvp_wave0'
51+
52+
53+
def new_attribute(name, displ_name, type, display_default, cross_collex=False, units=None):
    """Return a template dict describing one attribute to be loaded.

    name/displ_name/type: identity and data type of the attribute.
    display_default: initial value for the UI display flag.
    cross_collex: whether the attribute spans collections.
    units: optional unit label (None when unit-less).

    The relationship lists start empty; callers append to them before
    passing the dict to add_attributes().
    """
    attr = {
        'name': name,
        'display_name': displ_name,
        'type': type,
        'units': units,
        'cross_collex': cross_collex,
        'display': display_default,
    }
    for list_key in ('solr_collex', 'bq_tables', 'set_types', 'categories'):
        attr[list_key] = []
    return attr
66+
67+
def add_data_sets(sets_set):
    """Create or update a DataSetType row for each entry in sets_set.

    Each entry is a dict with 'name', 'data_type' and 'set_type' keys.
    Failures are logged per-entry and do not stop the remaining entries.
    """
    for entry in sets_set:
        try:
            data_set_type, _ = DataSetType.objects.update_or_create(
                name=entry['name'], data_type=entry['data_type'], set_type=entry['set_type']
            )
            print("Data Set Type created:")
            print(data_set_type)
        except Exception as e:
            logger.error("[ERROR] Data Version {} may not have been added!".format(entry['name']))
            logger.exception(e)
77+
78+
def add_data_versions(dv_set):
    """Register DataVersion rows and link them to programs and the IDC release.

    dv_set entries are dicts with 'name', 'ver' and 'progs' (program names).
    Every version is also linked to the single "Imaging Data Commons Data
    Release" 1.0 ImagingDataCommonsVersion row, created here if missing.
    """
    idc_release, _ = ImagingDataCommonsVersion.objects.update_or_create(
        name="Imaging Data Commons Data Release", version_number="1.0")
    ver_to_idc = []
    try:
        for dv in dv_set:
            data_version, _ = DataVersion.objects.update_or_create(name=dv['name'], version=dv['ver'])

            # Link this version to each of its programs via the through table.
            ver_to_prog = [
                DataVersion.programs.through(dataversion_id=data_version.id, program_id=prog.id)
                for prog in Program.objects.filter(name__in=dv['progs'])
            ]
            DataVersion.programs.through.objects.bulk_create(ver_to_prog)

            ver_to_idc.append(DataVersion.idc_versions.through(
                dataversion_id=data_version.id, imagingdatacommonsversion_id=idc_release.id))

        # All IDC-release links accumulated across the loop go in one batch.
        DataVersion.idc_versions.through.objects.bulk_create(ver_to_idc)

        logger.info("[STATUS] Data Versions loaded:")
        logger.info("{}".format(DataVersion.objects.all()))
    except Exception as e:
        logger.error("[ERROR] Data Versions may not have been added!")
        logger.exception(e)
100+
101+
def add_programs(program_set):
    """Create/update a Program row per entry; return {short_name: Program}.

    Entries are dicts with 'short_name', 'full_name', 'public' and an
    optional 'owner' email (defaults to the idc superuser). Failed entries
    are logged and simply omitted from the result dict.
    """
    results = {}
    for prog in program_set:
        try:
            owner = User.objects.get(email=prog['owner']) if 'owner' in prog else idc_superuser
            program, _ = Program.objects.update_or_create(
                short_name=prog['short_name'], name=prog['full_name'],
                is_public=prog['public'], owner=owner)

            print("Program created:")
            print(program)

            results[program.short_name] = program
        except Exception as e:
            logger.error("[ERROR] Program {} may not have been added!".format(prog['short_name']))
            logger.exception(e)
    return results
118+
119+
def add_data_source(source_set, versions, programs, data_sets, source_type):
    """Register each named source as a DataSource and wire up its links.

    source_set: iterable of fully-qualified source names (e.g. BQ table ids).
    versions: DataVersion names, programs: Program short names, data_sets:
    DataSetType names to attach; source_type: DataSource type constant.
    Each source is processed independently; failures are logged and skipped.
    """
    for source in source_set:
        try:
            data_source, _ = DataSource.objects.update_or_create(
                name=source,
                # TCGA sources count by case barcode; everything else by patient.
                count_col="case_barcode" if "tcga" in source else "PatientID",
                source_type=source_type
            )

            DataSource.programs.through.objects.bulk_create([
                DataSource.programs.through(datasource_id=data_source.id, program_id=prog.id)
                for prog in Program.objects.filter(short_name__in=programs)
            ])

            DataSource.versions.through.objects.bulk_create([
                DataSource.versions.through(dataversion_id=dv.id, datasource_id=data_source.id)
                for dv in DataVersion.objects.filter(name__in=versions)
            ])

            DataSource.data_sets.through.objects.bulk_create([
                DataSource.data_sets.through(datasource_id=data_source.id, datasettype_id=dst.id)
                for dst in DataSetType.objects.filter(name__in=data_sets)
            ])

            print("DataSource entry created: {}".format(source))
        except Exception as e:
            logger.error("[ERROR] DataSource {} may not have been added!".format(source))
            logger.exception(e)
150+
151+
def add_source_joins(froms, from_col, tos=None, to_col=None):
    """Record joinability between DataSources as DataSourceJoin rows.

    With only froms/from_col: every pair of sources within froms is joined
    to each other on from_col. With tos/to_col supplied: every (from, to)
    combination is joined from_col -> to_col.
    """
    if not tos and not to_col:
        # Self-join set: all pairings within froms, on the same column.
        name_pairs = combinations(froms, 2)
        dest_col = from_col
    else:
        name_pairs = product(froms, tos)
        dest_col = to_col

    src_joins = [
        DataSourceJoin(
            from_src=from_join,
            to_src=to_join,
            from_src_col=from_col,
            to_src_col=dest_col)
        for left_name, right_name in name_pairs
        for from_join in DataSource.objects.filter(name=left_name)
        for to_join in DataSource.objects.filter(name=right_name)
    ]

    if len(src_joins):
        DataSourceJoin.objects.bulk_create(src_joins)
179+
180+
def add_collections(collection_set):
    """Bulk-insert Collection rows, then attach their data versions.

    Each entry is a dict with a 'data' sub-dict of Collection field values,
    an optional 'owner' email (defaults to the idc superuser), and an
    optional 'data_versions' list of DataVersion names.
    """
    collex_list = []
    try:
        for collex in collection_set:
            owner = User.objects.get(email=collex['owner']) if 'owner' in collex else idc_superuser
            collex_list.append(Collection(**collex['data'], owner=owner))

        Collection.objects.bulk_create(collex_list)

        # bulk_create doesn't handle M2M links, so re-fetch each saved row
        # and attach its DataVersions through the intermediate table.
        for collex in collection_set:
            saved = Collection.objects.get(collection_id=collex['data']['collection_id'])
            version_names = collex.get('data_versions', [])
            if len(version_names):
                Collection.data_versions.through.objects.bulk_create([
                    Collection.data_versions.through(collection_id=saved.id, dataversion_id=dv.id)
                    for dv in DataVersion.objects.filter(name__in=version_names)
                ])

    except Exception as e:
        logger.error("[ERROR] Collection {} may not have been added!".format(collex['data']['collection_id']))
        logger.exception(e)
207+
208+
209+
def add_attributes(attr_set):
    """Create/update Attribute rows plus all of their related records.

    For each attribute dict (see new_attribute()): upserts the Attribute,
    then its ranges, display values, data-source links (solr collections
    and BQ tables), set-type memberships, and display categories. Each
    attribute is processed independently; failures are logged and skipped.
    """
    for attr in attr_set:
        try:
            obj, created = Attribute.objects.update_or_create(
                name=attr['name'], display_name=attr['display_name'], data_type=attr['type'],
                preformatted_values=bool(attr.get('preformatted_values')),
                # Fix: dicts built by new_attribute() ALWAYS contain the
                # 'cross_collex' key (often with value False), so the old
                # presence test ('cross_collex' in attr) marked every
                # attribute as cross-collection; test the value instead.
                is_cross_collex=bool(attr.get('cross_collex')),
                default_ui_display=attr['display'],
                units=attr.get('units', None)
            )
            if 'range' in attr:
                if len(attr['range']):
                    for attr_range in attr['range']:
                        Attribute_Ranges.objects.update_or_create(
                            **attr_range, attribute=obj
                        )
                else:
                    # An empty 'range' list means "use the model defaults".
                    Attribute_Ranges.objects.update_or_create(
                        attribute=obj
                    )
            if len(attr.get('display_vals', [])):
                for dv in attr['display_vals']:
                    Attribute_Display_Values.objects.update_or_create(
                        raw_value=dv['raw_value'], display_value=dv['display_value'], attribute=obj
                    )
            if len(attr.get('solr_collex', [])):
                for sc in DataSource.objects.filter(name__in=attr['solr_collex']):
                    obj.data_sources.add(sc)
            if len(attr.get('bq_tables', [])):
                for bqt in DataSource.objects.filter(name__in=attr['bq_tables']):
                    obj.data_sources.add(bqt)
            if len(attr.get('set_types', [])):
                for set_type in attr.get('set_types'):
                    Attribute_Set_Type.objects.update_or_create(
                        datasettype=DataSetType.objects.get(data_type=set_type['set']), attribute=obj,
                        child_record_search=set_type['child_record_search']
                    )
            if len(attr.get('categories', [])):
                for cat in attr['categories']:
                    Attribute_Display_Category.objects.update_or_create(
                        category=cat['name'], category_display_name=cat['display_name'], attribute=obj
                    )

        except Exception as e:
            logger.error("[ERROR] Attribute {} may not have been added!".format(attr['name']))
            logger.exception(e)
254+
255+
def move_attrs(from_data_sources, to_data_sources):
    """Copy attribute links from one set of DataSources onto another.

    Any attribute attached to a source in from_data_sources that is not
    already attached to the to_data_sources set gets linked to every
    destination source. Existing links on the source side are not removed.
    """
    to_sources = DataSource.objects.filter(name__in=to_data_sources)
    # Attribute ids the destinations already have; these are skipped.
    existing_attr_ids = to_sources.get_source_attrs()['ids']

    bulk_add = []
    for from_source in DataSource.objects.filter(name__in=from_data_sources):
        movable = from_source.attribute_set.exclude(id__in=existing_attr_ids)
        print("Moving attributes from {}: {}".format(
            from_source.name, "; ".join(movable.values_list('name', flat=True))))

        for attr in movable:
            bulk_add.extend(
                Attribute.data_sources.through(attribute_id=attr.id, datasource_id=dest.id)
                for dest in to_sources
            )

    Attribute.data_sources.through.objects.bulk_create(bulk_add)
270+
271+
def update_data_sources(to_data_sources, set_types=None, versions=None, progs=None):
    """Attach extra versions, set types and/or programs to existing sources.

    Only the relationship kinds whose argument is a non-empty list are
    touched; links are appended via the through tables, nothing is removed.
    """
    for ds in DataSource.objects.filter(name__in=to_data_sources):
        if versions and len(versions):
            DataSource.versions.through.objects.bulk_create([
                DataSource.versions.through(dataversion_id=dv.id, datasource_id=ds.id)
                for dv in DataVersion.objects.filter(name__in=versions)
            ])

        if set_types and len(set_types):
            DataSource.data_sets.through.objects.bulk_create([
                DataSource.data_sets.through(datasource_id=ds.id, datasettype_id=dst.id)
                for dst in DataSetType.objects.filter(name__in=set_types)
            ])

        if progs and len(progs):
            DataSource.programs.through.objects.bulk_create([
                DataSource.programs.through(datasource_id=ds.id, program_id=prog.id)
                for prog in Program.objects.filter(short_name__in=progs)
            ])
294+
295+
296+
def disable_data_sources(sources):
    """Detach all relationships from the named DataSources.

    Clears versions, data-set types, attributes and programs, effectively
    retiring the sources while leaving the DataSource rows themselves.
    """
    for source in DataSource.objects.filter(name__in=sources):
        for relation in (source.versions, source.data_sets, source.attribute_set, source.programs):
            relation.clear()
303+
304+
def main():
    """One-off ETL step: fold the wave0 derived-data BQ views into the
    consolidated dicom_pivot view, then retire the per-view sources."""
    derived_views = [
        "idc-dev-etl.idc_tcia_views_mvp_wave0.segmentations",
        "idc-dev-etl.idc_tcia_views_mvp_wave0.qualitative_measurements",
        "idc-dev-etl.idc_tcia_views_mvp_wave0.quantitative_measurements"
    ]
    pivot = ["idc-dev.metadata.dicom_pivot_wave0"]

    try:
        # Re-point every attribute of the derived views at the pivot source...
        move_attrs(derived_views, pivot)

        # ...ensure the pivot source carries the right set types, versions
        # and programs...
        update_data_sources(pivot, ['Derived Data'], ['TCIA Derived Data'], ["TCGA", "QIN", "ISPY", "LIDC"])

        # ...and strip all relationships from the now-superseded views.
        disable_data_sources(derived_views)

    except Exception as e:
        logging.exception(e)


if __name__ == "__main__":
    main()

shell/database-setup.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ else
7878
mysql -u$MYSQL_ROOT_USER -h $MYSQL_DB_HOST -p$MYSQL_ROOT_PASSWORD -D$DATABASE_NAME < ${HOMEROOT}/scripts/${METADATA_SQL_FILE}
7979
fi
8080
81-
8281
echo "Adding Django site IDs..."
8382
python3 ${HOMEROOT}/scripts/add_site_ids.py
8483

shell/run_tests.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ python3 ./manage_unit_test.py test solr_helpers google_helpers.bigquery --noinpu
1616

1717
echo "Running module unit tests with unittest..."
1818

19-
#python3 -m unittest solr_helpers
19+
#python3 -m unittest solr_helpers

shell/vagrant-set-env.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22
echo 'export PYTHONPATH=/home/vagrant/www:/home/vagrant/www/lib:/home/vagrant/www/IDC-Common' | tee -a /home/vagrant/.bash_profile
3-
echo 'export SECURE_LOCAL_PATH=../secure_files/idc/' | tee -a /home/vagrant/.bash_profile
3+
echo 'export SECURE_LOCAL_PATH=../parentDir/secure_files/idc/' | tee -a /home/vagrant/.bash_profile
44
echo 'export DJANGO_SETTINGS_MODULE=idc.settings' | tee -a /home/vagrant/.bash_profile
55
source /home/vagrant/.bash_profile
66
chmod +x /home/vagrant/www/shell/python-su.sh

static/css/search.css

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ tr {
303303
word-wrap: break-word; }
304304

305305

306-
#studies_table_head th studies_table td {
306+
#studies_table_head th, #studies_table td {
307307
word-break: break-word; }
308308
#studies_table_head th.project-name, th.case-id, th.study-id,
309309
#studies_table td.project-name, td.case-id, td.study-id {
@@ -343,8 +343,8 @@ tr {
343343
#series_table td.series-description {
344344
width:32%;
345345
}
346-
#series_table_head th.text_data,
347-
#series_table td.text_data {
346+
#series_table_head th.open-viewer,
347+
#series_table td.open-viewer {
348348
width:13%;
349349
}
350350

0 commit comments

Comments
 (0)