Skip to content

Commit 02589bc

Browse files
committed
Merge branch 'master' of https://github.com/ImagingDataCommons/IDC-Common into idc-test
2 parents 3baa535 + d742b71 commit 02589bc

File tree

2 files changed

+94
-70
lines changed

2 files changed

+94
-70
lines changed

idc_collections/collex_metadata_utils.py

Lines changed: 81 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
#
21
# Copyright 2015-2020, Institute for Systems Biology
32
#
43
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -647,7 +646,10 @@ def parse_partition_to_filter(cart_partition):
647646

648647

649648
# Manifest types supported: s5cmd, idc_index, json.
650-
def submit_manifest_job(data_version, filters, storage_loc, manifest_type, instructions, fields, cart_partition=None, filename=None):
649+
def submit_manifest_job(
650+
data_version, filters, storage_loc, manifest_type, instructions, fields, from_cart=False,
651+
cart_partition=None, filtergrp_list=None, filename=None
652+
):
651653
cart_filters = parse_partition_to_filter(cart_partition) if cart_partition else None
652654
child_records = None if cart_filters else "StudyInstanceUID"
653655
service_account_info = json.load(open(settings.GOOGLE_APPLICATION_CREDENTIALS))
@@ -673,11 +675,14 @@ def submit_manifest_job(data_version, filters, storage_loc, manifest_type, instr
673675

674676
filters = filters or {}
675677

676-
bq_query_and_params = get_bq_metadata(
677-
filters, ["crdc_series_uuid", storage_loc], data_version, fields, ["crdc_series_uuid", storage_loc],
678-
no_submit=True, search_child_records_by=child_records,
679-
reformatted_fields=reformatted_fields, cart_filters=cart_filters
680-
)
678+
if from_cart:
679+
bq_query_and_params = create_cart_sql(cart_partition, filtergrp_list, storage_loc, lvl="series")
680+
else:
681+
bq_query_and_params = get_bq_metadata(
682+
filters, ["crdc_series_uuid", storage_loc], data_version, fields, ["crdc_series_uuid", storage_loc],
683+
no_submit=True, search_child_records_by=child_records,
684+
reformatted_fields=reformatted_fields, cart_filters=cart_filters
685+
)
681686

682687
manifest_job = {
683688
"query": bq_query_and_params['sql_string'],
@@ -705,6 +710,7 @@ def create_file_manifest(request, cohort=None):
705710
req = request.GET or request.POST
706711
manifest = None
707712
partitions = None
713+
filtergrp_list = None
708714
S5CMD_BASE = "cp s3://{}/{}/* .{}"
709715
file_type = req.get('file_type', 's5cmd').lower()
710716
loc = req.get('loc_type_{}'.format(file_type), 'aws')
@@ -768,8 +774,6 @@ def create_file_manifest(request, cohort=None):
768774
id__in=versions.get_data_sources().filter(source_type=source_type).values_list("id", flat=True)
769775
).distinct()
770776

771-
print("File type: {}".format(file_type))
772-
773777
if file_type in ['s5cmd', 'idc_index']:
774778
api_loc = "https://s3.amazonaws.com" if loc == 'aws' else "https://storage.googleapis.com"
775779
cmd = "# idc download <manifest file name>{}".format(os.linesep)
@@ -786,7 +790,8 @@ def create_file_manifest(request, cohort=None):
786790
if async_download and (file_type not in ["bq"]):
787791
jobId, file_name = submit_manifest_job(
788792
ImagingDataCommonsVersion.objects.filter(active=True), filters, storage_bucket, file_type, instructions,
789-
selected_columns_sorted if file_type not in ["s5cmd", "idc_index"] else None, cart_partition=partitions,
793+
selected_columns_sorted if file_type not in ["s5cmd", "idc_index"] else None, from_cart=from_cart,
794+
cart_partition=partitions, filtergrp_list=filtergrp_list,
790795
filename=file_name
791796
)
792797
return JsonResponse({
@@ -852,17 +857,6 @@ def create_file_manifest(request, cohort=None):
852857
hdr = [hdr]
853858
rows += (hdr,)
854859

855-
if items['total'] > MAX_FILE_LIST_ENTRIES:
856-
hdr = "{}NOTE: Due to the limits of our system, we can only return {} manifest entries.".format(
857-
cmt_delim, str(MAX_FILE_LIST_ENTRIES)
858-
) + " Your cohort's total entries exceeded this number. This part of {} entries has been ".format(
859-
str(MAX_FILE_LIST_ENTRIES)
860-
) + " downloaded, sorted by PatientID, StudyID, SeriesID, and SOPInstanceUID.{}".format(linesep)
861-
862-
if file_type not in ['s5cmd', 'idc_index']:
863-
hdr = [hdr]
864-
rows += (hdr,)
865-
866860
hdr = "{}IDC Data Version(s): {}{}".format(
867861
cmt_delim,
868862
"; ".join([str(x) for x in versions]),
@@ -1114,7 +1108,10 @@ def parse_partition_string(partition):
11141108
id = partition['id']
11151109
part_str = ''
11161110
for i in range(0,len(id)):
1117-
part_str = part_str + '(+'+filts[i]+':("'+id[i]+'"))'
1111+
if (i==0):
1112+
part_str = part_str + '(+'+filts[i]+':("'+id[i]+'"))'
1113+
else:
1114+
part_str = part_str + ' AND (+'+filts[i]+':("'+id[i]+'"))'
11181115
cur_not = partition['not']
11191116
if (len(cur_not)>0):
11201117
cur_not = ['"' + x + '"' for x in cur_not]
@@ -1162,7 +1159,7 @@ def create_cart_query_string(query_list, partitions, join):
11621159
cur_part_str = parse_partition_string(cur_part)
11631160
for j in range(len(cur_part_attr_strA)):
11641161
if (len(cur_part_attr_strA[j])>0):
1165-
solrA.append('(' + cur_part_str + ')(' + cur_part_attr_strA[j] + ')')
1162+
solrA.append('(' + cur_part_str + ') AND (' + cur_part_attr_strA[j] + ')')
11661163
else:
11671164
solrA.append(cur_part_str)
11681165
solrA = ['(' + x + ')' for x in solrA]
@@ -1341,11 +1338,12 @@ def generate_solr_cart_and_filter_strings(current_filters,filtergrp_list, partit
13411338
current_solr_query = build_solr_query(
13421339
copy.deepcopy(current_filters),
13431340
with_tags_for_ex=False,
1344-
search_child_records_by=None
1341+
search_child_records_by=None, solr_default_op='AND'
13451342
)
13461343
try:
13471344
current_filt_query_set = create_query_set(current_solr_query, aux_sources, image_source, all_ui_attrs,
13481345
image_source, DataSetType)
1346+
current_filt_query_set = ['(' + filt + ')' if not filt[0] == '(' else filt for filt in current_filt_query_set]
13491347
current_filt_str = "".join(current_filt_query_set)
13501348
except:
13511349
current_filt_str = ""
@@ -1360,7 +1358,7 @@ def generate_solr_cart_and_filter_strings(current_filters,filtergrp_list, partit
13601358
solr_query = build_solr_query(
13611359
copy.deepcopy(filtergrp),
13621360
with_tags_for_ex=False,
1363-
search_child_records_by=None
1361+
search_child_records_by=None, solr_default_op='AND'
13641362
)
13651363
query_set_for_filt = create_query_set(solr_query, aux_sources, image_source, all_ui_attrs, image_source, DataSetType)
13661364
query_set_for_filt=['(' + filt +')' if not filt[0] == '(' else filt for filt in query_set_for_filt]
@@ -1462,6 +1460,7 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
14621460
del(current_filters[tblitem])
14631461
[current_filt_str, cart_query_str_all, cart_query_str_studylvl, cart_query_str_serieslvl] = generate_solr_cart_and_filter_strings(current_filters,filtergrp_list,partitions)
14641462
no_tble_item_filt_str = current_filt_str
1463+
14651464
if len(tblfiltstr)>0:
14661465
current_filt_str = tblfiltstr+current_filt_str
14671466
if len(current_filt_str) > 0:
@@ -1473,6 +1472,7 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
14731472
if (tabletype == "collections"):
14741473
sorted_ids = current_filters["collection_id"]
14751474

1475+
14761476
elif ("facetfields" in table_data) and (sortarg in table_data["facetfields"]):
14771477
# when sorting by a 'facet' field (# of cases, # of studies etc.), we need to find the set of ids selected from
14781478
# this field by the limit, offset params in a preliminary solr call, then add that set as a filter to limit the
@@ -1507,8 +1507,14 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
15071507
sortStr = sortarg + " " + sortdir
15081508
imgNm= image_source_series.name if (tabletype=="series") else image_source.name
15091509

1510+
if (len(current_filt_str)>0):
1511+
fqs=[current_filt_str]
1512+
else:
1513+
fqs=None
1514+
1515+
15101516
rng_query = query_solr(
1511-
collection=imgNm, fields=[id], query_string=current_filt_str, fqs=None,
1517+
collection=imgNm, fields=[id], query_string=None, fqs=fqs,
15121518
facets=None, sort=sortStr, counts_only=False, collapse_on=collapse_id, offset=offset, limit=limit,
15131519
uniques=None, with_cursor=None, stats=None, totals=None, op='AND'
15141520
)
@@ -1588,15 +1594,18 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
15881594

15891595
# Table attributes need a filter query; cart queries come in via the stats queries.
15901596
fqset = [rngfilt]
1597+
#fqset = rngfilt
15911598
if len(current_filt_str) > 0:
1592-
fqset.append("{!tag=f1}(" + current_filt_str + ")")
1599+
fqset.append("{!tag=f1}(+" + current_filt_str + ")")
1600+
#fqset = fqset + " AND {!tag=f1}(" + current_filt_str + ")"
1601+
#fqset.append('{!tag=f1}(+Modality:("RTSTRUCT"))(+collection_id:("4d_lung"))')
15931602

15941603
attr_results = []
15951604
# If the table is collections, we don't need attributes, only cart stats. If the table is series, use the series store.
15961605

15971606
if not (tabletype =="series") and not (tabletype =="collections"):
15981607
solr_result = query_solr(
1599-
collection=image_source.name, fields=field_list, query_string=None, fqs=fqset,
1608+
collection=image_source.name, fields=field_list, query_string=None, fqs=fqset[:],
16001609
facets=None,sort=sortStr, counts_only=False,collapse_on=collapse_id, offset=0, limit=limit,
16011610
uniques=None, with_cursor=None, stats=None, totals=None, op='AND'
16021611
)
@@ -1636,11 +1645,14 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
16361645

16371646
custom_facets = table_data["facets"]
16381647
fqset = ["{!tag=f0}"+rngfilt]
1648+
#fqset = "{!tag=f0}" + rngfilt
16391649
colrngfilt=""
16401650
caserngfilt = ""
16411651
seriesrngfilt = ""
1652+
#fqset=""
16421653
if len(current_filt_str) > 0:
1643-
fqset.append("{!tag=f1}(" + current_filt_str + ")")
1654+
#fqset=fqset + "AND {!tag=f1}(" + current_filt_str + ")"
1655+
fqset.append("{!tag=f1}(+" + current_filt_str + ")")
16441656
custom_facets["per_id_nf"] = copy.deepcopy(table_data["facets_not_filt"]["per_id_nf"])
16451657
with_filter = True
16461658

@@ -1695,8 +1707,8 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
16951707
custom_facets["upstream_study_filter_cart"] = copy.deepcopy(upstream_cart_facets["upstream_study_filter_cart"])
16961708
custom_facets["upstream_study_filter_cart"]["domain"]["filter"] = studyrngQ+no_tble_item_filt_str
16971709

1698-
in_cart_domain_all = {"filter": cart_query_str_all, "excludeTags":"f1"} if with_filter else {"filter": cart_query_str_all}
1699-
in_filter_and_cart_domain_all = {"filter": cart_query_str_all}
1710+
in_cart_domain_all = {"filter": '(+'+cart_query_str_all+')', "excludeTags":"f1"} if with_filter else {"filter": '(+'+cart_query_str_all+')'}
1711+
in_filter_and_cart_domain_all = {"filter": '(+'+cart_query_str_all+')'}
17001712

17011713

17021714

@@ -1710,9 +1722,9 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
17101722

17111723
if not (cart_query_str_studylvl==None) and (len(cart_query_str_studylvl)>0):
17121724

1713-
in_cart_domain_studylvl = {"filter": cart_query_str_studylvl, "excludeTags": "f1"} if with_filter else {
1725+
in_cart_domain_studylvl = {"filter": '(+'+cart_query_str_studylvl+')', "excludeTags": "f1"} if with_filter else {
17141726
"filter": cart_query_str_studylvl}
1715-
in_filter_and_cart_domain_studylvl = {"filter": cart_query_str_studylvl}
1727+
in_filter_and_cart_domain_studylvl = {"filter": '(+'+cart_query_str_studylvl+')'}
17161728

17171729
custom_facets["series_in_filter_and_cart"] = copy.deepcopy(cart_facets["series_in_filter_and_cart"])
17181730
custom_facets["series_in_filter_and_cart"]["field"] = id
@@ -1819,7 +1831,7 @@ def get_table_data_with_cart_data(tabletype, sortarg, sortdir, current_filters,f
18191831
return [num_found, table_arr]
18201832

18211833

1822-
def get_cart_data_studylvl(filtergrp_list, partitions, limit, offset, length, mxseries,results_lvl='StudyInstanceUID', with_records=True):
1834+
def get_cart_data_studylvl(filtergrp_list, partitions, limit, offset, length, mxseries,results_lvl='StudyInstanceUID', with_records=True, debug=False):
18231835
aggregate_level = "StudyInstanceUID"
18241836
versions=ImagingDataCommonsVersion.objects.filter(
18251837
active=True
@@ -1857,7 +1869,7 @@ def get_cart_data_studylvl(filtergrp_list, partitions, limit, offset, length, mx
18571869
solr_query = build_solr_query(
18581870
copy.deepcopy(filtergrp),
18591871
with_tags_for_ex=False,
1860-
search_child_records_by=None
1872+
search_child_records_by=None, solr_default_op='AND'
18611873
)
18621874
query_set_for_filt = create_query_set(solr_query, aux_sources, image_source, all_ui_attrs, image_source, DataSetType)
18631875
query_set_for_filt=['(' + filt +')' if not filt[0] == '(' else filt for filt in query_set_for_filt]
@@ -1880,11 +1892,12 @@ def get_cart_data_studylvl(filtergrp_list, partitions, limit, offset, length, mx
18801892

18811893
serieslvl_found = False
18821894
studyidsinseries = {}
1895+
query_str_series_lvl = ''
18831896
if (len(partitions_series_lvl) > 0):
18841897
query_str_series_lvl = create_cart_query_string([''], partitions_series_lvl, False)
18851898
if (len(query_str_series_lvl) > 0):
18861899
solr_result_series_lvl = query_solr(
1887-
collection=image_source_series.name, fields=field_list, query_string=query_str_series_lvl, fqs=None,
1900+
collection=image_source_series.name, fields=field_list, query_string=None , fqs=[query_str_series_lvl],
18881901
limit=int(mxseries), facets=custom_facets, sort=sortStr, counts_only=False, collapse_on=None,
18891902
uniques=None, with_cursor=None, stats=None, totals=totals, op='AND'
18901903
)
@@ -1909,7 +1922,7 @@ def get_cart_data_studylvl(filtergrp_list, partitions, limit, offset, length, mx
19091922
query_str = create_cart_query_string(query_list, partitions_study_lvl, False)
19101923
if len(query_str) > 0:
19111924
solr_result = query_solr(
1912-
collection=image_source.name, fields=field_list, query_string=query_str, fqs=None, facets=custom_facets,
1925+
collection=image_source.name, fields=field_list, query_string=None, fqs=[query_str], facets=custom_facets,
19131926
sort=sortStr, counts_only=False, collapse_on=None, uniques=None, with_cursor=None, stats=None,
19141927
totals=['SeriesInstanceUID'], op='AND', limit=int(limit), offset=int(offset)
19151928
)
@@ -1976,7 +1989,9 @@ def get_cart_data_studylvl(filtergrp_list, partitions, limit, offset, length, mx
19761989
if ('crdcval' in row):
19771990
row['crdc_series_uuid'] = row['crdcval']
19781991

1979-
1992+
if debug:
1993+
solr_result['response']['query_string'] = query_str
1994+
solr_result['response']['query_string_series_lvl'] = query_str_series_lvl
19801995
return solr_result['response']
19811996

19821997

@@ -2025,7 +2040,7 @@ def get_cart_data(filtergrp_list, partitions, field_list, limit, offset):
20252040

20262041
solr_result = query_solr(collection=image_source.name, fields=field_list, query_string=query_str, fqs=None,
20272042
facets=None,sort=None, counts_only=False,collapse_on='SeriesInstanceUID', offset=offset, limit=limit, uniques=None,
2028-
with_cursor=None, stats=None, totals=None, op='AND')
2043+
with_cursor=None, stats=None, totals=None, op='OR')
20292044

20302045
return solr_result['response']
20312046

@@ -2044,31 +2059,31 @@ def filtergrp_to_sql(filtergrp_lst):
20442059
reformatted_fields=reformatted_fields
20452060
)
20462061
# The final cart SQL may involve several filters, so we need to avoid name collisions between their parameter sets.
2047-
for param_list in filtersql['params']:
2048-
for param in param_list:
2049-
param_name=param['name']
2050-
if param_name in used_params:
2051-
param_try=param_name
2052-
safe_name_found = False
2053-
mtch = re.search(r'_\d+$', param_name)
2054-
if mtch == None:
2062+
for param in filtersql['params']:
2063+
#for param in param_list:
2064+
param_name=param['name']
2065+
if param_name in used_params:
2066+
param_try=param_name
2067+
safe_name_found = False
2068+
mtch = re.search(r'_\d+$', param_name)
2069+
if mtch == None:
2070+
break
2071+
numtry = int(param_name[mtch.regs[0][0]+1:])
2072+
while not safe_name_found:
2073+
param_try = param_name[:mtch.regs[0][0]+1] + str(numtry)
2074+
if not param_try in used_params:
2075+
param['name']= param_try
2076+
used_params[param_try]=1
2077+
safe_name_found = True
20552078
break
2056-
numtry = int(param_name[mtch.regs[0][0]+1:])
2057-
while not safe_name_found:
2058-
param_try = param_name[:mtch.regs[0][0]+1] + str(numtry)
2059-
if not param_try in used_params:
2060-
param['name']= param_try
2061-
used_params[param_try]=1
2062-
safe_name_found = True
2063-
break
2064-
numtry = numtry + 1
2065-
if ('intersect_clause' in filtersql):
2066-
filtersql['intersect_clause'] = filtersql['intersect_clause'].replace(param_name, param_try)
2067-
if ('query_filters' in filtersql):
2068-
for filtindex in range(len(filtersql['query_filters'])):
2069-
filtersql['query_filters'][filtindex] = filtersql['query_filters'][filtindex].replace(param_name, param_try)
2070-
else:
2071-
used_params[param_name]=1
2079+
numtry = numtry + 1
2080+
if ('intersect_clause' in filtersql):
2081+
filtersql['intersect_clause'] = filtersql['intersect_clause'].replace(param_name, param_try)
2082+
if ('query_filters' in filtersql):
2083+
for filtindex in range(len(filtersql['query_filters'])):
2084+
filtersql['query_filters'][filtindex] = filtersql['query_filters'][filtindex].replace(param_name, param_try)
2085+
else:
2086+
used_params[param_name]=1
20722087
filtersA.append(filtersql)
20732088
return filtersA
20742089

@@ -2924,12 +2939,13 @@ def get_bq_metadata(filters, fields, data_version, sources_and_attrs=None, group
29242939
fields.extend(['"{}" AS {}'.format(static_fields[x],x) for x in static_fields])
29252940
if reformatted_fields:
29262941
fields = reformatted_fields
2942+
29272943
for_union.append(query_base.format(
29282944
field_clause= ",".join(fields),
29292945
table_clause="`{}` {}".format(table_info[image_table]['name'], table_info[image_table]['alias']),
29302946
join_clause=""" """.join(joins),
2931-
where_clause=(" AND ({})".format((" AND ".join(query_filters) if len(query_filters) else "") if len(filters) else "")) if len(filters) else "",
2932-
intersect_clause="{}".format("" if not len(intersect_statements) else "{}{}".format(
2947+
where_clause=(" AND ({})".format(" AND ".join(query_filters))) if len(query_filters) else "",
2948+
intersect_clause="AND {}".format("" if not len(intersect_statements) else "{}{}".format(
29332949
" AND " if len(non_related_filters) and len(query_filters) else "", "{} IN ({})".format(
29342950
child_record_search_field, intersect_clause
29352951
))),

0 commit comments

Comments
 (0)