Skip to content

Commit a69d6d4

Browse files
committed
-> Make sure to support v2 and v1 BQ API query parameter types since v2 isn't easily serialized for transport
1 parent 46405f4 commit a69d6d4

File tree

3 files changed

+318
-21
lines changed

3 files changed

+318
-21
lines changed

google_helpers/bigquery/bq_support.py

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from google.cloud import bigquery
2929
from google.cloud.bigquery import QueryJob, QueryJobConfig
3030
from googleapiclient.errors import HttpError
31-
from .utils import build_bq_filter_and_params as build_bq_flt_prm, build_bq_where_clause as build_bq_clause
31+
from .utils import build_bq_filter_and_params as build_bq_flt_prm, build_bq_where_clause as build_bq_clause, build_bq_filter_and_params_v1
3232

3333
logger = logging.getLogger(__name__)
3434

@@ -465,28 +465,23 @@ def insert_job_batch_and_get_results(cls, query_set):
465465

466466
return query_set
467467

468+
# v2 API pass-through for filter and parameter builder
468469
@staticmethod
469470
def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with_count_toggle=False,
470-
field_prefix=None, type_schema=None, case_insens=True):
471+
field_prefix=None, type_schema=None, case_insens=True, continuous_numerics=None):
471472

472473
return build_bq_flt_prm(filters, comb_with, param_suffix, with_count_toggle, field_prefix, type_schema,
473-
case_insens)
474-
475-
# Builds a BQ WHERE clause from a set of filters of the form:
476-
# {
477-
# 'field_name': [<value>,...]
478-
# }
479-
# Breaks out '<ATTR> IS NULL'
480-
# 2+ values are converted to IN (<value>,...)
481-
# Filters must already be pre-bucketed or formatted
482-
# Use of LIKE is detected based on single-length value array and use of % in the value string
483-
# Support special 'mutation' filter category
484-
# Support for Greater/Less than (or equal to) via [gl]t[e]{0,1} in attr name,
485-
# eg. {"age_at_diagnosis_gte": [50,]}
486-
# Support for BETWEEN via _btw in attr name, eg. ("wbc_at_diagnosis_btw": [800,1200]}
487-
# Support for providing an explicit schema of the fields being searched
488-
#
489-
# TODO: add support for DATETIME eg 6/10/2010
474+
case_insens, continuous_numerics)
475+
476+
# v1 API pass-through for filter and parameter builder
477+
@staticmethod
def build_bq_filter_and_params_(filters, comb_with='AND', param_suffix=None, with_count_toggle=False,
                                field_prefix=None, type_schema=None, case_insens=True, continuous_numerics=None):
    """
    Class-level pass-through to the v1 (plain-dict parameter) filter builder.

    Every argument is forwarded unchanged to build_bq_filter_and_params_v1 and
    its return value is handed straight back to the caller.
    """
    return build_bq_filter_and_params_v1(
        filters,
        comb_with=comb_with,
        param_suffix=param_suffix,
        with_count_toggle=with_count_toggle,
        field_prefix=field_prefix,
        type_schema=type_schema,
        case_insens=case_insens,
        continuous_numerics=continuous_numerics,
    )
483+
484+
# pass through for where-clause builder
490485
@staticmethod
491486
def build_bq_where_clause(filters, join_with_space=False, comb_with='AND', field_prefix=None,
492487
type_schema=None, encapsulated=True, continuous_numerics=None, case_insens=True,

google_helpers/bigquery/utils.py

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,3 +542,305 @@ def build_bq_where_clause(filters, join_with_space=False, comb_with='AND', field
542542
filter_set.append('{}{}{}'.format("(" if encapsulate else "", filter_string, ")" if encapsulate else ""))
543543

544544
return " {} ".format(comb_with).join(filter_set)
545+
546+
547+
# Builds a BQ API v1 query parameter set and WHERE clause string from a set of filters of the form:
548+
# {
549+
# 'field_name': [<value>,...]
550+
# }
551+
# Breaks out '<ATTR> IS NULL'
552+
# 2+ values are converted to IN (<value>,...)
553+
# Filters must already be pre-bucketed or formatted
554+
# Use of LIKE is detected based on single-length value array and use of % in the value string
555+
# Support special 'mutation' filter category
556+
# Support for Greater/Less than (or equal to) via [gl]t[e]{0,1} in attr name,
557+
# eg. {"age_at_diagnosis_gte": [50,]}
558+
# Support for BETWEEN via _btw in attr name, eg. {"wbc_at_diagnosis_btw": [800,1200]}
559+
# Support for providing an explicit schema of the fields being searched
560+
# Support for specifying a set of continuous numeric attributes to be presumed for BETWEEN clauses
561+
#
562+
# TODO: add support for DATETIME eg 6/10/2010
563+
def build_bq_filter_and_params_v1(filters, comb_with='AND', param_suffix=None, with_count_toggle=False,
                                  field_prefix=None, type_schema=None, case_insens=True, continuous_numerics=None):
    """
    Build a WHERE-clause string and a list of plain-dict (BQ API v1 style) query parameters.

    Args:
        filters: dict of the form {'attr_name': [<value>, ...]}. A value may also be a dict
            {'op': <'AND'|'OR'>, 'values': [...]} to state how clauses sharing the same base
            attribute name are combined. Keys containing 'MUT:' are treated as mutation filters.
            Attribute suffixes _gt/_gte/_lt/_lte/_eq select the comparison operator and
            _btw/_ebtw/_btwe/_ebtwe select an (exclusive/inclusive) range comparison.
            The string 'None' in a value list produces an IS NULL clause.
        comb_with: keyword placed between the per-attribute clause groups.
        param_suffix: optional suffix appended (after '_') to every generated parameter name.
        with_count_toggle: when true, each clause is wrapped as
            "(<clause>) OR @<param>_filtering = 'not_filtering'" and the matching
            '<param>_filtering' parameter is recorded under 'count_params' and 'attr_params'.
        field_prefix: optional table alias; a trailing '.' is added when missing.
        type_schema: optional {attr: type} map; any type other than 'STRING' is treated as NUMERIC.
        case_insens: accepted for signature parity with the other builders; this function does not
            consult it (string comparisons are always wrapped in LOWER()).
        continuous_numerics: optional list of attribute names to be treated as numeric ranges even
            when they lack a _btw-style suffix.

    Returns:
        dict with 'filter_string' (str), 'parameters' (list of parameter dicts), 'attr_params'
        (dict) and, when with_count_toggle is set, 'count_params' (dict).

    The caller's `filters` structure is not modified.
    """
    if field_prefix and field_prefix[-1] != ".":
        field_prefix += "."
    prefix = field_prefix or ''
    suffix = '_{}'.format(param_suffix) if param_suffix is not None else ''
    continuous_numerics = continuous_numerics or []

    # Anything other than digits, '.' and ',' marks a value as a STRING during type inference
    non_numeric = re.compile(r'[^0-9\.,]', re.UNICODE)
    # Range suffix -> (lower-bound operator, upper-bound operator); anything else falls back to BETWEEN
    range_ops = (('_btw', ('>', '<')), ('_ebtw', ('>=', '<')), ('_btwe', ('>', '<=')))

    result = {
        'filter_string': '',
        'parameters': [],
        'attr_params': {}
    }
    if with_count_toggle:
        result['count_params'] = {}

    filter_set = []
    attr_filters = {}

    # Split mutation filters into their own set, because of repeat use of the same attrs
    mutation_filters = {}
    other_filters = {}
    for attr in filters:
        if 'MUT:' in attr:
            mutation_filters[attr] = filters[attr]
        else:
            other_filters[attr] = filters[attr]

    # 'Mutation' filters, special category for MUT: type filters
    for mut_filtr_count, (attr, values) in enumerate(mutation_filters.items(), start=1):
        if type(values) is not list:
            values = [values]
        attr_parts = attr.split(':')
        gene = attr_parts[2]
        filter_type = attr_parts[-1].lower()
        invert = attr_parts[3] == 'NOT'
        param_name = 'gene{}{}'.format(mut_filtr_count, suffix)
        filter_string = '{}Hugo_Symbol = @{} AND '.format(prefix, param_name)

        result['parameters'].append({
            'name': param_name,
            'parameterType': {'type': 'STRING'},
            'parameterValue': {'value': gene}
        })

        if filter_type == 'category' and values[0].lower() == 'any':
            # 'any' needs no variant parameter, only a non-null check
            filter_string += '{}Variant_Classification IS NOT NULL'.format(prefix)
        else:
            if filter_type == 'category':
                values = MOLECULAR_CATEGORIES[values[0]]['attrs']
            var_param_name = "var_class{}{}".format(mut_filtr_count, suffix)
            filter_string += '{}Variant_Classification {}IN UNNEST(@{})'.format(
                prefix, 'NOT ' if invert else '', var_param_name)
            result['parameters'].append({
                'name': var_param_name,
                'parameterType': {'type': 'ARRAY', 'arrayType': {'type': 'STRING'}},
                'parameterValue': {'arrayValues': [{'value': x} for x in values]}
            })

        filter_set.append('({})'.format(filter_string))

    # Standard query filters
    for attr, values in other_filters.items():
        is_btw = re.search('_e?btwe?', attr.lower()) is not None
        attr_name = attr[:attr.rfind('_')] if re.search('_[gl]te?|_e?btwe?|_eq', attr) else attr

        # A fully qualified attribute carries its own combining operator; break it out for *every*
        # attribute, not just the first one seen for a given base name.
        group_op = 'OR'
        if isinstance(values, dict) and 'values' in values:
            group_op = values['op']
            values = values['values']
        if attr_name not in attr_filters:
            attr_filters[attr_name] = {
                'OP': group_op,
                'filters': []
            }
        attr_filter_set = attr_filters[attr_name]['filters']

        # We require our attributes to be value lists
        if type(values) is not list:
            values = [values]
        # However, *only* ranged numerics can be a list of lists; all others must be a single list
        elif type(values[0]) is list and not is_btw and attr not in continuous_numerics:
            values = [y for x in values for y in x]

        if type_schema and type_schema.get(attr, None):
            parameter_type = 'NUMERIC' if type_schema[attr] != 'STRING' else 'STRING'
        elif FIXED_TYPES.get(attr, None):
            parameter_type = FIXED_TYPES.get(attr)
        else:
            # If the values are arrays we assume the first value in the first array is indicative of all
            # other values (since we don't support multi-typed fields)
            type_check = values[0] if type(values[0]) is not list else values[0][0]
            parameter_type = (
                'STRING' if (
                    type(type_check) not in [int, float, complex] and non_numeric.search(type_check)
                ) else 'NUMERIC'
            )

        filter_string = ''
        param_name = attr + suffix
        query_param = {
            'name': param_name,
            'parameterType': {'type': parameter_type},
            'parameterValue': {}
        }
        if 'None' in values:
            # Work on a copy so the caller's list is left untouched
            values = [x for x in values if x != 'None']
            filter_string = "{}{} IS NULL".format(prefix, attr)

        if len(values) > 0:
            if len(filter_string):
                filter_string += " OR "
            if len(values) == 1 and not is_btw:
                # Single scalar param
                query_param['parameterValue']['value'] = values[0]
                if parameter_type == 'STRING':
                    filter_string += "LOWER({}{}) = LOWER(@{})".format(prefix, attr, param_name)
                elif parameter_type == 'NUMERIC':
                    comparator = "{}{}".format(
                        ">" if re.search(r'_gte?', attr) else "<" if re.search(r'_lte?', attr) else "",
                        '=' if re.search(r'_[lg]te', attr) or not re.search(r'_[lg]', attr) or attr.endswith(
                            '_eq') else ''
                    )
                    filter_string += "{}{} {} @{}".format(prefix, attr_name, comparator, param_name)
            # Occasionally attributes may come in without the appropriate _e?btwe? suffix; we account for that here
            # by checking for the proper attr_name in the optional continuous_numerics list
            elif is_btw or attr_name in continuous_numerics:
                # Check for a single array of two and if we find it, convert it to an array containing
                # a 2-member array
                if len(values) == 2 and type(values[0]) is not list:
                    values = [values]
                # Otherwise confirm that every entry is itself a pair (scalars are not pairs)
                elif not all(isinstance(x, (list, tuple)) and len(x) == 2 for x in values):
                    logger.error(
                        "[ERROR] While parsing attribute %s, calculated to be a numeric range filter, "
                        "found an unparseable value:", attr)
                    logger.error("[ERROR] %s", values)
                    # NOTE(review): if this was the only filter for attr_name, its (empty) group is still
                    # emitted in the final string, as before - confirm whether it should be dropped.
                    continue

                bounds = next((ops for sfx, ops in range_ops if attr.endswith(sfx)), None)
                query_params = []
                btw_filter_strings = []
                btw_counter = 1
                for btws in values:
                    param_name_1 = '{}_btw_{}'.format(param_name, btw_counter)
                    param_name_2 = '{}_btw_{}'.format(param_name, btw_counter + 1)
                    btw_counter += 2
                    if bounds:
                        btw_filter_strings.append("{0}{1} {2} @{3} AND {0}{1} {4} @{5}".format(
                            prefix, attr_name, bounds[0], param_name_1, bounds[1], param_name_2
                        ))
                    else:
                        # _ebtwe (or an unsuffixed continuous numeric): inclusive on both ends
                        btw_filter_strings.append("{}{} BETWEEN @{} AND @{}".format(
                            prefix, attr_name, param_name_1, param_name_2
                        ))

                    # query_param becomes our template for each pair
                    query_param_1 = copy.deepcopy(query_param)
                    query_param_2 = copy.deepcopy(query_param)
                    query_param_1['name'] = param_name_1
                    query_param_1['parameterValue']['value'] = btws[0]
                    query_param_2['name'] = param_name_2
                    query_param_2['parameterValue']['value'] = btws[1]
                    query_params.extend([query_param_1, query_param_2])

                filter_string += " OR ".join(btw_filter_strings)
                query_param = query_params
            else:
                if group_op == 'AND' and len(values) > 1:
                    # If an operator is to be AND'd with more than one value we must make an intersection statement
                    # on the higher-level entity (i.e. select for studies which have series containing both values)
                    # That cannot be performed here, as this is only a clause builder
                    # NOTE(review): no clause is produced for this attribute, yet the (value-less) parameter
                    # and clause slot are still emitted below, as before - confirm the intended handling.
                    logger.warning("[WARNING] Multiple-value AND clauses require an intersection statement!")
                else:
                    # Simple array param
                    query_param['parameterType']['type'] = "ARRAY"
                    query_param['parameterType']['arrayType'] = {
                        'type': parameter_type
                    }
                    query_param['parameterValue'] = {
                        'arrayValues': [{'value': x.lower() if parameter_type == 'STRING' else x} for x in values]}

                    clause_base = "%s IN UNNEST(@{})" % ("LOWER({}{})" if parameter_type == "STRING" else "{}{}")
                    filter_string += clause_base.format(prefix, attr, param_name)

        if with_count_toggle:
            filter_string = "({}) OR @{}_filtering = 'not_filtering'".format(filter_string, param_name)
            result['count_params'][param_name] = {
                'name': param_name + '_filtering',
                'parameterType': {
                    'type': 'STRING'
                },
                'parameterValue': {
                    'value': 'filtering'
                }
            }
            if attr not in result['attr_params']:
                result['attr_params'][attr] = []
            result['attr_params'][attr].append(param_name)
            result['parameters'].append(result['count_params'][param_name])

        attr_filter_set.append('{}'.format(filter_string))

        if type(query_param) is list:
            result['parameters'].extend(query_param)
        else:
            result['parameters'].append(query_param)

    # Extend (rather than replace) so the mutation clauses built above stay paired with their parameters
    filter_set.extend(
        "(({}))".format(") {} (".format(group['OP']).join(group['filters']))
        for group in attr_filters.values()
    )
    result['filter_string'] = " {} ".format(comb_with).join(filter_set)

    return result

idc_collections/collex_metadata_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from solr_helpers import query_solr_and_format_result, query_solr, build_solr_stats, build_solr_facets, build_solr_query
3131
from google_helpers.bigquery.bq_support import BigQuerySupport
3232
from google_helpers.bigquery.export_support import BigQueryExportFileList
33-
from google_helpers.bigquery.utils import build_bq_filter_and_params as build_bq_filter_and_params_
33+
from google_helpers.bigquery.utils import build_bq_filter_and_params as build_bq_filter_and_params_v2, build_bq_filter_and_params_v1
3434
import hashlib
3535
from django.conf import settings
3636
from django.shortcuts import render, redirect
@@ -2972,7 +2972,7 @@ def get_bq_metadata(filters, fields, data_version, sources_and_attrs=None, group
29722972

29732973
ranged_numerics = Attribute.get_ranged_attrs()
29742974

2975-
build_bq_flt_and_params = build_bq_filter_and_params_ if with_v2_api else BigQuerySupport.build_bq_filter_and_params
2975+
build_bq_flt_and_params = build_bq_filter_and_params_v2 if with_v2_api else build_bq_filter_and_params_v1
29762976

29772977
filter_attr_by_bq = {}
29782978
field_attr_by_bq = {}

0 commit comments

Comments
 (0)