Skip to content

Commit 315423f

Browse files
author
Gerit Wagner
committed
check complex queries
1 parent a4c6e73 commit 315423f

File tree

6 files changed

+135
-0
lines changed

6 files changed

+135
-0
lines changed

search_query/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Tuple
77

88
# noqa: E501
9+
# ruff: noqa: E501
910

1011
# pylint: disable=too-few-public-methods
1112
# pylint: disable=line-too-long

search_query/linter_base.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
# pylint: disable=too-many-public-methods
2929
# pylint: disable=too-many-lines
30+
# ruff: noqa: E501
3031

3132

3233
# Could indeed be a general Validator class
@@ -1060,6 +1061,60 @@ def _check_redundant_terms(
10601061
)
10611062
redundant_terms.append(term_b)
10621063

1064+
def _check_for_opportunities_to_combine_subqueries(
    self, term_field_query: Query
) -> None:
    """Flag OR-connected AND-subqueries that share an identical operand.

    For a node of the form ``(X AND A) OR (X AND B)`` a
    ``QUERY_STRUCTURE_COMPLEX`` linter message is added that suggests the
    shorter ``X AND (A OR B)``. The check runs on ``term_field_query``
    itself and then recursively on each of its children.

    Operands are compared by their serialized form (``to_string()``), not
    by ``value`` alone. Two operator nodes both carry the value ``OR`` even
    when their children differ, and two terms can share a value while
    having different search fields; comparing ``value`` therefore reported
    parts as "identical" that were not and proposed a rewrite that is not
    equivalent to the original query.

    Only operands in the same position are matched (first with first,
    second with second).

    Args:
        term_field_query: Query node to inspect. The PubMed linter passes
            the result of ``get_query_with_fields_at_terms``.
    """

    if term_field_query.operator and term_field_query.value == Operators.OR:
        # AND-nodes with exactly two operands, each paired with the
        # serialized form of its operands so that every subtree is
        # serialized only once.
        candidates = [
            (q.children, [child.to_string() for child in q.children])
            for q in term_field_query.children
            if q.operator and q.value == Operators.AND and len(q.children) == 2
        ]

        for i, (children_a, strings_a) in enumerate(candidates):
            for children_b, strings_b in candidates[i + 1 :]:
                same_first = strings_a[0] == strings_b[0]
                same_second = strings_a[1] == strings_b[1]

                # Exactly one operand must match: with none there is
                # nothing to factor out, with both the two subqueries are
                # plain duplicates.
                if same_first == same_second:
                    continue

                shared, other = (0, 1) if same_first else (1, 0)
                shared_a, shared_b = strings_a[shared], strings_b[shared]
                differing_a, differing_b = strings_a[other], strings_b[other]

                details = (
                    f"The queries share {Colors.GREY}identical query parts{Colors.END}:"
                    f"\n({Colors.GREY}{shared_a}{Colors.END} AND "
                    f"{Colors.ORANGE}{differing_a}{Colors.END}) OR \n"
                    f"({Colors.GREY}{shared_b}{Colors.END} AND "
                    f"{Colors.ORANGE}{differing_b}{Colors.END})\n"
                    f"Combine the {Colors.ORANGE}differing parts{Colors.END} into a "
                    f"{Colors.GREEN}single OR-group{Colors.END} to reduce redundancy:\n"
                    f"({Colors.GREY}{shared_a}{Colors.END} AND "
                    f"({Colors.GREEN}{differing_a} OR "
                    f"{differing_b}{Colors.END}))"
                )

                # Point at the two operands that should be merged.
                positions = [
                    children_a[other].position,
                    children_b[other].position,
                ]

                self.add_linter_message(
                    QueryErrorCode.QUERY_STRUCTURE_COMPLEX,
                    positions=positions,  # type: ignore
                    details=details,
                )

    # Descend so that nested OR-groups are checked as well.
    if term_field_query.children:
        for child in term_field_query.children:
            self._check_for_opportunities_to_combine_subqueries(child)
1117+
10631118

10641119
class QueryListLinter:
10651120
"""Class for Query List Validation"""

search_query/pubmed/linter.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,12 @@ def validate_query_tree(self, query: Query) -> None:
316316
# "Sleep"[mh] AND "vigilant attention"[ti]
317317
# "Sleep Deprivation"[mh] AND "vigilant attention"[ti]
318318

319+
def validate_platform_query(self, query: Query) -> None:
    """Run the PubMed platform-level checks on ``query``.

    The query is first converted with ``get_query_with_fields_at_terms``
    (presumably so that every term carries its own search field; confirm
    against that helper). The converted query is then checked for
    subqueries that could be combined. Findings are recorded as linter
    messages; nothing is returned.
    """

    self._check_for_opportunities_to_combine_subqueries(
        self.get_query_with_fields_at_terms(query)
    )
324+
319325
def syntax_str_to_generic_search_field_set(self, field_value: str) -> set:
320326
"""Translate a search field"""
321327

search_query/pubmed/parser.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,9 @@ def parse(self) -> Query:
248248

249249
query.set_platform_unchecked(PLATFORM.PUBMED.value, silent=True)
250250

251+
self.linter.validate_platform_query(query) # type: ignore
252+
self.linter.check_status()
253+
251254
return query
252255

253256

@@ -347,6 +350,10 @@ def parse(self) -> Query:
347350

348351
query = list(self.query_dict.values())[-1]["query"]
349352

353+
linter = PubmedQueryStringLinter(query_str=self.query_list)
354+
linter.validate_query_tree(query)
355+
linter.check_status()
356+
350357
# linter.check_status() ?
351358
query.set_platform_unchecked(PLATFORM.PUBMED.value)
352359

search_query/pubmed/serializer.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ def to_string_pubmed(query: Query) -> str:
1515

1616
# TODO: combine queries for PLATFORM_COMBINED_FIELDS_MAP
1717

18+
if not query.children:
19+
# query has no children, so it is a leaf node
20+
if query.search_field:
21+
return f"{query.value}{query.search_field.value}"
22+
return query.value
23+
1824
result = ""
1925
for child in query.children:
2026
if not child.operator:

test/test_pubmed.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,66 @@ def test_pubmed_invalid_token_sequences(
709709
}
710710
],
711711
),
712+
(
713+
'((AI OR "Artificial Intelligence") AND Aversion) OR ((AI OR "Artificial Intelligence") AND Appreciation)',
714+
"",
715+
[
716+
{
717+
"code": "E0001",
718+
"label": "search-field-missing",
719+
"message": "Expected search field is missing",
720+
"is_fatal": False,
721+
"position": [(-1, -1)],
722+
"details": "Search field is missing (TODO: default?)",
723+
},
724+
{
725+
"code": "W0004",
726+
"label": "query-structure-unnecessarily-complex",
727+
"message": "Query structure is more complex than necessary",
728+
"is_fatal": False,
729+
"position": [(39, 47), (91, 103)],
730+
"details": 'The queries share \x1b[90midentical query parts\x1b[0m:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND \x1b[93mAversion[all]\x1b[0m) OR \n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND \x1b[93mAppreciation[all]\x1b[0m)\nCombine the \x1b[93mdiffering parts\x1b[0m into a \x1b[92msingle OR-group\x1b[0m to reduce redundancy:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND (\x1b[92mAversion[all] OR Appreciation[all]\x1b[0m))',
731+
},
732+
],
733+
),
734+
(
735+
'(Algorithm* Aversion) OR (Algorithm* Appreciation) OR ((AI OR "Artificial Intelligence") AND Aversion) OR ((AI OR "Artificial Intelligence") AND Appreciation) OR ("AI recommendation" OR "Artificial intelligence recommendation" OR "Machine learning recommendation" OR "ML recommendation") OR ("AI decision*" OR "Artificial intelligence decision*" OR "Algorithm* decision" OR "Machine learning decision*" OR "ML decision*") OR ("AI Advice" OR "Artificial intelligence advice" OR "Algorithm* advice" OR "Machine learning advice" OR "ML advice") OR (("AI" OR "Artificial Intelligence" OR "Algorithm*" OR "Machine learning" OR "ML") AND "Decision aid")',
736+
"",
737+
[
738+
{
739+
"code": "E0001",
740+
"label": "search-field-missing",
741+
"message": "Expected search field is missing",
742+
"is_fatal": False,
743+
"position": [(-1, -1)],
744+
"details": "Search field is missing (TODO: default?)",
745+
},
746+
{
747+
"code": "W0004",
748+
"label": "query-structure-unnecessarily-complex",
749+
"message": "Query structure is more complex than necessary",
750+
"is_fatal": False,
751+
"position": [(93, 101), (145, 157)],
752+
"details": 'The queries share \x1b[90midentical query parts\x1b[0m:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND \x1b[93mAversion[all]\x1b[0m) OR \n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND \x1b[93mAppreciation[all]\x1b[0m)\nCombine the \x1b[93mdiffering parts\x1b[0m into a \x1b[92msingle OR-group\x1b[0m to reduce redundancy:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND (\x1b[92mAversion[all] OR Appreciation[all]\x1b[0m))',
753+
},
754+
{
755+
"code": "W0004",
756+
"label": "query-structure-unnecessarily-complex",
757+
"message": "Query structure is more complex than necessary",
758+
"is_fatal": False,
759+
"position": [(93, 101), (632, 646)],
760+
"details": 'The queries share \x1b[90midentical query parts\x1b[0m:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND \x1b[93mAversion[all]\x1b[0m) OR \n(\x1b[90m("AI"[all] OR "Artificial Intelligence"[all] OR "Algorithm*"[all] OR "Machine learning"[all] OR "ML"[all])\x1b[0m AND \x1b[93m"Decision aid"[all]\x1b[0m)\nCombine the \x1b[93mdiffering parts\x1b[0m into a \x1b[92msingle OR-group\x1b[0m to reduce redundancy:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND (\x1b[92mAversion[all] OR "Decision aid"[all]\x1b[0m))',
761+
},
762+
{
763+
"code": "W0004",
764+
"label": "query-structure-unnecessarily-complex",
765+
"message": "Query structure is more complex than necessary",
766+
"is_fatal": False,
767+
"position": [(145, 157), (632, 646)],
768+
"details": 'The queries share \x1b[90midentical query parts\x1b[0m:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND \x1b[93mAppreciation[all]\x1b[0m) OR \n(\x1b[90m("AI"[all] OR "Artificial Intelligence"[all] OR "Algorithm*"[all] OR "Machine learning"[all] OR "ML"[all])\x1b[0m AND \x1b[93m"Decision aid"[all]\x1b[0m)\nCombine the \x1b[93mdiffering parts\x1b[0m into a \x1b[92msingle OR-group\x1b[0m to reduce redundancy:\n(\x1b[90m(AI[all] OR "Artificial Intelligence"[all])\x1b[0m AND (\x1b[92mAppreciation[all] OR "Decision aid"[all]\x1b[0m))',
769+
},
770+
],
771+
),
712772
],
713773
)
714774
def test_linter(

0 commit comments

Comments
 (0)