Skip to content

Commit c72b348

Browse files
PubMed: Serializer, NEAR Query Translation, Artificial Parentheses (#42)
* fix artificial_parentheses * add artificial_parentheses for Pubmed * exclude NOT operator from unsupported suffix * fix serialization of NOT operator for Pubmed * fix Pubmed serializer * update details in Pubmed test * import annotations * revise PubMed serializer * NEAR query serialization * Auto-update documentation indices * parse NEAR queries * adjust NEARQuery, update serializer * fix f-string issue * adjust Pubmed linter tests * NEAR query translation * fix proximity validation * collapse NEAR queries --------- Co-authored-by: github-actions <actions@github.com>
1 parent cfc9d81 commit c72b348

File tree

8 files changed

+414
-95
lines changed

8 files changed

+414
-95
lines changed

search_query/linter_base.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ def handle_suffix_in_query_str(self, query_str: str) -> str:
436436
if quote_count % 2 != 0:
437437
return query_str # unbalanced quotes, do not attempt trimming
438438

439-
suffix_match = re.search(r"\)(?!\s*(AND|OR))[^()\[\]]*$", query_str)
439+
suffix_match = re.search(r"\)(?!\s*(AND|OR|NOT))[^()\[\]]*$", query_str)
440440

441441
original_query_str = query_str # preserve for position calculation
442442

@@ -704,6 +704,8 @@ def add_artificial_parentheses_for_operator_precedence(
704704
previous_value = -1
705705
# Added artificial parentheses
706706
art_par = 0
707+
# Start index
708+
start_index = index
707709

708710
self._print_unequal_precedence_warning(index)
709711

@@ -721,16 +723,28 @@ def add_artificial_parentheses_for_operator_precedence(
721723
if self.tokens[index].type == TokenTypes.PARENTHESIS_CLOSED:
722724
output.append(self.tokens[index])
723725
index += 1
724-
# Add closed parenthesis in case there are still open ones
725-
while art_par > 0:
726-
output.append(
727-
Token(
728-
value=")",
729-
type=TokenTypes.PARENTHESIS_CLOSED,
730-
position=(-1, -1),
726+
# Add parentheses in case there are missing ones
727+
if art_par > 0:
728+
while art_par > 0:
729+
output.append(
730+
Token(
731+
value=")",
732+
type=TokenTypes.PARENTHESIS_CLOSED,
733+
position=(-1, -1),
734+
)
731735
)
732-
)
733-
art_par -= 1
736+
art_par -= 1
737+
if art_par < 0:
738+
while art_par < 0:
739+
output.insert(
740+
start_index,
741+
Token(
742+
value="(",
743+
type=TokenTypes.PARENTHESIS_OPEN,
744+
position=(-1, -1),
745+
),
746+
)
747+
art_par += 1
734748
return index, output
735749

736750
if self.tokens[index].type in [

search_query/pubmed/linter.py

Lines changed: 149 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
#!/usr/bin/env python3
22
"""Pubmed query linter."""
3+
from __future__ import annotations
4+
35
import re
46
import typing
57

8+
from search_query.constants import Colors
69
from search_query.constants import ListTokenTypes
710
from search_query.constants import OperatorNodeTokenTypes
811
from search_query.constants import PLATFORM
@@ -213,6 +216,147 @@ def check_invalid_token_sequences(self) -> None:
213216
details=f"Cannot end with {self.tokens[-1].type.value}",
214217
)
215218

219+
def _print_unequal_precedence_warning(self, index: int) -> None:
220+
unequal_precedence_operators = self._get_unequal_precedence_operators(
221+
self.tokens[index:]
222+
)
223+
if not unequal_precedence_operators:
224+
return
225+
226+
precedence_list = [o.value for o in unequal_precedence_operators]
227+
precedence_lines = []
228+
for idx, op in enumerate(precedence_list):
229+
if idx == 0:
230+
precedence_lines.append(
231+
f"Operator {Colors.GREEN}{op}{Colors.END} at position {idx + 1} is evaluated first "
232+
f"because it is the leftmost operator."
233+
)
234+
elif idx == len(precedence_list) - 1:
235+
precedence_lines.append(
236+
f"Operator {Colors.ORANGE}{op}{Colors.END} at position {idx + 1} is evaluated last "
237+
f"because it is the rightmost operator."
238+
)
239+
else:
240+
precedence_lines.append(
241+
f"Operator {Colors.ORANGE}{op}{Colors.END} at position {idx + 1} is evaluated next."
242+
)
243+
244+
precedence_info = "\n".join(precedence_lines)
245+
246+
details = (
247+
"The query uses multiple operators, but without parentheses to make the intended logic explicit. "
248+
"PubMed evaluates queries strictly from left to right without applying traditional operator precedence. "
249+
"This can lead to unexpected interpretations of the query.\n\n"
250+
"Specifically:\n"
251+
f"{precedence_info}\n\n"
252+
"To fix this, search-query adds artificial parentheses around operators "
253+
"based on their left-to-right position in the query.\n\n"
254+
)
255+
256+
self.add_linter_message(
257+
QueryErrorCode.IMPLICIT_PRECEDENCE,
258+
positions=[o.position for o in unequal_precedence_operators],
259+
details=details,
260+
)
261+
262+
def add_artificial_parentheses_for_operator_precedence(
263+
self,
264+
index: int = 0,
265+
output: typing.Optional[list] = None,
266+
) -> tuple[int, list[Token]]:
267+
"""
268+
Adds artificial parentheses with position (-1, -1)
269+
to enforce PubMed operator precedence.
270+
"""
271+
if output is None:
272+
output = []
273+
# Value of operator
274+
value = 0
275+
# Value of previous operator
276+
previous_value = -1
277+
# Added artificial parentheses
278+
art_par = 0
279+
# Start index
280+
start_index = index
281+
282+
self._print_unequal_precedence_warning(index)
283+
284+
while index < len(self.tokens):
285+
# Forward iteration through tokens
286+
287+
if self.tokens[index].type == TokenTypes.PARENTHESIS_OPEN:
288+
output.append(self.tokens[index])
289+
index += 1
290+
index, output = self.add_artificial_parentheses_for_operator_precedence(
291+
index, output
292+
)
293+
continue
294+
295+
if self.tokens[index].type == TokenTypes.PARENTHESIS_CLOSED:
296+
output.append(self.tokens[index])
297+
index += 1
298+
# Add opening parentheses in case there are missing ones
299+
if art_par < 0:
300+
while art_par < 0:
301+
output.insert(
302+
start_index,
303+
Token(
304+
value="(",
305+
type=TokenTypes.PARENTHESIS_OPEN,
306+
position=(-1, -1),
307+
),
308+
)
309+
art_par += 1
310+
return index, output
311+
312+
if self.tokens[index].type in [
313+
TokenTypes.LOGIC_OPERATOR,
314+
TokenTypes.PROXIMITY_OPERATOR,
315+
]:
316+
value = self.get_precedence(self.tokens[index].value.upper())
317+
318+
if previous_value in (value, -1):
319+
# Same precedence → just add to output
320+
output.append(self.tokens[index])
321+
previous_value = value
322+
323+
elif value != previous_value:
324+
# Different precedence → close parenthesis
325+
output.append(
326+
Token(
327+
value=")",
328+
type=TokenTypes.PARENTHESIS_CLOSED,
329+
position=(-1, -1),
330+
)
331+
)
332+
previous_value -= 1
333+
art_par -= 1
334+
output.append(self.tokens[index])
335+
previous_value = value
336+
337+
index += 1
338+
continue
339+
340+
# Default: search terms, fields, etc.
341+
output.append(self.tokens[index])
342+
index += 1
343+
344+
# Add opening parentheses in case there are missing ones
345+
if art_par < 0:
346+
while art_par < 0:
347+
output.insert(
348+
0,
349+
Token(
350+
value="(", type=TokenTypes.PARENTHESIS_OPEN, position=(-1, -1)
351+
),
352+
)
353+
art_par += 1
354+
355+
if index == len(self.tokens):
356+
self.flatten_redundant_artificial_nesting(output)
357+
358+
return index, output
359+
216360
def check_invalid_wildcard(self, query: Query) -> None:
217361
"""Check search term for invalid wildcard *"""
218362

@@ -262,15 +406,12 @@ def check_invalid_proximity_operator(self) -> None:
262406
continue
263407

264408
nr_of_terms = len(search_phrase_token.value.strip('"').split())
265-
if nr_of_terms >= 2 and not (
409+
if nr_of_terms < 2 or not (
266410
search_phrase_token.value[0] == '"'
267411
and search_phrase_token.value[-1] == '"'
268412
):
269413
details = (
270-
"When using proximity operators, "
271-
+ "search terms consisting of 2 or more words "
272-
+ f"(i.e., {search_phrase_token.value}) "
273-
+ "must be enclosed in double quotes"
414+
"Proximity search requires 2 or more search terms enclosed in double quotes."
274415
)
275416
self.add_linter_message(
276417
QueryErrorCode.INVALID_PROXIMITY_USE,
@@ -292,8 +433,6 @@ def check_invalid_proximity_operator(self) -> None:
292433
positions=[field_token.position],
293434
details=details,
294435
)
295-
# Update search field token
296-
self.tokens[index].value = field_value
297436

298437
def validate_query_tree(self, query: Query) -> None:
299438
"""Validate the query tree"""
@@ -420,10 +559,10 @@ class PubmedQueryListLinter(QueryListLinter):
420559

421560
def __init__(
422561
self,
423-
parser: "PubmedListParser",
424-
string_parser_class: typing.Type["QueryStringParser"],
562+
parser: PubmedListParser,
563+
string_parser_class: typing.Type[QueryStringParser],
425564
):
426-
self.parser: "PubmedListParser" = parser
565+
self.parser: PubmedListParser = parser
427566
self.string_parser_class = string_parser_class
428567
super().__init__(parser, string_parser_class)
429568

search_query/pubmed/parser.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from search_query.query import Query
1919
from search_query.query import SearchField
2020
from search_query.query_term import Term
21+
from search_query.query_near import NEARQuery
2122

2223

2324
class PubmedParser(QueryStringParser):
@@ -205,6 +206,26 @@ def _parse_search_term(self, tokens: list) -> Query:
205206

206207
# Determine the search field of the search term.
207208
if len(tokens) > 1 and tokens[1].type == TokenTypes.FIELD:
209+
if ":~" in tokens[1].value:
210+
# Parse NEAR query
211+
field_value, prox_value = self.PROXIMITY_REGEX.match(tokens[1].value).groups()
212+
field_value = "[" + field_value + "]"
213+
return NEARQuery(
214+
value=Operators.NEAR,
215+
search_field=None,
216+
children=[
217+
Term(
218+
value=search_term_token.value,
219+
search_field=SearchField(value=field_value, position=tokens[1].position),
220+
position=tokens[0].position,
221+
platform="deactivated"
222+
)
223+
],
224+
position=(tokens[0].position[0], tokens[1].position[1]),
225+
distance=prox_value,
226+
platform="deactivated"
227+
)
228+
208229
search_field = SearchField(
209230
value=tokens[1].value, position=tokens[1].position
210231
)

search_query/pubmed/serializer.py

Lines changed: 30 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12,49 +12,39 @@
1212

1313
def to_string_pubmed(query: Query) -> str:
1414
"""Serialize the Query tree into a PubMed search string."""
15-
16-
# to do combine querys for PLATFORM_COMBINED_FIELDS_MAP
17-
1815
if not query.children:
19-
# query has no children, so it is a leaf node
20-
if query.search_field:
21-
return f"{query.value}{query.search_field.value}"
22-
return query.value
23-
16+
# Serialize term query
17+
return (
18+
f"{query.value}" f"{query.search_field.value if query.search_field else ''}"
19+
)
20+
if query.value == Operators.NEAR:
21+
# Serialize near query
22+
distance = query.distance if hasattr(query, 'distance') else 0
23+
return (
24+
f"{query.children[0].value}"
25+
f"{query.children[0].search_field.value[:-1]}"
26+
f":~{distance}]"
27+
)
28+
if query.value == Operators.RANGE:
29+
# Serialize range query
30+
return (
31+
f"{query.children[0].value}:{query.children[1].value}"
32+
f"{query.children[0].search_field.value}"
33+
)
34+
# Serialize compound query
2435
result = ""
25-
for child in query.children:
26-
if not child.operator:
27-
# query is not an operator
28-
if (child == query.children[0]) & (child != query.children[-1]):
29-
# current element is first but not only child element
30-
# -->operator does not need to be appended again
31-
result = (
32-
f"{result}({child.value}"
33-
f"{child.search_field.value if child.search_field else ''}"
34-
)
35-
36-
else:
37-
# current element is not first child
38-
result = (
39-
f"{result} {query.value} {child.value}"
40-
f"{child.search_field.value if child.search_field else ''}"
41-
)
42-
43-
if child == query.children[-1]:
44-
# current Element is last Element -> closing parenthesis
45-
result = f"{result})"
46-
36+
for i, child in enumerate(query.children):
37+
if i > 0:
38+
# Add operator between query children
39+
result += f" {query.value} "
40+
if isinstance(child, str):
41+
result += child
4742
else:
48-
# query is operator query
49-
if child.value == Operators.NOT:
50-
# current element is NOT Operator -> no parenthesis in PubMed
51-
result = f"{result}{to_string_pubmed(child)}"
43+
# Recursively serialize query children
44+
result += to_string_pubmed(child)
5245

53-
elif (child == query.children[0]) & (child != query.children[-1]):
54-
result = f"{result}({to_string_pubmed(child)}"
55-
else:
56-
result = f"{result} {query.value} {to_string_pubmed(child)}"
46+
if query.get_parent():
47+
# Add parentheses around nested queries
48+
result = "(" + result + ")"
5749

58-
if (child == query.children[-1]) & (child.value != Operators.NOT):
59-
result = f"{result})"
6050
return result

0 commit comments

Comments (0)