Skip to content

Commit 7919637

Browse files
author
Gerit Wagner
committed
testing/revisions
1 parent 25606c6 commit 7919637

File tree

12 files changed

+203
-36
lines changed

12 files changed

+203
-36
lines changed

search_query/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,10 +431,10 @@ class QueryErrorCode(Enum):
431431
"",
432432
)
433433
INVALID_PROXIMITY_USE = (
434-
[PLATFORM.PUBMED],
434+
[PLATFORM.PUBMED, PLATFORM.EBSCO],
435435
"E0005",
436436
"invalid-proximity-use",
437-
"Invalid use of the proximity operator :~",
437+
"Invalid use of the proximity operator",
438438
"",
439439
)
440440
INVALID_WILDCARD_USE = (

search_query/ebsco/constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
"KW": {Fields.AUTHOR_KEYWORDS, Fields.KEYWORDS},
2525
"ZW": {Fields.AUTHOR_KEYWORDS, Fields.KEYWORDS},
2626
"DE": {Fields.DESCRIPTORS, Fields.AUTHOR_KEYWORDS},
27+
"MH": {Fields.MESH_TERM},
28+
"ZY": {Fields.COUNTRY_REGION},
29+
"ZU": {Fields.SUBJECT_TERMS},
2730
}
2831

2932
_RAW_PREPROCESSING_MAP = {
@@ -39,6 +42,9 @@
3942
"LA": r"LA",
4043
"KW": r"KW",
4144
"DE": r"DE",
45+
"MH": r"MH",
46+
"ZY": r"ZY",
47+
"ZU": r"ZU",
4248
}
4349
# Note: lower-case fields return different results
4450
PREPROCESSING_MAP = {k: re.compile(v) for k, v in _RAW_PREPROCESSING_MAP.items()}

search_query/ebsco/linter.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def validate_tokens(
7979
self.check_unbalanced_parentheses()
8080
self.add_artificial_parentheses_for_operator_precedence()
8181
self.check_operator_capitalization()
82+
self.check_invalid_near_within_operators()
8283

8384
self.check_search_field_general()
8485
return self.tokens
@@ -111,6 +112,37 @@ def check_invalid_syntax(self) -> None:
111112
f"'{match.group(0)}' is invalid.",
112113
)
113114

115+
def check_invalid_near_within_operators(self) -> None:
    """Flag spelled-out NEAR/WITHIN proximity tokens and shorten them in place.

    Every token of type PROXIMITY_OPERATOR whose value starts with ``NEAR``
    or ``WITHIN`` is reported as INVALID_PROXIMITY_USE at the token's
    position. After reporting, the ``NEAR/`` / ``WITHIN/`` part of the token
    value is replaced by ``N`` / ``W``.
    """
    # (prefix to detect, text to replace, replacement, tail of the message)
    rewrites = (
        ("NEAR", "NEAR/", "N", "is not supported by EBSCO. Must be N/x instead."),
        (
            "WITHIN",
            "WITHIN/",
            "W",
            "is not supported by EBSCO. Must be W/x instead.",
        ),
    )

    proximity_tokens = (
        tok for tok in self.tokens if tok.type == TokenTypes.PROXIMITY_OPERATOR
    )
    for tok in proximity_tokens:
        # Both prefixes are tested in order (NEAR first), each against the
        # token value as it stands at that point.
        for prefix, long_form, short_form, hint in rewrites:
            if not tok.value.startswith(prefix):
                continue
            self.add_linter_message(
                QueryErrorCode.INVALID_PROXIMITY_USE,
                positions=[tok.position],
                details=f"Operator {tok.value} " + hint,
            )
            tok.value = tok.value.replace(long_form, short_form)
145+
114146
def check_search_field_general(self) -> None:
115147
"""Check field 'Search Fields' in content."""
116148

@@ -206,6 +238,37 @@ def check_invalid_token_sequences(self) -> None:
206238
details=f"Cannot end with {self.tokens[-1].type.value}",
207239
)
208240

241+
def check_invalid_near_within_operators_query(self, query: Query) -> None:
    """Recursively report NEAR/WITHIN operator nodes in a query tree.

    For a node with a truthy ``operator`` attribute whose value starts with
    ``NEAR`` or ``WITHIN``, an INVALID_PROXIMITY_USE message is added. The
    reported position is the node's position, or ``(-1, -1)`` when the node
    has none. Node values are left untouched. Afterwards every child of the
    node is checked the same way.
    """
    # NOTE(review): nesting was reconstructed from a flattened diff view —
    # confirm that both prefix checks belong under the operator guard and
    # that the recursion over children runs for every node.
    if query.operator:
        hints = (
            ("NEAR", "is not supported by EBSCO. Must be N/x instead."),
            ("WITHIN", "is not supported by EBSCO. Must be W/x instead."),
        )
        for prefix, hint in hints:
            if query.value.startswith(prefix):
                self.add_linter_message(
                    QueryErrorCode.INVALID_PROXIMITY_USE,
                    positions=[query.position or (-1, -1)],
                    details=f"Operator {query.value} " + hint,
                )

    for sub_query in query.children:
        self.check_invalid_near_within_operators_query(sub_query)
271+
209272
def validate_query_tree(self, query: Query) -> None:
210273
"""
211274
Validate the query tree.

search_query/ebsco/parser.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ class EBSCOParser(QueryStringParser):
2121

2222
PARENTHESIS_REGEX = re.compile(r"[\(\)]")
2323
LOGIC_OPERATOR_REGEX = re.compile(r"\b(AND|OR|NOT)\b", flags=re.IGNORECASE)
24-
PROXIMITY_OPERATOR_REGEX = re.compile(r"(N|W)\d+")
24+
PROXIMITY_OPERATOR_REGEX = re.compile(
25+
r"(N|W)\d+|(NEAR|WITHIN)/\d+", flags=re.IGNORECASE
26+
)
2527
SEARCH_FIELD_REGEX = re.compile(r"\b([A-Z]{2})\b")
2628
SEARCH_TERM_REGEX = re.compile(r"\"[^\"]*\"|\b(?!S\d+\b)[^()\s]+[\*\+\?]?")
2729

@@ -276,7 +278,7 @@ def parse(self) -> Query:
276278
self.linter.validate_query_tree(query)
277279
self.linter.check_status()
278280

279-
query.set_platform_unchecked(PLATFORM.EBSCO.value)
281+
query.set_platform_unchecked(PLATFORM.EBSCO.value, silent=True)
280282

281283
return query
282284

search_query/linter_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ def add_artificial_parentheses_for_operator_precedence(
586586
TokenTypes.LOGIC_OPERATOR,
587587
TokenTypes.PROXIMITY_OPERATOR,
588588
]:
589-
value = self.get_precedence(self.tokens[index].value)
589+
value = self.get_precedence(self.tokens[index].value.upper())
590590

591591
if current_value in (value, -1):
592592
# Same precedence → just add to output
@@ -852,7 +852,7 @@ def _check_redundant_terms(self, query: Query) -> None:
852852
if field_a != field_b:
853853
continue
854854

855-
if field_a == "[mh]": # pragma: no cover
855+
if field_a in ["[mh]", "ZY", "DE"]: # pragma: no cover
856856
# exact matches required for mh
857857
continue
858858

search_query/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def parse(
4040
mode: str = LinterMode.STRICT,
4141
) -> Query:
4242
"""Parse a query string."""
43-
platform = platform.lower()
43+
platform = get_platform(platform)
4444

4545
if "1." in query_str[:10]:
4646
if platform not in LIST_PARSERS: # pragma: no cover

search_query/parser_base.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,27 +45,28 @@ def print_tokens(self) -> None:
4545
print(f"{token.value:<30} {token.type:<40} {str(token.position):<10}")
4646

4747
def combine_subsequent_terms(self) -> None:
48-
"""Combine subsequent terms in the list of tokens."""
49-
# Combine subsequent terms (without quotes)
50-
# This would be more challenging in the regex
48+
"""Combine all consecutive SEARCH_TERM tokens into one."""
5149
combined_tokens = []
5250
i = 0
5351
while i < len(self.tokens):
54-
if (
55-
i + 1 < len(self.tokens)
56-
and self.tokens[i].type == TokenTypes.SEARCH_TERM
57-
and self.tokens[i + 1].type == TokenTypes.SEARCH_TERM
58-
):
52+
if self.tokens[i].type == TokenTypes.SEARCH_TERM:
53+
start = self.tokens[i].position[0]
54+
value_parts = [self.tokens[i].value]
55+
end = self.tokens[i].position[1]
56+
i += 1
57+
while (
58+
i < len(self.tokens)
59+
and self.tokens[i].type == TokenTypes.SEARCH_TERM
60+
):
61+
value_parts.append(self.tokens[i].value)
62+
end = self.tokens[i].position[1]
63+
i += 1
5964
combined_token = Token(
60-
value=self.tokens[i].value + " " + self.tokens[i + 1].value,
65+
value=" ".join(value_parts),
6166
type=TokenTypes.SEARCH_TERM,
62-
position=(
63-
self.tokens[i].position[0],
64-
self.tokens[i + 1].position[1],
65-
),
67+
position=(start, end),
6668
)
6769
combined_tokens.append(combined_token)
68-
i += 2
6970
else:
7071
combined_tokens.append(self.tokens[i])
7172
i += 1

search_query/pubmed/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def parse(self) -> Query:
222222
self.linter.validate_query_tree(query)
223223
self.linter.check_status()
224224

225-
query.set_platform_unchecked(PLATFORM.PUBMED.value)
225+
query.set_platform_unchecked(PLATFORM.PUBMED.value, silent=True)
226226

227227
return query
228228

search_query/query.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ def __init__(
7272
self.marked = False
7373
# Note: platform is only set for root nodes
7474
self._platform = platform
75+
# helper flag to silence linter after parse() to avoid repeated linter printout
76+
self._silence_linter = False
7577

7678
self._parent: typing.Optional[Query] = None
7779
if children:
@@ -96,28 +98,32 @@ def _validate_platform_constraints(self) -> None:
9698

9799
wos_linter = WOSQueryStringLinter()
98100
wos_linter.validate_query_tree(self)
99-
wos_linter.check_status()
101+
if not self._silence_linter:
102+
wos_linter.check_status()
100103

101104
elif self.platform == PLATFORM.PUBMED.value:
102105
from search_query.pubmed.linter import PubmedQueryStringLinter
103106

104107
pubmed_linter = PubmedQueryStringLinter()
105108
pubmed_linter.validate_query_tree(self)
106-
pubmed_linter.check_status()
109+
if not self._silence_linter:
110+
pubmed_linter.check_status()
107111

108112
elif self.platform == "generic":
109113
from search_query.generic.linter import GenericLinter
110114

111115
gen_linter = GenericLinter()
112116
gen_linter.validate_query_tree(self)
113-
gen_linter.check_status()
117+
if not self._silence_linter:
118+
gen_linter.check_status()
114119

115120
elif self.platform == PLATFORM.EBSCO.value:
116121
from search_query.ebsco.linter import EBSCOQueryStringLinter
117122

118123
ebsco_linter = EBSCOQueryStringLinter()
119124
ebsco_linter.validate_query_tree(self)
120-
ebsco_linter.check_status()
125+
if not self._silence_linter:
126+
ebsco_linter.check_status()
121127

122128
else: # pragma: no cover
123129
raise NotImplementedError(
@@ -144,10 +150,13 @@ def platform(self, platform: str) -> None:
144150
self._set_platform_recursively(platform)
145151
self._validate_platform_constraints()
146152

147-
def set_platform_unchecked(self, platform: str) -> None:
153+
def set_platform_unchecked(self, platform: str, silent: bool = False) -> None:
148154
"""Set the platform for this query node without validation.
149155
This is an optional utility for parsers.
150156
"""
157+
158+
if silent:
159+
self._silence_linter = True
151160
self._set_platform_recursively(platform)
152161

153162
def __deepcopy__(self, memo: dict) -> Query:

search_query/wos/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class WOSParser(QueryStringParser):
2828
"""Parser for Web-of-Science queries."""
2929

3030
SEARCH_TERM_REGEX = re.compile(
31-
r'\*?[\w\-/\.\!\*]+(?:[\*\$\?][\w\-/\.\!\*]*)*|"[^"]+"'
31+
r'\*?[\w\-/\.\!\*,&]+(?:[\*\$\?][\w\-/\.\!\*,&]*)*|"[^"]+"'
3232
)
3333
LOGIC_OPERATOR_REGEX = re.compile(r"\b(AND|OR|NOT)\b", flags=re.IGNORECASE)
3434
PROXIMITY_OPERATOR_REGEX = re.compile(
@@ -336,7 +336,7 @@ def parse(self) -> Query:
336336
)
337337
query.search_field = search_field_general
338338

339-
query.set_platform_unchecked(PLATFORM.WOS.value)
339+
query.set_platform_unchecked(PLATFORM.WOS.value, silent=True)
340340

341341
return query
342342

0 commit comments

Comments
 (0)