Skip to content

Commit 9044169

Browse files
author
Gerit Wagner
committed
test/refactor
1 parent 22ea72c commit 9044169

File tree

11 files changed

+230
-29
lines changed

11 files changed

+230
-29
lines changed

docs/source/platforms/ebsco.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,4 @@ Resources
4444

4545
- `EBSCO Search Help <https://connect.ebsco.com/s/article/Searching-EBSCO-Databases?language=en_US>`_
4646
- `EBSCO Search Fields Guide <https://connect.ebsco.com/s/article/Field-Codes-Searchable-EBSCOhost?language=en_US>`_
47+
- `EBSCO wildcard restrictions <https://connect.ebsco.com/s/article/Searching-with-Wildcards-in-EDS-and-EBSCOhost?language=en_US>`_

search_query/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ class QueryErrorCode(Enum):
425425
[PLATFORM.WOS, PLATFORM.PUBMED],
426426
"F3003",
427427
"invalid-list-reference",
428-
"Invalid list reference in list query (not found)",
428+
"Invalid list reference in list query",
429429
"",
430430
)
431431

search_query/ebsco/linter.py

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,15 @@ def check_invalid_near_within_operators(self) -> None:
122122

123123
for token in self.tokens:
124124
if token.type == TokenTypes.PROXIMITY_OPERATOR:
125+
digit = "x"
126+
m = re.search(r"/(\d+)", token.value)
127+
if m:
128+
digit = m.group(1)
129+
125130
if token.value.startswith("NEAR"):
126131
details = (
127132
f"Operator {token.value} "
128-
"is not supported by EBSCO. Must be Nx instead."
133+
f"is not supported by EBSCO. Must be N{digit} instead."
129134
)
130135
self.add_linter_message(
131136
QueryErrorCode.INVALID_PROXIMITY_USE,
@@ -136,7 +141,7 @@ def check_invalid_near_within_operators(self) -> None:
136141
if token.value.startswith("WITHIN"):
137142
details = (
138143
f"Operator {token.value} "
139-
"is not supported by EBSCO. Must be Wx instead."
144+
f"is not supported by EBSCO. Must be W{digit} instead."
140145
)
141146
self.add_linter_message(
142147
QueryErrorCode.INVALID_PROXIMITY_USE,
@@ -187,6 +192,8 @@ def check_invalid_token_sequences(self) -> None:
187192

188193
elif token_type == TokenTypes.LOGIC_OPERATOR:
189194
details = "Invalid operator position"
195+
if prev_type == TokenTypes.LOGIC_OPERATOR:
196+
details = "Cannot have two consecutive operators"
190197
positions = [token.position]
191198

192199
elif (
@@ -271,15 +278,72 @@ def check_invalid_near_within_operators_query(self, query: Query) -> None:
271278
for child in query.children:
272279
self.check_invalid_near_within_operators_query(child)
273280

281+
def check_unsupported_wildcards(self, query: Query) -> None:
282+
"""Check for unsupported characters in the search string."""
283+
284+
if query.is_term():
285+
val = query.value
286+
# Check for leading wildcard
287+
match = re.search(r"^(\*|\?|\#)", val)
288+
if match:
289+
position = (-1, -1)
290+
if query.position:
291+
position = (
292+
query.position[0] + match.start(),
293+
query.position[0] + match.end(),
294+
)
295+
self.add_linter_message(
296+
QueryErrorCode.WILDCARD_UNSUPPORTED,
297+
positions=[position],
298+
details="Wildcard not allowed at the beginning of a term.",
299+
)
300+
301+
# Count each wildcard
302+
char_count = sum(c not in "*?#" for c in val[:4])
303+
if re.search(r"^[^\*\?\#](\?|\#)", val) and char_count < 2:
304+
# ? or # appearing within the first four characters with fewer than two literals
305+
position = (-1, -1)
306+
if query.position:
307+
position = (query.position[0], query.position[0] + len(val))
308+
details = (
309+
"Invalid wildcard use: only one leading literal character found. "
310+
"When a wildcard appears within the first four characters, "
311+
"at least two literal (non-wildcard) characters "
312+
"must be present in that span."
313+
)
314+
self.add_linter_message(
315+
QueryErrorCode.WILDCARD_UNSUPPORTED,
316+
positions=[position],
317+
details=details,
318+
)
319+
320+
if re.search(r"^[^\*\?\#](\*)", val):
321+
position = (-1, -1)
322+
if query.position:
323+
position = (query.position[0], query.position[0] + len(val))
324+
details = (
325+
"Do not use * in the second position followed by "
326+
"additional letters. Use ? or # instead (e.g., f?tal)."
327+
)
328+
self.add_linter_message(
329+
QueryErrorCode.WILDCARD_UNSUPPORTED,
330+
positions=[position],
331+
details=details,
332+
)
333+
334+
for child in query.children:
335+
self.check_unsupported_wildcards(child)
336+
274337
def validate_query_tree(self, query: Query) -> None:
275338
"""
276339
Validate the query tree.
277340
This method is called after the query tree has been built.
278341
"""
279342

280343
self.check_unbalanced_quotes_in_terms(query)
281-
self.check_invalid_characters_in_search_term_query(query, "@%$^~\\<>{}[]#")
344+
self.check_invalid_characters_in_search_term_query(query, "@%$^~\\<>{}[]")
282345
self.check_unsupported_search_fields_in_query(query)
346+
self.check_unsupported_wildcards(query)
283347

284348
term_field_query = self.get_query_with_fields_at_terms(query)
285349
self._check_date_filters_in_subquery(term_field_query)

search_query/ebsco/parser.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class EBSCOParser(QueryStringParser):
2525
r"(N|W)\d+|(NEAR|WITHIN)/\d+", flags=re.IGNORECASE
2626
)
2727
SEARCH_FIELD_REGEX = re.compile(r"\b([A-Z]{2})\b")
28-
SEARCH_TERM_REGEX = re.compile(r"\"[^\"]*\"|\b(?!S\d+\b)[^()\s]+[\*\+\?]?")
28+
SEARCH_TERM_REGEX = re.compile(r"\"[^\"]*\"|\*?\b[^()\s]+")
2929

3030
OPERATOR_REGEX = re.compile(
3131
"|".join([LOGIC_OPERATOR_REGEX.pattern, PROXIMITY_OPERATOR_REGEX.pattern])
@@ -120,6 +120,25 @@ def _extract_proximity_distance(self, token: Token) -> int:
120120
token.value = operator
121121
return distance
122122

123+
def fix_ambiguous_tokens(self) -> None:
124+
"""Fix ambiguous tokens that could be misinterpreted as a search field."""
125+
126+
def is_potential_term(token_str: str) -> bool:
127+
return bool(re.fullmatch(r"[A-Z]{2,}", token_str))
128+
129+
# Field token followed by term which is misclassified as a field token
130+
for i in range(len(self.tokens) - 1):
131+
current = self.tokens[i]
132+
next_token = self.tokens[i + 1]
133+
134+
if (
135+
current.type == TokenTypes.FIELD
136+
and next_token.type == TokenTypes.FIELD
137+
and is_potential_term(next_token.value)
138+
):
139+
# Reclassify the second FIELD token as a SEARCH_TERM
140+
next_token.type = TokenTypes.SEARCH_TERM
141+
123142
def tokenize(self) -> None:
124143
"""Tokenize the query_str."""
125144

@@ -154,6 +173,7 @@ def tokenize(self) -> None:
154173

155174
# Combine subsequent search_terms in case of no quotation marks
156175
self.combine_subsequent_tokens()
176+
self.fix_ambiguous_tokens()
157177

158178
def append_node(
159179
self,

search_query/linter_base.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -536,9 +536,9 @@ def get_precedence(self, token: str) -> int:
536536

537537
def _get_unequal_precedence_operators(
538538
self, tokens: list[Token]
539-
) -> typing.List[tuple[int, int]]:
539+
) -> typing.List[Token]:
540540
"""Get positions of unequal precedence operators."""
541-
unequal_precedence_operators = []
541+
unequal_precedence_operators: typing.List[Token] = []
542542
previous_value = -1
543543
level = 0
544544
prev_token = None
@@ -554,8 +554,8 @@ def _get_unequal_precedence_operators(
554554
continue
555555
if token.type in [TokenTypes.LOGIC_OPERATOR, TokenTypes.PROXIMITY_OPERATOR]:
556556
value = self.get_precedence(token.value.upper())
557-
if value != previous_value and previous_value != -1:
558-
if not unequal_precedence_operators:
557+
if previous_value not in [value, -1]:
558+
if not unequal_precedence_operators and prev_token:
559559
unequal_precedence_operators.append(prev_token)
560560
unequal_precedence_operators.append(token)
561561
previous_value = value
@@ -578,15 +578,18 @@ def _print_unequal_precedence_warning(self, index: int) -> None:
578578
for idx, (op, prec) in enumerate(precedence_list):
579579
if idx == 0:
580580
precedence_lines.append(
581-
f"Operator {Colors.GREEN}{op}{Colors.END} is evaluated first because it has the highest precedence level ({prec})."
581+
f"Operator {Colors.GREEN}{op}{Colors.END} is evaluated first "
582+
f"because it has the highest precedence level ({prec})."
582583
)
583584
elif idx == len(precedence_list) - 1:
584585
precedence_lines.append(
585-
f"Operator {Colors.ORANGE}{op}{Colors.END} is evaluated last because it has the lowest precedence level ({prec})."
586+
f"Operator {Colors.ORANGE}{op}{Colors.END} is evaluated last "
587+
f"because it has the lowest precedence level ({prec})."
586588
)
587589
else:
588590
precedence_lines.append(
589-
f"Operator {Colors.ORANGE}{op}{Colors.END} has precedence level {prec}."
591+
f"Operator {Colors.ORANGE}{op}{Colors.END} "
592+
f"has precedence level {prec}."
590593
)
591594

592595
precedence_info = "\n".join(precedence_lines)
@@ -597,7 +600,8 @@ def _print_unequal_precedence_warning(self, index: int) -> None:
597600
"This can lead to unexpected interpretations of the query.\n\n"
598601
"Specifically:\n"
599602
f"{precedence_info}\n\n"
600-
"To fix this, search-query adds artificial parentheses around operator groups with higher precedence.\n\n"
603+
"To fix this, search-query adds artificial parentheses around "
604+
"operator groups with higher precedence.\n\n"
601605
)
602606

603607
self.add_linter_message(

search_query/pubmed/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
"[dcom]": r"\[dcom\]|\[completion date\]",
3434
"[cois]": r"\[cois\]|\[conflict of interest statement\]",
3535
"[cn]": r"\[cn\]|\[corporate author\]",
36-
"[crdt]": r"\[crdt\]|\[create date\]",
36+
"[crdt]": r"\[crdt\]|\[create date\]|\[date - create\]",
3737
"[rn]": r"\[rn\]|\[ec/rn number\]",
3838
"[ed]": r"\[ed\]|\[editor\]",
3939
"[edat]": r"\[edat\]|\[entry date\]",
@@ -62,7 +62,7 @@
6262
"[ps]": r"\[ps\]|\[personal name as subject\]",
6363
"[pa]": r"\[pa\]|\[pharmacological action\]",
6464
"[pl]": r"\[pl\]|\[place of publication\]",
65-
"[dp]": r"\[dp\]|\[publication date\]|\[pdate\]",
65+
"[dp]": r"\[dp\]|\[publication date\]|\[pdat\]",
6666
"[pt]": r"\[pt\]|\[publication type\]",
6767
"[pubn]": r"\[pubn\]|\[publisher\]",
6868
"[si]": r"\[si\]|\[secondary source id\]",

search_query/pubmed/linter.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,11 @@ def check_invalid_token_sequences(self) -> None:
179179
)
180180
]
181181

182-
elif token_type and prev_type and prev_type != TokenTypes.LOGIC_OPERATOR:
182+
elif (
183+
token_type
184+
and prev_type
185+
and prev_type not in [TokenTypes.LOGIC_OPERATOR]
186+
):
183187
details = "Missing operator"
184188
positions = [
185189
(

search_query/wos/linter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def check_invalid_token_sequences(self) -> None:
294294
if token.type == TokenTypes.FIELD and next_token.type == TokenTypes.FIELD:
295295
self.add_linter_message(
296296
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
297-
positions=[next_token.position],
297+
positions=[(token.position[0], next_token.position[1])],
298298
)
299299
continue
300300

@@ -303,7 +303,7 @@ def check_invalid_token_sequences(self) -> None:
303303
if next_token.type not in allowed_next_types:
304304
self.add_linter_message(
305305
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
306-
positions=[next_token.position],
306+
positions=[(token.position[0], next_token.position[1])],
307307
)
308308

309309
# Check the last token

test/test_ebsco.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,28 @@
104104
),
105105
],
106106
),
107+
(
108+
"TI RN OR AB RN",
109+
[
110+
Token(value="TI", type=TokenTypes.FIELD, position=(0, 2)),
111+
Token(value="RN", type=TokenTypes.SEARCH_TERM, position=(3, 5)),
112+
Token(value="OR", type=TokenTypes.LOGIC_OPERATOR, position=(6, 8)),
113+
Token(value="AB", type=TokenTypes.FIELD, position=(9, 11)),
114+
Token(value="RN", type=TokenTypes.SEARCH_TERM, position=(12, 14)),
115+
],
116+
),
117+
# (
118+
# '(DE "Persuasive Communication) OR (DE "Collaboration")',
119+
# [
120+
# ]
121+
# )
107122
],
108123
)
109124
def test_tokenization(
110125
query_string: str, expected_tokens: List[Tuple[str, str, Tuple[int, int]]]
111126
) -> None:
112127
"""Test EBSCO parser tokenization."""
128+
print(query_string)
113129
parser = EBSCOParser(query_string, search_field_general="")
114130
parser.tokenize()
115131

@@ -291,6 +307,82 @@ def test_invalid_token_sequences(
291307
'(ZY "sudan" OR ZY "south sudan") AND TI "context of vegetarians"',
292308
[],
293309
),
310+
(
311+
"bias OR OR politics",
312+
[
313+
{
314+
"code": "F1004",
315+
"label": "invalid-token-sequence",
316+
"message": "The sequence of tokens is invalid.",
317+
"is_fatal": True,
318+
"position": [(8, 10)],
319+
"details": "Cannot have two consecutive operators",
320+
}
321+
],
322+
),
323+
(
324+
"*ology",
325+
[
326+
{
327+
"code": "F2001",
328+
"label": "wildcard-unsupported",
329+
"message": "Unsupported wildcard in search string.",
330+
"is_fatal": True,
331+
"position": [(0, 1)],
332+
"details": "Wildcard not allowed at the beginning of a term.",
333+
}
334+
],
335+
),
336+
(
337+
"f??*",
338+
[
339+
{
340+
"code": "F2001",
341+
"label": "wildcard-unsupported",
342+
"message": "Unsupported wildcard in search string.",
343+
"is_fatal": True,
344+
"position": [(0, 4)],
345+
"details": "Invalid wildcard use: only one leading literal character found. When a wildcard appears within the first four characters, at least two literal (non-wildcard) characters must be present in that span.",
346+
}
347+
],
348+
),
349+
(
350+
"f*tal",
351+
[
352+
{
353+
"code": "F2001",
354+
"label": "wildcard-unsupported",
355+
"message": "Unsupported wildcard in search string.",
356+
"is_fatal": True,
357+
"position": [(0, 5)],
358+
"details": "Do not use * in the second position followed by additional letters. Use ? or # instead (e.g., f?tal).",
359+
}
360+
],
361+
),
362+
(
363+
"colo#r",
364+
[],
365+
),
366+
(
367+
"pediatric*",
368+
[],
369+
),
370+
(
371+
"tumor*",
372+
[],
373+
),
374+
(
375+
"education*",
376+
[],
377+
),
378+
(
379+
"f#tal",
380+
[],
381+
),
382+
(
383+
"f?tal",
384+
[],
385+
),
294386
],
295387
)
296388
def test_linter(query_string: str, messages: list) -> None:
@@ -365,7 +457,7 @@ def test_linter(query_string: str, messages: list) -> None:
365457
"message": "Invalid use of the proximity operator",
366458
"is_fatal": False,
367459
"position": [(8, 14)],
368-
"details": "Operator NEAR/2 is not supported by EBSCO. Must be Nx instead.",
460+
"details": "Operator NEAR/2 is not supported by EBSCO. Must be N2 instead.",
369461
}
370462
],
371463
),
@@ -380,7 +472,7 @@ def test_linter(query_string: str, messages: list) -> None:
380472
"message": "Invalid use of the proximity operator",
381473
"is_fatal": False,
382474
"position": [(8, 16)],
383-
"details": "Operator WITHIN/2 is not supported by EBSCO. Must be Wx instead.",
475+
"details": "Operator WITHIN/2 is not supported by EBSCO. Must be W2 instead.",
384476
}
385477
],
386478
),

0 commit comments

Comments (0)