Skip to content

Commit daff9fc

Browse files
author
Gerit Wagner
committed
refactoring/formatting
1 parent 3cadc98 commit daff9fc

File tree

3 files changed

+98
-71
lines changed

3 files changed

+98
-71
lines changed

search_query/linter_pubmed.py

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,34 +22,27 @@ class PubmedQueryStringValidator(QueryStringValidator):
2222
PROXIMITY_REGEX = r"^\[(.+):~(.*)\]$"
2323
parser: "PubmedParser"
2424

25-
VALID_TOKEN_SEQUENCES = {
26-
None: [
27-
TokenTypes.SEARCH_TERM,
28-
TokenTypes.PARENTHESIS_OPEN
29-
],
25+
VALID_TOKEN_SEQUENCES: typing.Dict[TokenTypes, typing.List[TokenTypes]] = {
3026
TokenTypes.PARENTHESIS_OPEN: [
3127
TokenTypes.SEARCH_TERM,
3228
TokenTypes.PARENTHESIS_OPEN,
3329
],
3430
TokenTypes.PARENTHESIS_CLOSED: [
3531
TokenTypes.LOGIC_OPERATOR,
3632
TokenTypes.PARENTHESIS_CLOSED,
37-
None
3833
],
3934
TokenTypes.SEARCH_TERM: [
4035
TokenTypes.FIELD,
4136
TokenTypes.LOGIC_OPERATOR,
4237
TokenTypes.PARENTHESIS_CLOSED,
43-
None
4438
],
4539
TokenTypes.FIELD: [
4640
TokenTypes.LOGIC_OPERATOR,
4741
TokenTypes.PARENTHESIS_CLOSED,
48-
None
4942
],
5043
TokenTypes.LOGIC_OPERATOR: [
5144
TokenTypes.SEARCH_TERM,
52-
TokenTypes.PARENTHESIS_OPEN
45+
TokenTypes.PARENTHESIS_OPEN,
5346
],
5447
}
5548

@@ -61,7 +54,7 @@ def validate_tokens(self, tokens: list) -> list:
6154
for index, token in enumerate(tokens):
6255
if token.type == TokenTypes.SEARCH_TERM:
6356
self._check_invalid_characters(token)
64-
if '*' in token.value:
57+
if "*" in token.value:
6558
self._check_invalid_wildcard(token)
6659

6760
if token.type == TokenTypes.FIELD:
@@ -105,8 +98,33 @@ def _check_unbalanced_parentheses(self, tokens: list) -> None:
10598
def _check_invalid_token_sequence(self, tokens: list) -> None:
10699
"""Check token list for invalid token sequences."""
107100
for i in range(0, len(tokens) + 1):
108-
prev_type = tokens[i - 1].type if i > 0 else None
109-
token_type = tokens[i].type if i < len(tokens) else None
101+
if i == len(tokens):
102+
if tokens[i - 1].type in [
103+
TokenTypes.PARENTHESIS_OPEN,
104+
TokenTypes.LOGIC_OPERATOR,
105+
]:
106+
self.parser.add_linter_message(
107+
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
108+
pos=tokens[i - 1].position,
109+
details=f"Cannot end with {tokens[i-1].type}",
110+
)
111+
break
112+
113+
token_type = tokens[i].type # if i < len(tokens) else None
114+
if i == 0:
115+
# First token has no predecessor: only check that it may start a query
116+
if token_type not in [
117+
TokenTypes.SEARCH_TERM,
118+
TokenTypes.PARENTHESIS_OPEN,
119+
]:
120+
self.parser.add_linter_message(
121+
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
122+
pos=tokens[i].position,
123+
details=f"Cannot start with {token_type}",
124+
)
125+
continue
126+
127+
prev_type = tokens[i - 1].type
110128

111129
if token_type not in self.VALID_TOKEN_SEQUENCES[prev_type]:
112130
if token_type == TokenTypes.FIELD:
@@ -117,17 +135,24 @@ def _check_invalid_token_sequence(self, tokens: list) -> None:
117135
details = "Invalid operator position"
118136
position = tokens[i].position
119137

120-
elif prev_type == TokenTypes.PARENTHESIS_OPEN and token_type == TokenTypes.PARENTHESIS_CLOSED:
138+
elif (
139+
prev_type == TokenTypes.PARENTHESIS_OPEN
140+
and token_type == TokenTypes.PARENTHESIS_CLOSED
141+
):
121142
details = "Empty parenthesis"
122143
position = (tokens[i - 1].position[0], tokens[i].position[1])
123144

124-
elif token_type and prev_type and prev_type != TokenTypes.LOGIC_OPERATOR:
145+
elif (
146+
token_type and prev_type and prev_type != TokenTypes.LOGIC_OPERATOR
147+
):
125148
details = "Missing operator"
126149
position = (tokens[i - 1].position[0], tokens[i].position[1])
127150

128151
else:
129152
details = ""
130-
position = tokens[i].position if token_type else tokens[i - 1].position
153+
position = (
154+
tokens[i].position if token_type else tokens[i - 1].position
155+
)
131156

132157
self.parser.add_linter_message(
133158
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
@@ -143,8 +168,7 @@ def _check_precedence(self, index: int, tokens: list) -> None:
143168
if token.type == TokenTypes.PARENTHESIS_OPEN:
144169
if i == 0:
145170
return
146-
else:
147-
i -= 1
171+
i -= 1
148172
if token.type == TokenTypes.PARENTHESIS_CLOSED:
149173
i += 1
150174
if token.type == TokenTypes.LOGIC_OPERATOR and i == 0:
@@ -153,12 +177,10 @@ def _check_precedence(self, index: int, tokens: list) -> None:
153177
if token.value.upper() not in operator_group:
154178
self.parser.add_linter_message(
155179
QueryErrorCode.IMPLICIT_PRECEDENCE,
156-
pos=tokens[index].position
180+
pos=tokens[index].position,
157181
)
158182

159-
def _check_invalid_characters(
160-
self, token: Token
161-
) -> None:
183+
def _check_invalid_characters(self, token: Token) -> None:
162184
"""Check a search term for invalid characters"""
163185
invalid_characters = "!#$%+.;<>?\\^_{}~'()[]"
164186
value = token.value
@@ -167,10 +189,9 @@ def _check_invalid_characters(
167189
for i, char in enumerate(token.value):
168190
if char in invalid_characters:
169191
self.parser.add_linter_message(
170-
QueryErrorCode.INVALID_CHARACTER,
171-
pos=token.position
192+
QueryErrorCode.INVALID_CHARACTER, pos=token.position
172193
)
173-
value = value[:i] + " " + value[i + 1:]
194+
value = value[:i] + " " + value[i + 1 :]
174195
# Update token
175196
if value != token.value:
176197
token.value = value
@@ -270,12 +291,8 @@ def _check_redundant_terms(self, query: Query) -> None:
270291
):
271292
continue
272293

273-
field_a = self.parser.map_search_field(
274-
term_a.search_field.value
275-
)
276-
field_b = self.parser.map_search_field(
277-
term_b.search_field.value
278-
)
294+
field_a = self.parser.map_search_field(term_a.search_field.value)
295+
field_b = self.parser.map_search_field(term_b.search_field.value)
279296

280297
if field_a == field_b and (
281298
term_a.value == term_b.value
@@ -346,7 +363,8 @@ def _check_unsupported_search_field(self, search_field: SearchField) -> None:
346363
search_field.position and search_field.value == "ab"
347364
):
348365
self.parser.add_linter_message(
349-
QueryErrorCode.SEARCH_FIELD_UNSUPPORTED, search_field.position
366+
QueryErrorCode.SEARCH_FIELD_UNSUPPORTED,
367+
search_field.position or (-1, -1),
350368
)
351369
search_field.value = Fields.ALL
352370
search_field.position = None

search_query/parser_pubmed.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -281,15 +281,15 @@ def translate_search_fields(self, query: Query) -> None:
281281
self.translate_search_fields(child)
282282
return
283283

284-
query.search_field.value = self.map_search_field(query.search_field.value)
284+
if query.search_field:
285+
query.search_field.value = self.map_search_field(query.search_field.value)
285286

286-
# Convert queries in the form 'Term [tiab]' into 'Term [ti] OR Term [ab]'.
287-
if query.search_field.value == "[tiab]":
288-
self._expand_combined_fields(query, [Fields.TITLE, Fields.ABSTRACT])
289-
return
287+
# Convert queries in the form 'Term [tiab]' into 'Term [ti] OR Term [ab]'.
288+
if query.search_field.value == "[tiab]":
289+
self._expand_combined_fields(query, [Fields.TITLE, Fields.ABSTRACT])
290290

291291
def parse_user_provided_fields(self, field_values: str) -> list:
292-
"""Extract and translate user-provided search fields and return them as a list"""
292+
"""Extract and translate user-provided search fields (return as a list)"""
293293
if not field_values:
294294
return []
295295

@@ -340,7 +340,7 @@ def _expand_combined_fields(self, query: Query, search_fields: list) -> None:
340340

341341
query.value = Operators.OR
342342
query.operator = True
343-
query.search_field.value = Fields.ALL
343+
query.search_field = SearchField(value=Fields.ALL)
344344
query.children = query_children
345345

346346
def get_query_leaves(self, query: Query) -> list:
@@ -378,7 +378,7 @@ def parse(self) -> Query:
378378

379379
def check_linter_status(self) -> None:
380380
"""Check the output of the linter and report errors to the user"""
381-
new_messages = self.linter_messages[self.last_read_index + 1:]
381+
new_messages = self.linter_messages[self.last_read_index + 1 :]
382382
for msg in new_messages:
383383
e = QuerySyntaxError(msg["message"], self.query_str, msg["pos"])
384384

@@ -426,6 +426,7 @@ def parse(self) -> Query:
426426
query = self.parser_class(query_string, self.search_field_general).parse()
427427

428428
except QuerySyntaxError as exc:
429+
# pylint: disable=duplicate-code
429430
# Correct positions and query string
430431
# to display the error for the original (list) query
431432
new_pos = exc.pos

test/test_parser_pubmed.py

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
"""Tests for Pubmed search query parser."""
33
from typing import Tuple
44

5-
import pytest # type: ignore
5+
import pytest # type: ignore
66

7+
from search_query.constants import QueryErrorCode
78
from search_query.constants import Token
89
from search_query.constants import TokenTypes
9-
from search_query.constants import QueryErrorCode
1010
from search_query.exception import SearchQueryException
1111
from search_query.parser_pubmed import PubmedParser
1212

@@ -21,50 +21,58 @@
2121
(
2222
'("health tracking" [tw] OR"remote monitoring"[tw])AND wearable device[tw]NOT Comment[pt]',
2323
[
24-
Token(value='(', type=TokenTypes.PARENTHESIS_OPEN, position=(0, 1)),
25-
Token(value='"health tracking"', type=TokenTypes.SEARCH_TERM, position=(1, 18)),
26-
Token(value='[tw]', type=TokenTypes.FIELD, position=(19, 23)),
27-
Token(value='OR', type=TokenTypes.LOGIC_OPERATOR, position=(24, 26)),
28-
Token(value='"remote monitoring"', type=TokenTypes.SEARCH_TERM, position=(26, 45)),
29-
Token(value='[tw]', type=TokenTypes.FIELD, position=(45, 49)),
30-
Token(value=')', type=TokenTypes.PARENTHESIS_CLOSED, position=(49, 50)),
31-
Token(value='AND', type=TokenTypes.LOGIC_OPERATOR, position=(50, 53)),
32-
Token(value='wearable device', type=TokenTypes.SEARCH_TERM, position=(54, 69)),
33-
Token(value='[tw]', type=TokenTypes.FIELD, position=(69, 73)),
34-
Token(value='NOT', type=TokenTypes.LOGIC_OPERATOR, position=(73, 76)),
35-
Token(value='Comment', type=TokenTypes.SEARCH_TERM, position=(77, 84)),
36-
Token(value='[pt]', type=TokenTypes.FIELD, position=(84, 88)),
37-
]
24+
Token(value="(", type=TokenTypes.PARENTHESIS_OPEN, position=(0, 1)),
25+
Token(
26+
value='"health tracking"',
27+
type=TokenTypes.SEARCH_TERM,
28+
position=(1, 18),
29+
),
30+
Token(value="[tw]", type=TokenTypes.FIELD, position=(19, 23)),
31+
Token(value="OR", type=TokenTypes.LOGIC_OPERATOR, position=(24, 26)),
32+
Token(
33+
value='"remote monitoring"',
34+
type=TokenTypes.SEARCH_TERM,
35+
position=(26, 45),
36+
),
37+
Token(value="[tw]", type=TokenTypes.FIELD, position=(45, 49)),
38+
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(49, 50)),
39+
Token(value="AND", type=TokenTypes.LOGIC_OPERATOR, position=(50, 53)),
40+
Token(
41+
value="wearable device",
42+
type=TokenTypes.SEARCH_TERM,
43+
position=(54, 69),
44+
),
45+
Token(value="[tw]", type=TokenTypes.FIELD, position=(69, 73)),
46+
Token(value="NOT", type=TokenTypes.LOGIC_OPERATOR, position=(73, 76)),
47+
Token(value="Comment", type=TokenTypes.SEARCH_TERM, position=(77, 84)),
48+
Token(value="[pt]", type=TokenTypes.FIELD, position=(84, 88)),
49+
],
3850
)
39-
]
51+
],
4052
)
41-
def test_tokenization_pubmed(
42-
query_str: str, expected_tokens: list
43-
) -> None:
53+
def test_tokenization_pubmed(query_str: str, expected_tokens: list) -> None:
4454
pubmed_parser = PubmedParser(query_str, "")
4555
pubmed_parser.tokenize()
4656
assert pubmed_parser.tokens == expected_tokens, print(pubmed_parser.tokens)
4757

4858

4959
@pytest.mark.parametrize(
50-
'query_str, expected_translation',
60+
"query_str, expected_translation",
5161
[
5262
(
5363
'(eHealth[Title/Abstract] OR "eHealth"[MeSH Terms]) AND Review[Publication Type]',
54-
'AND[OR[OR[all][eHealth[ti], eHealth[ab]], "eHealth"[mh]], Review[pt]]'
64+
'AND[OR[OR[all][eHealth[ti], eHealth[ab]], "eHealth"[mh]], Review[pt]]',
5565
)
56-
]
66+
],
5767
)
58-
def test_parser_pubmed(
59-
query_str: str, expected_translation: str
60-
) -> None:
68+
def test_parser_pubmed(query_str: str, expected_translation: str) -> None:
6169
pubmed_parser = PubmedParser(query_str, "")
6270
query_tree = pubmed_parser.parse()
6371
assert expected_translation == query_tree.to_string(), print(query_tree.to_string())
6472

6573

6674
@pytest.mark.parametrize(
67-
'query_str, error, pos',
75+
"query_str, error, pos",
6876
[
6977
(
7078
'("health tracking" OR "remote monitoring") AND (("mobile application" OR "wearable device")',
@@ -107,7 +115,7 @@ def test_parser_pubmed(
107115
(41, 43),
108116
),
109117
(
110-
'digital health[tiab:~5]',
118+
"digital health[tiab:~5]",
111119
QueryErrorCode.INVALID_PROXIMITY_USE,
112120
(14, 23),
113121
),
@@ -141,12 +149,12 @@ def test_parser_pubmed(
141149
QueryErrorCode.SEARCH_FIELD_UNSUPPORTED,
142150
(9, 13),
143151
),
144-
]
152+
],
145153
)
146154
def test_linter_pubmed(
147-
query_str: str,
148-
error: QueryErrorCode,
149-
pos: Tuple,
155+
query_str: str,
156+
error: QueryErrorCode,
157+
pos: Tuple,
150158
) -> None:
151159
pubmed_parser = PubmedParser(query_str, "")
152160
try:

0 commit comments

Comments
 (0)