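refactoring/formatting: PubMed linter and parser (explicit start/end token-sequence checks instead of None sentinels; handle a missing search_field)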

Gerit Wagner · Gerit Wagner · commit daff9fc3ce21 · 2025-04-16T17:18:35.000+02:00
diff --git a/search_query/linter_pubmed.py b/search_query/linter_pubmed.py
@@ -22,34 +22,27 @@ class PubmedQueryStringValidator(QueryStringValidator):
     PROXIMITY_REGEX = r"^\[(.+):~(.*)\]$"
     parser: "PubmedParser"
 
-    VALID_TOKEN_SEQUENCES = {
-        None: [
-            TokenTypes.SEARCH_TERM,
-            TokenTypes.PARENTHESIS_OPEN
-        ],
+    VALID_TOKEN_SEQUENCES: typing.Dict[TokenTypes, typing.List[TokenTypes]] = {
         TokenTypes.PARENTHESIS_OPEN: [
             TokenTypes.SEARCH_TERM,
             TokenTypes.PARENTHESIS_OPEN,
         ],
         TokenTypes.PARENTHESIS_CLOSED: [
             TokenTypes.LOGIC_OPERATOR,
             TokenTypes.PARENTHESIS_CLOSED,
-            None
         ],
         TokenTypes.SEARCH_TERM: [
             TokenTypes.FIELD,
             TokenTypes.LOGIC_OPERATOR,
             TokenTypes.PARENTHESIS_CLOSED,
-            None
         ],
         TokenTypes.FIELD: [
             TokenTypes.LOGIC_OPERATOR,
             TokenTypes.PARENTHESIS_CLOSED,
-            None
         ],
         TokenTypes.LOGIC_OPERATOR: [
             TokenTypes.SEARCH_TERM,
-            TokenTypes.PARENTHESIS_OPEN
+            TokenTypes.PARENTHESIS_OPEN,
         ],
     }
 
@@ -61,7 +54,7 @@ def validate_tokens(self, tokens: list) -> list:
         for index, token in enumerate(tokens):
             if token.type == TokenTypes.SEARCH_TERM:
                 self._check_invalid_characters(token)
-                if '*' in token.value:
+                if "*" in token.value:
                     self._check_invalid_wildcard(token)
 
             if token.type == TokenTypes.FIELD:
@@ -105,8 +98,33 @@ def _check_unbalanced_parentheses(self, tokens: list) -> None:
     def _check_invalid_token_sequence(self, tokens: list) -> None:
         """Check token list for invalid token sequences."""
         for i in range(0, len(tokens) + 1):
-            prev_type = tokens[i - 1].type if i > 0 else None
-            token_type = tokens[i].type if i < len(tokens) else None
+            if i == len(tokens):
+                if tokens[i - 1].type in [
+                    TokenTypes.PARENTHESIS_OPEN,
+                    TokenTypes.LOGIC_OPERATOR,
+                ]:
+                    self.parser.add_linter_message(
+                        QueryErrorCode.INVALID_TOKEN_SEQUENCE,
+                        pos=tokens[i - 1].position,
+                        details=f"Cannot end with {tokens[i-1].type}",
+                    )
+                break
+
+            token_type = tokens[i].type  # if i < len(tokens) else None
+            if i == 0:
+                # First token has no predecessor: only check that it may start a query
+                if token_type not in [
+                    TokenTypes.SEARCH_TERM,
+                    TokenTypes.PARENTHESIS_OPEN,
+                ]:
+                    self.parser.add_linter_message(
+                        QueryErrorCode.INVALID_TOKEN_SEQUENCE,
+                        pos=tokens[i].position,
+                        details=f"Cannot start with {token_type}",
+                    )
+                continue
+
+            prev_type = tokens[i - 1].type
 
             if token_type not in self.VALID_TOKEN_SEQUENCES[prev_type]:
                 if token_type == TokenTypes.FIELD:
@@ -117,17 +135,24 @@ def _check_invalid_token_sequence(self, tokens: list) -> None:
                     details = "Invalid operator position"
                     position = tokens[i].position
 
-                elif prev_type == TokenTypes.PARENTHESIS_OPEN and token_type == TokenTypes.PARENTHESIS_CLOSED:
+                elif (
+                    prev_type == TokenTypes.PARENTHESIS_OPEN
+                    and token_type == TokenTypes.PARENTHESIS_CLOSED
+                ):
                     details = "Empty parenthesis"
                     position = (tokens[i - 1].position[0], tokens[i].position[1])
 
-                elif token_type and prev_type and prev_type != TokenTypes.LOGIC_OPERATOR:
+                elif (
+                    token_type and prev_type and prev_type != TokenTypes.LOGIC_OPERATOR
+                ):
                     details = "Missing operator"
                     position = (tokens[i - 1].position[0], tokens[i].position[1])
 
                 else:
                     details = ""
-                    position = tokens[i].position if token_type else tokens[i - 1].position
+                    position = (
+                        tokens[i].position if token_type else tokens[i - 1].position
+                    )
 
                 self.parser.add_linter_message(
                     QueryErrorCode.INVALID_TOKEN_SEQUENCE,
@@ -143,8 +168,7 @@ def _check_precedence(self, index: int, tokens: list) -> None:
             if token.type == TokenTypes.PARENTHESIS_OPEN:
                 if i == 0:
                     return
-                else:
-                    i -= 1
+                i -= 1
             if token.type == TokenTypes.PARENTHESIS_CLOSED:
                 i += 1
             if token.type == TokenTypes.LOGIC_OPERATOR and i == 0:
@@ -153,12 +177,10 @@ def _check_precedence(self, index: int, tokens: list) -> None:
                         if token.value.upper() not in operator_group:
                             self.parser.add_linter_message(
                                 QueryErrorCode.IMPLICIT_PRECEDENCE,
-                                pos=tokens[index].position
+                                pos=tokens[index].position,
                             )
 
-    def _check_invalid_characters(
-        self, token: Token
-    ) -> None:
+    def _check_invalid_characters(self, token: Token) -> None:
         """Check a search term for invalid characters"""
         invalid_characters = "!#$%+.;<>?\\^_{}~'()[]"
         value = token.value
@@ -167,10 +189,9 @@ def _check_invalid_characters(
         for i, char in enumerate(token.value):
             if char in invalid_characters:
                 self.parser.add_linter_message(
-                    QueryErrorCode.INVALID_CHARACTER,
-                    pos=token.position
+                    QueryErrorCode.INVALID_CHARACTER, pos=token.position
                 )
-                value = value[:i] + " " + value[i + 1:]
+                value = value[:i] + " " + value[i + 1 :]
         # Update token
         if value != token.value:
             token.value = value
@@ -270,12 +291,8 @@ def _check_redundant_terms(self, query: Query) -> None:
                     ):
                         continue
 
-                    field_a = self.parser.map_search_field(
-                        term_a.search_field.value
-                    )
-                    field_b = self.parser.map_search_field(
-                        term_b.search_field.value
-                    )
+                    field_a = self.parser.map_search_field(term_a.search_field.value)
+                    field_b = self.parser.map_search_field(term_b.search_field.value)
 
                     if field_a == field_b and (
                         term_a.value == term_b.value
@@ -346,7 +363,8 @@ def _check_unsupported_search_field(self, search_field: SearchField) -> None:
             search_field.position and search_field.value == "ab"
         ):
             self.parser.add_linter_message(
-                QueryErrorCode.SEARCH_FIELD_UNSUPPORTED, search_field.position
+                QueryErrorCode.SEARCH_FIELD_UNSUPPORTED,
+                search_field.position or (-1, -1),
             )
             search_field.value = Fields.ALL
             search_field.position = None
diff --git a/search_query/parser_pubmed.py b/search_query/parser_pubmed.py
@@ -281,15 +281,15 @@ def translate_search_fields(self, query: Query) -> None:
                 self.translate_search_fields(child)
             return
 
-        query.search_field.value = self.map_search_field(query.search_field.value)
+        if query.search_field:
+            query.search_field.value = self.map_search_field(query.search_field.value)
 
-        # Convert queries in the form 'Term [tiab]' into 'Term [ti] OR Term [ab]'.
-        if query.search_field.value == "[tiab]":
-            self._expand_combined_fields(query, [Fields.TITLE, Fields.ABSTRACT])
-            return
+            # Convert queries in the form 'Term [tiab]' into 'Term [ti] OR Term [ab]'.
+            if query.search_field.value == "[tiab]":
+                self._expand_combined_fields(query, [Fields.TITLE, Fields.ABSTRACT])
 
     def parse_user_provided_fields(self, field_values: str) -> list:
-        """Extract and translate user-provided search fields and return them as a list"""
+        """Extract and translate user-provided search fields (return as a list)"""
         if not field_values:
             return []
 
@@ -340,7 +340,7 @@ def _expand_combined_fields(self, query: Query, search_fields: list) -> None:
 
         query.value = Operators.OR
         query.operator = True
-        query.search_field.value = Fields.ALL
+        query.search_field = SearchField(value=Fields.ALL)
         query.children = query_children
 
     def get_query_leaves(self, query: Query) -> list:
@@ -378,7 +378,7 @@ def parse(self) -> Query:
 
     def check_linter_status(self) -> None:
         """Check the output of the linter and report errors to the user"""
-        new_messages = self.linter_messages[self.last_read_index + 1:]
+        new_messages = self.linter_messages[self.last_read_index + 1 :]
         for msg in new_messages:
             e = QuerySyntaxError(msg["message"], self.query_str, msg["pos"])
 
@@ -426,6 +426,7 @@ def parse(self) -> Query:
             query = self.parser_class(query_string, self.search_field_general).parse()
 
         except QuerySyntaxError as exc:
+            # pylint: disable=duplicate-code
             # Correct positions and query string
             # to display the error for the original (list) query
             new_pos = exc.pos
diff --git a/test/test_parser_pubmed.py b/test/test_parser_pubmed.py
@@ -2,11 +2,11 @@
 """Tests for Pubmed search query parser."""
 from typing import Tuple
 
-import pytest   # type: ignore
+import pytest  # type: ignore
 
+from search_query.constants import QueryErrorCode
 from search_query.constants import Token
 from search_query.constants import TokenTypes
-from search_query.constants import QueryErrorCode
 from search_query.exception import SearchQueryException
 from search_query.parser_pubmed import PubmedParser
 
@@ -21,50 +21,58 @@
         (
             '("health tracking" [tw] OR"remote monitoring"[tw])AND wearable device[tw]NOT Comment[pt]',
             [
-                Token(value='(', type=TokenTypes.PARENTHESIS_OPEN, position=(0, 1)),
-                Token(value='"health tracking"', type=TokenTypes.SEARCH_TERM, position=(1, 18)),
-                Token(value='[tw]', type=TokenTypes.FIELD, position=(19, 23)),
-                Token(value='OR', type=TokenTypes.LOGIC_OPERATOR, position=(24, 26)),
-                Token(value='"remote monitoring"', type=TokenTypes.SEARCH_TERM, position=(26, 45)),
-                Token(value='[tw]', type=TokenTypes.FIELD, position=(45, 49)),
-                Token(value=')', type=TokenTypes.PARENTHESIS_CLOSED, position=(49, 50)),
-                Token(value='AND', type=TokenTypes.LOGIC_OPERATOR, position=(50, 53)),
-                Token(value='wearable device', type=TokenTypes.SEARCH_TERM, position=(54, 69)),
-                Token(value='[tw]', type=TokenTypes.FIELD, position=(69, 73)),
-                Token(value='NOT', type=TokenTypes.LOGIC_OPERATOR, position=(73, 76)),
-                Token(value='Comment', type=TokenTypes.SEARCH_TERM, position=(77, 84)),
-                Token(value='[pt]', type=TokenTypes.FIELD, position=(84, 88)),
-            ]
+                Token(value="(", type=TokenTypes.PARENTHESIS_OPEN, position=(0, 1)),
+                Token(
+                    value='"health tracking"',
+                    type=TokenTypes.SEARCH_TERM,
+                    position=(1, 18),
+                ),
+                Token(value="[tw]", type=TokenTypes.FIELD, position=(19, 23)),
+                Token(value="OR", type=TokenTypes.LOGIC_OPERATOR, position=(24, 26)),
+                Token(
+                    value='"remote monitoring"',
+                    type=TokenTypes.SEARCH_TERM,
+                    position=(26, 45),
+                ),
+                Token(value="[tw]", type=TokenTypes.FIELD, position=(45, 49)),
+                Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(49, 50)),
+                Token(value="AND", type=TokenTypes.LOGIC_OPERATOR, position=(50, 53)),
+                Token(
+                    value="wearable device",
+                    type=TokenTypes.SEARCH_TERM,
+                    position=(54, 69),
+                ),
+                Token(value="[tw]", type=TokenTypes.FIELD, position=(69, 73)),
+                Token(value="NOT", type=TokenTypes.LOGIC_OPERATOR, position=(73, 76)),
+                Token(value="Comment", type=TokenTypes.SEARCH_TERM, position=(77, 84)),
+                Token(value="[pt]", type=TokenTypes.FIELD, position=(84, 88)),
+            ],
         )
-    ]
+    ],
 )
-def test_tokenization_pubmed(
-        query_str: str, expected_tokens: list
-) -> None:
+def test_tokenization_pubmed(query_str: str, expected_tokens: list) -> None:
     pubmed_parser = PubmedParser(query_str, "")
     pubmed_parser.tokenize()
     assert pubmed_parser.tokens == expected_tokens, print(pubmed_parser.tokens)
 
 
 @pytest.mark.parametrize(
-    'query_str, expected_translation',
+    "query_str, expected_translation",
     [
         (
             '(eHealth[Title/Abstract] OR "eHealth"[MeSH Terms]) AND Review[Publication Type]',
-            'AND[OR[OR[all][eHealth[ti], eHealth[ab]], "eHealth"[mh]], Review[pt]]'
+            'AND[OR[OR[all][eHealth[ti], eHealth[ab]], "eHealth"[mh]], Review[pt]]',
         )
-    ]
+    ],
 )
-def test_parser_pubmed(
-        query_str: str, expected_translation: str
-) -> None:
+def test_parser_pubmed(query_str: str, expected_translation: str) -> None:
     pubmed_parser = PubmedParser(query_str, "")
     query_tree = pubmed_parser.parse()
     assert expected_translation == query_tree.to_string(), print(query_tree.to_string())
 
 
 @pytest.mark.parametrize(
-    'query_str, error, pos',
+    "query_str, error, pos",
     [
         (
             '("health tracking" OR "remote monitoring") AND (("mobile application" OR "wearable device")',
@@ -107,7 +115,7 @@ def test_parser_pubmed(
             (41, 43),
         ),
         (
-            'digital health[tiab:~5]',
+            "digital health[tiab:~5]",
             QueryErrorCode.INVALID_PROXIMITY_USE,
             (14, 23),
         ),
@@ -141,12 +149,12 @@ def test_parser_pubmed(
             QueryErrorCode.SEARCH_FIELD_UNSUPPORTED,
             (9, 13),
         ),
-    ]
+    ],
 )
 def test_linter_pubmed(
-        query_str: str,
-        error: QueryErrorCode,
-        pos: Tuple,
+    query_str: str,
+    error: QueryErrorCode,
+    pos: Tuple,
 ) -> None:
     pubmed_parser = PubmedParser(query_str, "")
     try: