Skip to content

Commit 9044169

Browse files
author
Gerit Wagner
committed
test/refactor
1 parent 22ea72c commit 9044169

File tree

11 files changed

+230
-29
lines changed

11 files changed

+230
-29
lines changed

docs/source/platforms/ebsco.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,4 @@ Resources
4444

4545
- `EBSCO Search Help <https://connect.ebsco.com/s/article/Searching-EBSCO-Databases?language=en_US>`_
4646
- `EBSCO Search Fields Guide <https://connect.ebsco.com/s/article/Field-Codes-Searchable-EBSCOhost?language=en_US>`_
47+
- `EBSCO wildcard restrictions <https://connect.ebsco.com/s/article/Searching-with-Wildcards-in-EDS-and-EBSCOhost?language=en_US>`_

search_query/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ class QueryErrorCode(Enum):
425425
[PLATFORM.WOS, PLATFORM.PUBMED],
426426
"F3003",
427427
"invalid-list-reference",
428-
"Invalid list reference in list query (not found)",
428+
"Invalid list reference in list query",
429429
"",
430430
)
431431

search_query/ebsco/linter.py

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,15 @@ def check_invalid_near_within_operators(self) -> None:
122122

123123
for token in self.tokens:
124124
if token.type == TokenTypes.PROXIMITY_OPERATOR:
125+
digit = "x"
126+
m = re.search(r"/(\d+)", token.value)
127+
if m:
128+
digit = m.group(1)
129+
125130
if token.value.startswith("NEAR"):
126131
details = (
127132
f"Operator {token.value} "
128-
"is not supported by EBSCO. Must be Nx instead."
133+
f"is not supported by EBSCO. Must be N{digit} instead."
129134
)
130135
self.add_linter_message(
131136
QueryErrorCode.INVALID_PROXIMITY_USE,
@@ -136,7 +141,7 @@ def check_invalid_near_within_operators(self) -> None:
136141
if token.value.startswith("WITHIN"):
137142
details = (
138143
f"Operator {token.value} "
139-
"is not supported by EBSCO. Must be Wx instead."
144+
f"is not supported by EBSCO. Must be W{digit} instead."
140145
)
141146
self.add_linter_message(
142147
QueryErrorCode.INVALID_PROXIMITY_USE,
@@ -187,6 +192,8 @@ def check_invalid_token_sequences(self) -> None:
187192

188193
elif token_type == TokenTypes.LOGIC_OPERATOR:
189194
details = "Invalid operator position"
195+
if prev_type == TokenTypes.LOGIC_OPERATOR:
196+
details = "Cannot have two consecutive operators"
190197
positions = [token.position]
191198

192199
elif (
@@ -271,15 +278,72 @@ def check_invalid_near_within_operators_query(self, query: Query) -> None:
271278
for child in query.children:
272279
self.check_invalid_near_within_operators_query(child)
273280

281+
def check_unsupported_wildcards(self, query: Query) -> None:
282+
"""Check for unsupported characters in the search string."""
283+
284+
if query.is_term():
285+
val = query.value
286+
# Check for leading wildcard
287+
match = re.search(r"^(\*|\?|\#)", val)
288+
if match:
289+
position = (-1, -1)
290+
if query.position:
291+
position = (
292+
query.position[0] + match.start(),
293+
query.position[0] + match.end(),
294+
)
295+
self.add_linter_message(
296+
QueryErrorCode.WILDCARD_UNSUPPORTED,
297+
positions=[position],
298+
details="Wildcard not allowed at the beginning of a term.",
299+
)
300+
301+
# Count each wildcard
302+
char_count = sum(c not in "*?#" for c in val[:4])
303+
if re.search(r"^[^\*\?\#](\?|\#)", val) and char_count < 2:
304+
# ? or # appearing within the first four characters with fewer than two literals
305+
position = (-1, -1)
306+
if query.position:
307+
position = (query.position[0], query.position[0] + len(val))
308+
details = (
309+
"Invalid wildcard use: only one leading literal character found. "
310+
"When a wildcard appears within the first four characters, "
311+
"at least two literal (non-wildcard) characters "
312+
"must be present in that span."
313+
)
314+
self.add_linter_message(
315+
QueryErrorCode.WILDCARD_UNSUPPORTED,
316+
positions=[position],
317+
details=details,
318+
)
319+
320+
if re.search(r"^[^\*\?\#](\*)", val):
321+
position = (-1, -1)
322+
if query.position:
323+
position = (query.position[0], query.position[0] + len(val))
324+
details = (
325+
"Do not use * in the second position followed by "
326+
"additional letters. Use ? or # instead (e.g., f?tal)."
327+
)
328+
self.add_linter_message(
329+
QueryErrorCode.WILDCARD_UNSUPPORTED,
330+
positions=[position],
331+
details=details,
332+
)
333+
334+
for child in query.children:
335+
self.check_unsupported_wildcards(child)
336+
274337
def validate_query_tree(self, query: Query) -> None:
275338
"""
276339
Validate the query tree.
277340
This method is called after the query tree has been built.
278341
"""
279342

280343
self.check_unbalanced_quotes_in_terms(query)
281-
self.check_invalid_characters_in_search_term_query(query, "@%$^~\\<>{}[]#")
344+
self.check_invalid_characters_in_search_term_query(query, "@%$^~\\<>{}[]")
282345
self.check_unsupported_search_fields_in_query(query)
346+
self.check_unsupported_wildcards(query)
283347

284348
term_field_query = self.get_query_with_fields_at_terms(query)
285349
self._check_date_filters_in_subquery(term_field_query)

search_query/ebsco/parser.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class EBSCOParser(QueryStringParser):
2525
r"(N|W)\d+|(NEAR|WITHIN)/\d+", flags=re.IGNORECASE
2626
)
2727
SEARCH_FIELD_REGEX = re.compile(r"\b([A-Z]{2})\b")
28-
SEARCH_TERM_REGEX = re.compile(r"\"[^\"]*\"|\b(?!S\d+\b)[^()\s]+[\*\+\?]?")
28+
SEARCH_TERM_REGEX = re.compile(r"\"[^\"]*\"|\*?\b[^()\s]+")
2929

3030
OPERATOR_REGEX = re.compile(
3131
"|".join([LOGIC_OPERATOR_REGEX.pattern, PROXIMITY_OPERATOR_REGEX.pattern])
@@ -120,6 +120,25 @@ def _extract_proximity_distance(self, token: Token) -> int:
120120
token.value = operator
121121
return distance
122122

123+
def fix_ambiguous_tokens(self) -> None:
124+
"""Fix ambiguous tokens that could be misinterpreted as a search field."""
125+
126+
def is_potential_term(token_str: str) -> bool:
127+
return bool(re.fullmatch(r"[A-Z]{2,}", token_str))
128+
129+
# Field token followed by term which is misclassified as a field token
130+
for i in range(len(self.tokens) - 1):
131+
current = self.tokens[i]
132+
next_token = self.tokens[i + 1]
133+
134+
if (
135+
current.type == TokenTypes.FIELD
136+
and next_token.type == TokenTypes.FIELD
137+
and is_potential_term(next_token.value)
138+
):
139+
# Reclassify the second FIELD token as a SEARCH_TERM
140+
next_token.type = TokenTypes.SEARCH_TERM
141+
123142
def tokenize(self) -> None:
124143
"""Tokenize the query_str."""
125144

@@ -154,6 +173,7 @@ def tokenize(self) -> None:
154173

155174
# Combine subsequent search_terms in case of no quotation marks
156175
self.combine_subsequent_tokens()
176+
self.fix_ambiguous_tokens()
157177

158178
def append_node(
159179
self,

search_query/linter_base.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -536,9 +536,9 @@ def get_precedence(self, token: str) -> int:
536536

537537
def _get_unequal_precedence_operators(
538538
self, tokens: list[Token]
539-
) -> typing.List[tuple[int, int]]:
539+
) -> typing.List[Token]:
540540
"""Get positions of unequal precedence operators."""
541-
unequal_precedence_operators = []
541+
unequal_precedence_operators: typing.List[Token] = []
542542
previous_value = -1
543543
level = 0
544544
prev_token = None
@@ -554,8 +554,8 @@ def _get_unequal_precedence_operators(
554554
continue
555555
if token.type in [TokenTypes.LOGIC_OPERATOR, TokenTypes.PROXIMITY_OPERATOR]:
556556
value = self.get_precedence(token.value.upper())
557-
if value != previous_value and previous_value != -1:
558-
if not unequal_precedence_operators:
557+
if previous_value not in [value, -1]:
558+
if not unequal_precedence_operators and prev_token:
559559
unequal_precedence_operators.append(prev_token)
560560
unequal_precedence_operators.append(token)
561561
previous_value = value
@@ -578,15 +578,18 @@ def _print_unequal_precedence_warning(self, index: int) -> None:
578578
for idx, (op, prec) in enumerate(precedence_list):
579579
if idx == 0:
580580
precedence_lines.append(
581-
f"Operator {Colors.GREEN}{op}{Colors.END} is evaluated first because it has the highest precedence level ({prec})."
581+
f"Operator {Colors.GREEN}{op}{Colors.END} is evaluated first "
582+
f"because it has the highest precedence level ({prec})."
582583
)
583584
elif idx == len(precedence_list) - 1:
584585
precedence_lines.append(
585-
f"Operator {Colors.ORANGE}{op}{Colors.END} is evaluated last because it has the lowest precedence level ({prec})."
586+
f"Operator {Colors.ORANGE}{op}{Colors.END} is evaluated last "
587+
f"because it has the lowest precedence level ({prec})."
586588
)
587589
else:
588590
precedence_lines.append(
589-
f"Operator {Colors.ORANGE}{op}{Colors.END} has precedence level {prec}."
591+
f"Operator {Colors.ORANGE}{op}{Colors.END} "
592+
f"has precedence level {prec}."
590593
)
591594

592595
precedence_info = "\n".join(precedence_lines)
@@ -597,7 +600,8 @@ def _print_unequal_precedence_warning(self, index: int) -> None:
597600
"This can lead to unexpected interpretations of the query.\n\n"
598601
"Specifically:\n"
599602
f"{precedence_info}\n\n"
600-
"To fix this, search-query adds artificial parentheses around operator groups with higher precedence.\n\n"
603+
"To fix this, search-query adds artificial parentheses around "
604+
"operator groups with higher precedence.\n\n"
601605
)
602606

603607
self.add_linter_message(

search_query/pubmed/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
"[dcom]": r"\[dcom\]|\[completion date\]",
3434
"[cois]": r"\[cois\]|\[conflict of interest statement\]",
3535
"[cn]": r"\[cn\]|\[corporate author\]",
36-
"[crdt]": r"\[crdt\]|\[create date\]",
36+
"[crdt]": r"\[crdt\]|\[create date\]|\[date - create\]",
3737
"[rn]": r"\[rn\]|\[ec/rn number\]",
3838
"[ed]": r"\[ed\]|\[editor\]",
3939
"[edat]": r"\[edat\]|\[entry date\]",
@@ -62,7 +62,7 @@
6262
"[ps]": r"\[ps\]|\[personal name as subject\]",
6363
"[pa]": r"\[pa\]|\[pharmacological action\]",
6464
"[pl]": r"\[pl\]|\[place of publication\]",
65-
"[dp]": r"\[dp\]|\[publication date\]|\[pdate\]",
65+
"[dp]": r"\[dp\]|\[publication date\]|\[pdat\]",
6666
"[pt]": r"\[pt\]|\[publication type\]",
6767
"[pubn]": r"\[pubn\]|\[publisher\]",
6868
"[si]": r"\[si\]|\[secondary source id\]",

search_query/pubmed/linter.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,11 @@ def check_invalid_token_sequences(self) -> None:
179179
)
180180
]
181181

182-
elif token_type and prev_type and prev_type != TokenTypes.LOGIC_OPERATOR:
182+
elif (
183+
token_type
184+
and prev_type
185+
and prev_type not in [TokenTypes.LOGIC_OPERATOR]
186+
):
183187
details = "Missing operator"
184188
positions = [
185189
(

search_query/wos/linter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def check_invalid_token_sequences(self) -> None:
294294
if token.type == TokenTypes.FIELD and next_token.type == TokenTypes.FIELD:
295295
self.add_linter_message(
296296
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
297-
positions=[next_token.position],
297+
positions=[(token.position[0], next_token.position[1])],
298298
)
299299
continue
300300

@@ -303,7 +303,7 @@ def check_invalid_token_sequences(self) -> None:
303303
if next_token.type not in allowed_next_types:
304304
self.add_linter_message(
305305
QueryErrorCode.INVALID_TOKEN_SEQUENCE,
306-
positions=[next_token.position],
306+
positions=[(token.position[0], next_token.position[1])],
307307
)
308308

309309
# Check the last token

test/test_ebsco.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,28 @@
104104
),
105105
],
106106
),
107+
(
108+
"TI RN OR AB RN",
109+
[
110+
Token(value="TI", type=TokenTypes.FIELD, position=(0, 2)),
111+
Token(value="RN", type=TokenTypes.SEARCH_TERM, position=(3, 5)),
112+
Token(value="OR", type=TokenTypes.LOGIC_OPERATOR, position=(6, 8)),
113+
Token(value="AB", type=TokenTypes.FIELD, position=(9, 11)),
114+
Token(value="RN", type=TokenTypes.SEARCH_TERM, position=(12, 14)),
115+
],
116+
),
117+
# (
118+
# '(DE "Persuasive Communication) OR (DE "Collaboration")',
119+
# [
120+
# ]
121+
# )
107122
],
108123
)
109124
def test_tokenization(
110125
query_string: str, expected_tokens: List[Tuple[str, str, Tuple[int, int]]]
111126
) -> None:
112127
"""Test EBSCO parser tokenization."""
128+
print(query_string)
113129
parser = EBSCOParser(query_string, search_field_general="")
114130
parser.tokenize()
115131

@@ -291,6 +307,82 @@ def test_invalid_token_sequences(
291307
'(ZY "sudan" OR ZY "south sudan") AND TI "context of vegetarians"',
292308
[],
293309
),
310+
(
311+
"bias OR OR politics",
312+
[
313+
{
314+
"code": "F1004",
315+
"label": "invalid-token-sequence",
316+
"message": "The sequence of tokens is invalid.",
317+
"is_fatal": True,
318+
"position": [(8, 10)],
319+
"details": "Cannot have two consecutive operators",
320+
}
321+
],
322+
),
323+
(
324+
"*ology",
325+
[
326+
{
327+
"code": "F2001",
328+
"label": "wildcard-unsupported",
329+
"message": "Unsupported wildcard in search string.",
330+
"is_fatal": True,
331+
"position": [(0, 1)],
332+
"details": "Wildcard not allowed at the beginning of a term.",
333+
}
334+
],
335+
),
336+
(
337+
"f??*",
338+
[
339+
{
340+
"code": "F2001",
341+
"label": "wildcard-unsupported",
342+
"message": "Unsupported wildcard in search string.",
343+
"is_fatal": True,
344+
"position": [(0, 4)],
345+
"details": "Invalid wildcard use: only one leading literal character found. When a wildcard appears within the first four characters, at least two literal (non-wildcard) characters must be present in that span.",
346+
}
347+
],
348+
),
349+
(
350+
"f*tal",
351+
[
352+
{
353+
"code": "F2001",
354+
"label": "wildcard-unsupported",
355+
"message": "Unsupported wildcard in search string.",
356+
"is_fatal": True,
357+
"position": [(0, 5)],
358+
"details": "Do not use * in the second position followed by additional letters. Use ? or # instead (e.g., f?tal).",
359+
}
360+
],
361+
),
362+
(
363+
"colo#r",
364+
[],
365+
),
366+
(
367+
"pediatric*",
368+
[],
369+
),
370+
(
371+
"tumor*",
372+
[],
373+
),
374+
(
375+
"education*",
376+
[],
377+
),
378+
(
379+
"f#tal",
380+
[],
381+
),
382+
(
383+
"f?tal",
384+
[],
385+
),
294386
],
295387
)
296388
def test_linter(query_string: str, messages: list) -> None:
@@ -365,7 +457,7 @@ def test_linter(query_string: str, messages: list) -> None:
365457
"message": "Invalid use of the proximity operator",
366458
"is_fatal": False,
367459
"position": [(8, 14)],
368-
"details": "Operator NEAR/2 is not supported by EBSCO. Must be Nx instead.",
460+
"details": "Operator NEAR/2 is not supported by EBSCO. Must be N2 instead.",
369461
}
370462
],
371463
),
@@ -380,7 +472,7 @@ def test_linter(query_string: str, messages: list) -> None:
380472
"message": "Invalid use of the proximity operator",
381473
"is_fatal": False,
382474
"position": [(8, 16)],
383-
"details": "Operator WITHIN/2 is not supported by EBSCO. Must be Wx instead.",
475+
"details": "Operator WITHIN/2 is not supported by EBSCO. Must be W2 instead.",
384476
}
385477
],
386478
),

0 commit comments

Comments (0)