Skip to content

Commit 6a8e69e

Browse files
author
Gerit Wagner
committed
revise wos parser
1 parent 430e4a0 commit 6a8e69e

File tree

2 files changed

+37
-28
lines changed

2 files changed

+37
-28
lines changed

search_query/wos/parser.py

Lines changed: 20 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -26,33 +26,36 @@
2626
class WOSParser(QueryStringParser):
2727
"""Parser for Web-of-Science queries."""
2828

29-
TERM_REGEX = re.compile(
30-
r'\*?[\w\-/\.\!\*,&\\]+(?:[\*\$\?][\w\-/\.\!\*,&\\]*)*|"[^"]+"'
31-
)
29+
# 1) structured parts
30+
FIELD_REGEX = re.compile(r"\b\w{2,3}=")
3231
LOGIC_OPERATOR_REGEX = re.compile(r"\b(AND|OR|NOT)\b", flags=re.IGNORECASE)
3332
PROXIMITY_OPERATOR_REGEX = re.compile(
3433
r"\b(NEAR/\d{1,2}|NEAR)\b", flags=re.IGNORECASE
3534
)
36-
FIELD_REGEX = re.compile(r"\b\w{2}=|\b\w{3}=")
3735
PARENTHESIS_REGEX = re.compile(r"[\(\)]")
38-
fieldS_REGEX = re.compile(r"\b(?!and\b)[a-zA-Z]+(?:\s(?!and\b)[a-zA-Z]+)*")
3936

40-
OPERATOR_REGEX = re.compile(
41-
"|".join([LOGIC_OPERATOR_REGEX.pattern, PROXIMITY_OPERATOR_REGEX.pattern])
42-
)
37+
# 2) quoted term — this matches only if quotes are balanced.
38+
QUOTED_TERM_REGEX = re.compile(r"\".*?\"")
39+
40+
# 3) fallback term:
41+
# make this permissive enough to also swallow a stray `"`,
42+
# but still exclude structural WOS characters (space, parens, equals).
43+
PERMISSIVE_TERM_REGEX = re.compile(r"[^\s\(\)=]+")
4344

44-
# Combine all regex patterns into a single pattern
45+
# build the combined pattern:
46+
# fields → logic/proximity → parens → quoted term → term
4547
pattern = re.compile(
46-
r"|".join(
48+
"|".join(
4749
[
4850
FIELD_REGEX.pattern,
4951
LOGIC_OPERATOR_REGEX.pattern,
5052
PROXIMITY_OPERATOR_REGEX.pattern,
51-
TERM_REGEX.pattern,
5253
PARENTHESIS_REGEX.pattern,
53-
# self.fieldS_REGEX.pattern,
54+
QUOTED_TERM_REGEX.pattern,
55+
PERMISSIVE_TERM_REGEX.pattern,
5456
]
55-
)
57+
),
58+
flags=re.IGNORECASE,
5659
)
5760

5861
# pylint: disable=too-many-arguments
@@ -102,7 +105,10 @@ def tokenize(self) -> None:
102105
token_type = TokenTypes.PROXIMITY_OPERATOR
103106
elif self.FIELD_REGEX.fullmatch(value):
104107
token_type = TokenTypes.FIELD
105-
elif self.TERM_REGEX.fullmatch(value):
108+
elif self.QUOTED_TERM_REGEX.fullmatch(value):
109+
# fully quoted term
110+
token_type = TokenTypes.TERM
111+
elif self.PERMISSIVE_TERM_REGEX.fullmatch(value):
106112
token_type = TokenTypes.TERM
107113
else: # pragma: no cover
108114
token_type = TokenTypes.UNKNOWN
@@ -116,9 +122,6 @@ def tokenize(self) -> None:
116122

117123
def combine_subsequent_terms(self) -> None:
118124
"""Combine subsequent terms in the list of tokens."""
119-
# Combine subsequent terms (without quotes)
120-
# This would be more challenging in the regex
121-
# Changed the implementation to combine multiple terms
122125
combined_tokens: typing.List[Token] = []
123126
i = 0
124127
j = 0

test/wos/test_wos_parser_v_1.py

Lines changed: 17 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -83,6 +83,7 @@
8383
Token(value="AND", type=TokenTypes.LOGIC_OPERATOR, position=(20, 23)),
8484
Token(value="test", type=TokenTypes.TERM, position=(24, 28)),
8585
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(28, 29)),
86+
Token(value='"', type=TokenTypes.TERM, position=(29, 30)),
8687
],
8788
),
8889
(
@@ -93,7 +94,7 @@
9394
Token(value="(", type=TokenTypes.PARENTHESIS_OPEN, position=(4, 5)),
9495
Token(value="platform*", type=TokenTypes.TERM, position=(5, 14)),
9596
Token(value="OR", type=TokenTypes.LOGIC_OPERATOR, position=(15, 17)),
96-
Token(value="digital work", type=TokenTypes.TERM, position=(19, 31)),
97+
Token(value='"digital work', type=TokenTypes.TERM, position=(18, 31)),
9798
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(31, 32)),
9899
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(32, 33)),
99100
],
@@ -237,6 +238,19 @@ def test_tokenization(query_str: str, expected_tokens: list) -> None:
237238
},
238239
],
239240
),
241+
(
242+
'(TI=(platform* OR "digital work))',
243+
[
244+
{
245+
"code": "PARSE_0003",
246+
"label": "unbalanced-quotes",
247+
"message": "Quotes are unbalanced in the query",
248+
"is_fatal": True,
249+
"position": [(18, 31)],
250+
"details": "Unmatched opening quote",
251+
}
252+
],
253+
),
240254
(
241255
"TI=term1 au= ti=",
242256
[
@@ -596,14 +610,6 @@ def test_tokenization(query_str: str, expected_tokens: list) -> None:
596610
"position": [(17, 21)],
597611
"details": "WOS fields must be before search terms and without brackets, e.g. AB=robot or TI=monitor. '[ti]' is invalid.",
598612
},
599-
{
600-
"code": "PARSE_0001",
601-
"label": "tokenizing-failed",
602-
"message": "Fatal error during tokenization",
603-
"is_fatal": True,
604-
"position": [(9, 21)],
605-
"details": "Unparsed segment: 'ehealth[ti]'",
606-
},
607613
],
608614
),
609615
(
@@ -614,8 +620,8 @@ def test_tokenization(query_str: str, expected_tokens: list) -> None:
614620
"label": "unbalanced-quotes",
615621
"message": "Quotes are unbalanced in the query",
616622
"is_fatal": True,
617-
"position": [(18, 19)],
618-
"details": "",
623+
"position": [(18, 31)],
624+
"details": "Unmatched opening quote",
619625
}
620626
],
621627
),

0 commit comments

Comments (0)