Skip to content

Commit 6a8e69e

Browse files
author
Gerit Wagner
committed
revise wos parser
1 parent 430e4a0 commit 6a8e69e

File tree

2 files changed

+37
-28
lines changed

2 files changed

+37
-28
lines changed

search_query/wos/parser.py

Lines changed: 20 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -26,33 +26,36 @@
2626
class WOSParser(QueryStringParser):
2727
"""Parser for Web-of-Science queries."""
2828

29-
TERM_REGEX = re.compile(
30-
r'\*?[\w\-/\.\!\*,&\\]+(?:[\*\$\?][\w\-/\.\!\*,&\\]*)*|"[^"]+"'
31-
)
29+
# 1) structured parts
30+
FIELD_REGEX = re.compile(r"\b\w{2,3}=")
3231
LOGIC_OPERATOR_REGEX = re.compile(r"\b(AND|OR|NOT)\b", flags=re.IGNORECASE)
3332
PROXIMITY_OPERATOR_REGEX = re.compile(
3433
r"\b(NEAR/\d{1,2}|NEAR)\b", flags=re.IGNORECASE
3534
)
36-
FIELD_REGEX = re.compile(r"\b\w{2}=|\b\w{3}=")
3735
PARENTHESIS_REGEX = re.compile(r"[\(\)]")
38-
fieldS_REGEX = re.compile(r"\b(?!and\b)[a-zA-Z]+(?:\s(?!and\b)[a-zA-Z]+)*")
3936

40-
OPERATOR_REGEX = re.compile(
41-
"|".join([LOGIC_OPERATOR_REGEX.pattern, PROXIMITY_OPERATOR_REGEX.pattern])
42-
)
37+
# 2) quoted term — this matches only if quotes are balanced.
38+
QUOTED_TERM_REGEX = re.compile(r"\".*?\"")
39+
40+
# 3) fallback term:
41+
# make this permissive enough to also swallow a stray `"`,
42+
# but still exclude structural WOS characters (space, parens, equals).
43+
PERMISSIVE_TERM_REGEX = re.compile(r"[^\s\(\)=]+")
4344

44-
# Combine all regex patterns into a single pattern
45+
# build the combined pattern:
46+
# fields → logic/proximity → parens → quoted term → term
4547
pattern = re.compile(
46-
r"|".join(
48+
"|".join(
4749
[
4850
FIELD_REGEX.pattern,
4951
LOGIC_OPERATOR_REGEX.pattern,
5052
PROXIMITY_OPERATOR_REGEX.pattern,
51-
TERM_REGEX.pattern,
5253
PARENTHESIS_REGEX.pattern,
53-
# self.fieldS_REGEX.pattern,
54+
QUOTED_TERM_REGEX.pattern,
55+
PERMISSIVE_TERM_REGEX.pattern,
5456
]
55-
)
57+
),
58+
flags=re.IGNORECASE,
5659
)
5760

5861
# pylint: disable=too-many-arguments
@@ -102,7 +105,10 @@ def tokenize(self) -> None:
102105
token_type = TokenTypes.PROXIMITY_OPERATOR
103106
elif self.FIELD_REGEX.fullmatch(value):
104107
token_type = TokenTypes.FIELD
105-
elif self.TERM_REGEX.fullmatch(value):
108+
elif self.QUOTED_TERM_REGEX.fullmatch(value):
109+
# fully quoted term
110+
token_type = TokenTypes.TERM
111+
elif self.PERMISSIVE_TERM_REGEX.fullmatch(value):
106112
token_type = TokenTypes.TERM
107113
else: # pragma: no cover
108114
token_type = TokenTypes.UNKNOWN
@@ -116,9 +122,6 @@ def tokenize(self) -> None:
116122

117123
def combine_subsequent_terms(self) -> None:
118124
"""Combine subsequent terms in the list of tokens."""
119-
# Combine subsequent terms (without quotes)
120-
# This would be more challenging in the regex
121-
# Changed the implementation to combine multiple terms
122125
combined_tokens: typing.List[Token] = []
123126
i = 0
124127
j = 0

test/wos/test_wos_parser_v_1.py

Lines changed: 17 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -83,6 +83,7 @@
8383
Token(value="AND", type=TokenTypes.LOGIC_OPERATOR, position=(20, 23)),
8484
Token(value="test", type=TokenTypes.TERM, position=(24, 28)),
8585
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(28, 29)),
86+
Token(value='"', type=TokenTypes.TERM, position=(29, 30)),
8687
],
8788
),
8889
(
@@ -93,7 +94,7 @@
9394
Token(value="(", type=TokenTypes.PARENTHESIS_OPEN, position=(4, 5)),
9495
Token(value="platform*", type=TokenTypes.TERM, position=(5, 14)),
9596
Token(value="OR", type=TokenTypes.LOGIC_OPERATOR, position=(15, 17)),
96-
Token(value="digital work", type=TokenTypes.TERM, position=(19, 31)),
97+
Token(value='"digital work', type=TokenTypes.TERM, position=(18, 31)),
9798
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(31, 32)),
9899
Token(value=")", type=TokenTypes.PARENTHESIS_CLOSED, position=(32, 33)),
99100
],
@@ -237,6 +238,19 @@ def test_tokenization(query_str: str, expected_tokens: list) -> None:
237238
},
238239
],
239240
),
241+
(
242+
'(TI=(platform* OR "digital work))',
243+
[
244+
{
245+
"code": "PARSE_0003",
246+
"label": "unbalanced-quotes",
247+
"message": "Quotes are unbalanced in the query",
248+
"is_fatal": True,
249+
"position": [(18, 31)],
250+
"details": "Unmatched opening quote",
251+
}
252+
],
253+
),
240254
(
241255
"TI=term1 au= ti=",
242256
[
@@ -596,14 +610,6 @@ def test_tokenization(query_str: str, expected_tokens: list) -> None:
596610
"position": [(17, 21)],
597611
"details": "WOS fields must be before search terms and without brackets, e.g. AB=robot or TI=monitor. '[ti]' is invalid.",
598612
},
599-
{
600-
"code": "PARSE_0001",
601-
"label": "tokenizing-failed",
602-
"message": "Fatal error during tokenization",
603-
"is_fatal": True,
604-
"position": [(9, 21)],
605-
"details": "Unparsed segment: 'ehealth[ti]'",
606-
},
607613
],
608614
),
609615
(
@@ -614,8 +620,8 @@ def test_tokenization(query_str: str, expected_tokens: list) -> None:
614620
"label": "unbalanced-quotes",
615621
"message": "Quotes are unbalanced in the query",
616622
"is_fatal": True,
617-
"position": [(18, 19)],
618-
"details": "",
623+
"position": [(18, 31)],
624+
"details": "Unmatched opening quote",
619625
}
620626
],
621627
),

0 commit comments

Comments (0)