Skip to content

Commit 78dac69

Browse files
authored
Fixing crash on possessive query (#992)
1 parent 3f119b5 commit 78dac69

File tree

4 files changed

+64
-9
lines changed

4 files changed

+64
-9
lines changed

paperqa/agents/search.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from paperqa.docs import Docs
4747
from paperqa.settings import IndexSettings, get_settings
4848
from paperqa.types import VAR_MATCH_LOOKUP, DocDetails
49-
from paperqa.utils import ImpossibleParsingError, hexdigest
49+
from paperqa.utils import ImpossibleParsingError, clean_possessives, hexdigest
5050

5151
from .models import JSONType, SupportsPickle
5252

@@ -408,9 +408,8 @@ async def get_saved_object(
408408
return self.storage.read_from_string(content)
409409
return None
410410

411-
def clean_query(self, query: str) -> str:
412-
# SEE: https://regex101.com/r/DoLMoa/3
413-
return re.sub(r'[*\[\]:(){}~^><+"\\]', "", query)
411+
# Remove these characters, SEE: https://regex101.com/r/DoLMoa/3
412+
CLEAN_QUERY_REGEX: ClassVar[re.Pattern] = re.compile(r'[*\[\]:(){}~^><+"\\]')
414413

415414
async def query(
416415
self,
@@ -424,13 +423,17 @@ async def query(
424423
query_fields = list(field_subset or self.fields)
425424
searcher = await self.searcher
426425
index = await self.index
426+
cleaned_query = self.CLEAN_QUERY_REGEX.sub("", query)
427+
try:
428+
parsed_query = index.parse_query(cleaned_query, query_fields)
429+
except ValueError: # Rejected by tantivy
430+
# Retry with more aggressive cleaning
431+
parsed_query = index.parse_query(
432+
clean_possessives(cleaned_query), query_fields
433+
)
427434
addresses = [
428435
s[1]
429-
for s in searcher.search(
430-
index.parse_query(self.clean_query(query), query_fields),
431-
top_n,
432-
offset=offset,
433-
).hits
436+
for s in searcher.search(parsed_query, top_n, offset=offset).hits
434437
if s[0] > min_score
435438
]
436439
search_index_docs = [searcher.doc(address) for address in addresses]

paperqa/utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,3 +564,20 @@ def maybe_get_date(date: str | datetime | None) -> datetime | None:
564564
continue
565565
return None
566566
return date
567+
568+
569+
def clean_possessives(text: str) -> str:
    """Remove possessive apostrophes from text (e.g. "X's Y" to "Xs Y").

    The cleaning runs as a fixed sequence of regex passes, and the order
    matters because each pass works on the output of the previous one:

    1. An apostrophe directly after a word character is dropped when it forms
       a possessive 's (the "s" is kept) or when it is not followed by another
       word character, i.e. before whitespace, punctuation, or end of text.
    2. A detached 's (preceded by whitespace, or at the very start of the
       text) is removed entirely.
    3. A lone apostrophe surrounded by whitespace collapses to a single space.
    4. Any remaining apostrophe that does not follow a word character is
       removed together with the whitespace after it.

    Apostrophes inside a word that are not a possessive 's (e.g. "don't")
    are left untouched.

    Args:
        text: Text to clean.

    Returns:
        The text with possessive and stray apostrophes removed.

    Examples:
        >>> clean_possessives("Bate's name")
        'Bates name'
        >>> clean_possessives("Who is Bates'?")
        'Who is Bates?'
    """
    # Handle apostrophes after word characters (possessive 's or trailing
    # apostrophes). The (?!\w) lookahead also covers a trailing apostrophe
    # that is followed by punctuation rather than whitespace, such as
    # "Bates'?" or "Bates',", which would otherwise survive every pass.
    text = re.sub(
        r"(?<=\w)'(?:s\b|(?!\w))",
        lambda m: "s" if m.group().endswith("s") else "",
        text,
    )
    # Remove standalone 's patterns
    text = re.sub(r"\s+'s\b", "", text)
    text = re.sub(r"^'s\s*", "", text)
    # Remove standalone apostrophes
    text = re.sub(r"\s+'\s+", " ", text)
    return re.sub(r"(?<!\w)'\s*", "", text)

tests/test_agents.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,14 @@ async def test_get_directory_index(
105105
f" {[d.formatted_citation for d in results[0].docs.values()]}."
106106
)
107107

108+
# Check single quoted text in the query doesn't crash us
109+
results = await index.query(query="Who is 'Bates'")
110+
assert results
111+
112+
# Check possessive in the query doesn't crash us
113+
results = await index.query(query="What is Bates' first name")
114+
assert results
115+
108116
with subtests.test(msg="check-md-query"):
109117
results = await index.query(query="what is a gravity hill?", min_score=5)
110118
assert results

tests/test_paperqa.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from paperqa.readers import parse_pdf_to_pages, read_doc
5151
from paperqa.types import ChunkMetadata
5252
from paperqa.utils import (
53+
clean_possessives,
5354
encode_id,
5455
extract_score,
5556
maybe_get_date,
@@ -1980,3 +1981,29 @@ def test_maybe_get_date():
19801981
assert maybe_get_date(datetime(2023, 1, 1)) == datetime(2023, 1, 1)
19811982
assert maybe_get_date("foo") is None
19821983
assert maybe_get_date("") is None
1984+
1985+
1986+
# Raw input text mapped to the output expected from clean_possessives
_CLEAN_POSSESSIVES_CASES: dict[str, str] = {
    # Text without apostrophes passes through unchanged
    "name": "name",
    " name": " name",
    "name ": "name ",
    " ": " ",
    "Bates name": "Bates name",
    # Possessives attached to a word
    "Bate's name": "Bates name",
    "Bate's name Bate's name": "Bates name Bates name",
    "Bates' name": "Bates name",
    "X's Y": "Xs Y",
    # Standalone apostrophes
    "' name": "name",
    " ' name": " name",
    "name ' name": "name name",
    # Standalone 's
    "'s name": "name",
    " 's name": " name",
    # Trailing apostrophe after a single letter
    "s' name": "s name",
    "S' name": "S name",
    # Detached 's following a word
    "Bates 's name": "Bates name",
}


@pytest.mark.parametrize(
    ("raw_text", "cleaned_text"), list(_CLEAN_POSSESSIVES_CASES.items())
)
def test_clean_possessives(raw_text: str, cleaned_text: str) -> None:
    """Check that clean_possessives maps each raw input to its expected output."""
    actual = clean_possessives(raw_text)
    assert actual == cleaned_text

0 commit comments

Comments
 (0)