Commit e36bbec

Incorporated a Doc.content_hash to ensure doc_id uniqueness, with tests
1 parent 2776815 commit e36bbec

File tree: 7 files changed, +117 -32 lines changed


src/paperqa/clients/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -233,6 +233,8 @@ async def upgrade_doc_to_doc_details(self, doc: Doc, **kwargs) -> DocDetails:
                 doc_details.key = doc.docname
             if "citation" in doc.fields_to_overwrite_from_metadata:
                 doc_details.citation = doc.citation
+            if "content_hash" in doc.fields_to_overwrite_from_metadata:
+                doc_details.content_hash = doc.content_hash
             return provided_doc_details + doc_details

         # if we can't get metadata, just return the doc, but don't overwrite any fields
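Because "content_hash" ships in the default overwrite set (added in the types.py hunk below with the note that metadata providers won't give this), this gate normally copies the locally computed hash onto the metadata-derived DocDetails. A hedged sketch of that flow, with illustrative field values:

    from paperqa.types import Doc

    doc = Doc(
        docname="smith2023",
        citation="Smith, 2023, Accessed now",
        dockey="abc123",
        content_hash="41f786fcc56d27ff0c1507153fae3774",
    )
    # "content_hash" is in fields_to_overwrite_from_metadata by default, so
    # upgrade_doc_to_doc_details() carries the local hash onto the upgraded
    # DocDetails instead of expecting Crossref/Semantic Scholar to supply one.
    assert "content_hash" in doc.fields_to_overwrite_from_metadata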

src/paperqa/docs.py

Lines changed: 6 additions & 3 deletions
@@ -265,18 +265,20 @@ async def aadd(  # noqa: PLR0912
         """Add a document to the collection."""
         all_settings = get_settings(settings)
         parse_config = all_settings.parsing
+        content_hash = md5sum(path)
         dockey_is_content_hash = False
         if dockey is None:
-            # md5 sum of file contents (not path!)
-            dockey = md5sum(path)
+            dockey = content_hash
             dockey_is_content_hash = True
         if llm_model is None:
             llm_model = all_settings.get_llm()
         if citation is None:
             # Peek first chunk
             texts = await read_doc(
                 path,
-                Doc(docname="", citation="", dockey=dockey),  # Fake doc
+                Doc(  # Fake doc
+                    docname="", citation="", dockey=dockey, content_hash=content_hash
+                ),
                 chunk_chars=parse_config.chunk_size,
                 overlap=parse_config.overlap,
                 page_size_limit=parse_config.page_size_limit,

@@ -307,6 +309,7 @@ async def aadd(  # noqa: PLR0912
             ),
             citation=citation,
             dockey=dockey,
+            content_hash=content_hash,
         )

         # try to extract DOI / title from the citation
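The removed inline comment ("md5 sum of file contents (not path!)") still describes the contract here: `md5sum` hashes the file's bytes, so identical content at different paths collides, which is what lets the content hash double as a deduplicating default dockey. A minimal sketch of that contract (not paperqa's actual, possibly buffered, implementation):

    import hashlib
    from pathlib import Path

    def md5sum_sketch(path: str | Path) -> str:
        # Hash the file's bytes, not its path: a renamed or copied file
        # yields the same digest, so aadd() treats it as a duplicate.
        return hashlib.md5(Path(path).read_bytes()).hexdigest()  # noqa: S324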

src/paperqa/llms.py

Lines changed: 2 additions & 1 deletion
@@ -30,7 +30,7 @@
 )
 from typing_extensions import override

-from paperqa.types import Doc, Text
+from paperqa.types import AUTOPOPULATE_VALUE, Doc, Text

 if TYPE_CHECKING:
     from qdrant_client.http.models import Record

@@ -495,6 +495,7 @@ async def fetch_batch_with_semaphore(offset: int) -> None:
                 docname=doc_data.get("docname", ""),
                 citation=doc_data.get("citation", ""),
                 dockey=doc_data["dockey"],
+                content_hash=doc_data.get("content_hash", AUTOPOPULATE_VALUE),
             )
             docs.docnames.add(doc_data.get("docname", ""))
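Records persisted before this commit carry no "content_hash" key, so deserialization defaults to the sentinel rather than None: the hash is "unknown, perhaps computable later", not "known to be unavailable". A hedged sketch with an illustrative pre-existing record:

    from paperqa.types import AUTOPOPULATE_VALUE, Doc

    old_record = {"docname": "smith2023", "citation": "Smith, 2023", "dockey": "abc123"}
    doc = Doc(
        docname=old_record.get("docname", ""),
        citation=old_record.get("citation", ""),
        dockey=old_record["dockey"],
        # "" (the sentinel) keeps later autopopulation possible, whereas
        # None would assert the hash is definitively absent.
        content_hash=old_record.get("content_hash", AUTOPOPULATE_VALUE),
    )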

src/paperqa/types.py

Lines changed: 44 additions & 12 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import ast
+import contextlib
 import csv
 import hashlib
 import json

@@ -37,11 +38,13 @@

 from paperqa.utils import (
     bytes_to_string,
+    compute_unique_doc_id,
     create_bibtex_key,
     encode_id,
     format_bibtex,
     get_citation_ids,
     maybe_get_date,
+    md5sum,
     string_to_bytes,
 )
 from paperqa.version import __version__ as pqa_version

@@ -61,7 +64,10 @@
     "docname",
     "dockey",
     "citation",
+    "content_hash",  # Metadata providers won't give this
 }
+# Sentinel to autopopulate a field within model_validator
+AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design


 class Doc(Embeddable):

@@ -70,6 +76,13 @@ class Doc(Embeddable):
     docname: str
     dockey: DocKey
     citation: str
+    content_hash: str | None = Field(
+        default=AUTOPOPULATE_VALUE,
+        description=(
+            "Optional hash of the document's contents (to reiterate, not a file path to"
+            " the document, but the document's contents itself)."
+        ),
+    )
     # Sort the serialization to minimize the diff of serialized objects
     fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
         Field(

@@ -171,10 +184,6 @@ def __hash__(self) -> int:
         return hash((self.name, self.text))


-# Sentinel to autopopulate a field within model_validator
-AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
-
-
 class Context(BaseModel):
     """A class to hold the context of a question."""

@@ -660,8 +669,8 @@ class DocDetails(Doc):
     doc_id: str | None = Field(
         default=None,
         description=(
-            "Unique ID for this document. Simple ways to acquire one include"
-            " hashing the DOI or a stringifying a UUID."
+            "Unique ID for this document. A simple and robust way to acquire one is"
+            " hashing the paper's content hash concatenated with the lowercased DOI."
         ),
     )
     file_location: str | os.PathLike | None = Field(

@@ -720,9 +729,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
                 doi = doi.replace(url_prefix_to_remove, "")
             data["doi"] = doi.lower()
             if not data.get("doc_id"):  # keep user defined doc_ids
-                data["doc_id"] = encode_id(doi.lower())
+                data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
         elif not data.get("doc_id"):  # keep user defined doc_ids
-            data["doc_id"] = encode_id(uuid4())
+            data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))

         if "dockey" in data.get(
             "fields_to_overwrite_from_metadata",

@@ -933,6 +942,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
             data["citation"] = data.get("title") or CITATION_FALLBACK_DATA["title"]
         return data

+    @classmethod
+    def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
+        if (  # Check for missing or autopopulate value, but preserve `None`
+            data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
+        ):
+            data["content_hash"] = None  # Assume we don't have it
+            if data.get("file_location"):  # Try to update it
+                with contextlib.suppress(FileNotFoundError):
+                    data["content_hash"] = md5sum(data["file_location"])
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:

@@ -952,6 +972,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
                 data[possibly_str_field], str
             ):
                 data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
+        data = cls.populate_content_hash(data)
         data = cls.lowercase_doi_and_populate_doc_id(data)
         data = cls.remove_invalid_authors(data)
         data = cls.misc_string_cleaning(data)

@@ -1112,6 +1133,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     )
                 else:
                     merged_data[field] = max(self_value, other_value)
+            elif field == "content_hash" and (  # noqa: PLR0916
+                # Hashes are both present but differ
+                (self_value and other_value and self_value != other_value)
+                # One hash is explicitly disabled (not autopopulated)
+                or (self_value is None or other_value is None)
+            ):
+                # We don't know which to pick, so just discard the value
+                merged_data[field] = None

             else:
                 # Prefer non-null values, default preference for 'other' object.

@@ -1126,10 +1155,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     else self_value
                 )

-        # Recalculate doc_id if doi has changed
-        if merged_data["doi"] != self.doi:
-            merged_data["doc_id"] = (
-                encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None
+        if (
+            merged_data["doi"] != self.doi
+            or merged_data["content_hash"] != self.content_hash
+        ):
+            # Recalculate doc_id if doi or content hash has changed
+            merged_data["doc_id"] = compute_unique_doc_id(
+                merged_data["doi"], merged_data.get("content_hash")
             )

         # Create and return new DocDetails instance
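Taken together, these hunks make `content_hash` effectively tri-state: the "" sentinel (autopopulate me), a real hash string, or None (knowingly absent). A hedged sketch of `populate_content_hash`'s contract, calling the classmethod directly on raw data dicts (the file path is illustrative):

    from paperqa.types import DocDetails

    # Missing key (or the "" sentinel): reset to None, then try hashing
    # file_location, with FileNotFoundError suppressed so a stale path
    # degrades to "no hash" instead of raising.
    out = DocDetails.populate_content_hash({"file_location": "no/such/file.pdf"})
    assert out["content_hash"] is None

    # An explicit None survives (None != ""), and a real hash is never overwritten:
    assert DocDetails.populate_content_hash({"content_hash": None})["content_hash"] is None
    assert DocDetails.populate_content_hash({"content_hash": "abc"})["content_hash"] == "abc"

Merging in `__add__` follows the same caution: two conflicting hashes, or one explicitly-None hash, merge to None rather than guessing, and the doc_id is then recomputed from whatever survives.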

src/paperqa/utils.py

Lines changed: 10 additions & 2 deletions
@@ -17,7 +17,7 @@
 from http import HTTPStatus
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, TypeVar
-from uuid import UUID
+from uuid import UUID, uuid4

 import aiohttp
 import httpx

@@ -104,7 +104,7 @@ def strings_similarity(s1: str, s2: str, case_insensitive: bool = True) -> float:

 def hexdigest(data: str | bytes) -> str:
     if isinstance(data, str):
-        return hashlib.md5(data.encode("utf-8")).hexdigest()  # noqa: S324
+        data = data.encode("utf-8")
     return hashlib.md5(data).hexdigest()  # noqa: S324


@@ -217,6 +217,14 @@ def encode_id(value: str | bytes | UUID, maxsize: int | None = 16) -> str:
     return hashlib.md5(value).hexdigest()[:maxsize]  # noqa: S324


+def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
+    if doi:
+        value_to_encode: str = doi.lower() + (content_hash or "")
+    else:
+        value_to_encode = content_hash or str(uuid4())
+    return encode_id(value_to_encode)
+
+
 def get_year(ts: datetime | None = None) -> str:
     """Get the year from the input datetime, otherwise using the current datetime."""
     if ts is None:
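A quick tour of the new ID scheme; every assertion below follows from the function as written (digest values elided):

    from paperqa.utils import compute_unique_doc_id

    # Same DOI but different content (e.g. main text vs. supplemental
    # information) now yields distinct doc_ids -- the point of this commit.
    assert compute_unique_doc_id("10.1234/flag", "aaa") != compute_unique_doc_id("10.1234/flag", "bbb")

    # DOIs are lowercased first, matching the lowercasing model validator.
    assert compute_unique_doc_id("10.1234/FLAG", "aaa") == compute_unique_doc_id("10.1234/flag", "aaa")

    # Without a DOI, the content hash alone gives a deterministic ID...
    assert compute_unique_doc_id(None, "aaa") == compute_unique_doc_id(None, "aaa")
    # ...and only with neither input does it fall back to a random UUID.
    assert compute_unique_doc_id(None, None) != compute_unique_doc_id(None, None)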

tests/test_agents.py

Lines changed: 12 additions & 7 deletions
@@ -60,7 +60,7 @@
 from paperqa.prompts import CANNOT_ANSWER_PHRASE, CONTEXT_INNER_PROMPT_NOT_DETAILED
 from paperqa.settings import AgentSettings, IndexSettings, Settings
 from paperqa.types import Context, Doc, PQASession, Text
-from paperqa.utils import encode_id, extract_thought, get_year, md5sum
+from paperqa.utils import compute_unique_doc_id, extract_thought, get_year, md5sum


 @pytest.mark.asyncio

@@ -117,11 +117,16 @@ async def test_get_directory_index(
     results = await index.query(query="what is a gravity hill?", min_score=5)
     assert results
     first_result = results[0]
+    assert len(first_result.docs) == 1, "Expected one result (gravity_hill.md)"
     target_doc_path = (paper_dir / "gravity_hill.md").absolute()
     expected_ids = {
-        md5sum(target_doc_path),  # What we actually expect
-        encode_id(
-            "10.2307/j.ctt5vkfh7.11"  # Crossref may match this Gravity Hill poem, lol
+        compute_unique_doc_id(
+            None,
+            md5sum(target_doc_path),  # What we actually expect
+        ),
+        compute_unique_doc_id(
+            "10.2307/j.ctt5vkfh7.11",  # Crossref may match this Gravity Hill poem, lol
+            next(iter(first_result.docs.values())).content_hash,
         ),
     }
     for expected_id in expected_ids:

@@ -132,9 +137,9 @@
             f"Failed to match an ID in {expected_ids}, got citations"
             f" {[d.formatted_citation for d in first_result.docs.values()]}."
         )
-    assert all(
-        x in first_result.docs[expected_id].formatted_citation
-        for x in ("Wikipedia", "Gravity")
+    assert (
+        "gravity hill"
+        in first_result.docs[expected_id].formatted_citation.lower()
     )

     # Check getting the same index name will not reprocess files

tests/test_paperqa.py

Lines changed: 41 additions & 7 deletions
@@ -17,6 +17,7 @@
 from unittest.mock import MagicMock, call
 from uuid import UUID

+import anyio
 import httpx
 import litellm
 import numpy as np

@@ -752,9 +753,11 @@ async def test_get_reasoning(docs_fixture: Docs, llm: str, llm_settings: dict) -> None:


 @pytest.mark.asyncio
-async def test_duplicate(stub_data_dir: Path) -> None:
+async def test_duplicate(stub_data_dir: Path, tmp_path) -> None:
     """Check Docs doesn't store duplicates, while checking nonduplicate docs are stored."""
     docs = Docs()
+
+    # First, check adding a straight-up duplicate doc
     assert await docs.aadd(
         stub_data_dir / "bates.txt",
         citation="WikiMedia Foundation, 2023, Accessed now",

@@ -767,16 +770,44 @@ async def test_duplicate(stub_data_dir: Path) -> None:
             dockey="test1",
         )
         is None
-    )
+    ), "Expected duplicate add to indicate no new doc was added"
     assert len(docs.docs) == 1, "Should have added only one document"
+
+    # Next, check adding a different doc works, and also check citation inference
+    common_doi = "10.1234/flag"
     assert await docs.aadd(
-        stub_data_dir / "flag_day.html",
-        citation="WikiMedia Foundation, 2023, Accessed now",
-        dockey="test2",
+        stub_data_dir / "flag_day.html", dockey="flag_day", doi=common_doi
     )
     assert (
-        len(set(docs.docs.values())) == 2
+        len(set(docs.docs.keys())) == 2
     ), "Unique documents should be hashed as unique"
+    flag_day = docs.docs["flag_day"]
+    assert isinstance(flag_day, DocDetails)
+    assert flag_day.doi == common_doi
+    assert all(
+        x in flag_day.citation.lower() for x in ("wikipedia", "flag")
+    ), "Expected citation to be inferred"
+    assert flag_day.content_hash
+
+    # Now, check adding a different file but same metadata
+    # (emulating main text vs supplemental information)
+    # will be seen as a different doc
+    flag_day_content = await anyio.Path(stub_data_dir / "flag_day.html").read_bytes()
+    assert len(flag_day_content) >= 1000, "Expected long file to test truncation"
+    await anyio.Path(tmp_path / "flag_day.html").write_bytes(flag_day_content[:-100])
+    assert await docs.aadd(
+        tmp_path / "flag_day.html", dockey="flag_day_shorter", doi=common_doi
+    )
+    assert len(set(docs.docs.keys())) == 3, "Expected a third document to be added"
+    shorter_flag_day = docs.docs["flag_day_shorter"]
+    assert isinstance(shorter_flag_day, DocDetails)
+    assert shorter_flag_day.doi == common_doi
+    assert all(
+        x in shorter_flag_day.citation.lower() for x in ("wikipedia", "flag")
+    ), "Expected citation to be inferred"
+    assert shorter_flag_day.content_hash
+    assert flag_day.content_hash != shorter_flag_day.content_hash
+    assert flag_day.doc_id != shorter_flag_day.doc_id


 @pytest.mark.asyncio

@@ -1058,6 +1089,7 @@ async def test_pdf_reader_w_no_match_doc_details(stub_data_dir: Path) -> None:
         "Wellawatte et al, XAI Review, 2023",
     )
     (doc_details,) = docs.docs.values()
+    assert doc_details.content_hash == "41f786fcc56d27ff0c1507153fae3774"
     assert doc_details.docname == docname, "Added name should match between details"
     # doc will be a DocDetails object, but nothing can be found
     # thus, we retain the prior citation data

@@ -1147,10 +1179,12 @@ async def test_pdf_reader_match_doc_details(stub_data_dir: Path) -> None:
         fields=["author", "journal", "citation_count"],
     )
     (doc_details,) = docs.docs.values()
+    assert doc_details.content_hash == "41f786fcc56d27ff0c1507153fae3774"
     assert doc_details.docname == docname, "Added name should match between details"
     # Crossref is non-deterministic in its ordering for results
+    # (it can give DOI '10.1021/acs.jctc.2c01235' or DOI '10.26434/chemrxiv-2022-qfv02')
     # thus we need to capture both possible dockeys
-    assert doc_details.dockey in {"d7763485f06aabde", "5300ef1d5fb960d7"}
+    assert doc_details.dockey in {"8ce7ddba9c9dcae6", "a353fa2478475c9c"}
     assert isinstance(doc_details, DocDetails)
     # note year is unknown because citation string is only parsed for authors/title/doi
     # AND we do not request it back from the metadata sources
