
Commit e685706

Incorporated a Doc.content_hash to ensure doc_id uniqueness, with tests
1 parent 6218e97 commit e685706

6 files changed: +105 −25 lines
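
At a glance: doc_id is now derived from both the lowercased DOI and an MD5 hash of the document's contents, so two different files that share a DOI no longer collide on doc_id. Below is a minimal, self-contained sketch of that derivation, mirroring the compute_unique_doc_id helper added in src/paperqa/utils.py further down; the simplified encode_id here stands in for paperqa's existing MD5-based helper:

    import hashlib
    from uuid import uuid4

    def encode_id(value: str, maxsize: int | None = 16) -> str:
        # Stand-in for paperqa's encode_id: truncated MD5 hex digest
        return hashlib.md5(value.encode("utf-8")).hexdigest()[:maxsize]

    def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
        if doi:
            # DOI plus content hash: same DOI, different contents -> different IDs
            value_to_encode: str = doi.lower() + (content_hash or "")
        else:
            # No DOI: fall back to the content hash, else a random UUID
            value_to_encode = content_hash or str(uuid4())
        return encode_id(value_to_encode)

    # Same DOI, different file contents: distinct doc_ids
    a = compute_unique_doc_id("10.1234/flag", "41f786fcc56d27ff0c1507153fae3774")
    b = compute_unique_doc_id("10.1234/flag", "0123456789abcdef0123456789abcdef")
    assert a != b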

src/paperqa/clients/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -222,6 +222,8 @@ async def upgrade_doc_to_doc_details(self, doc: Doc, **kwargs) -> DocDetails:
         doc_details.key = doc.docname
         if "citation" in doc.fields_to_overwrite_from_metadata:
             doc_details.citation = doc.citation
+        if "content_hash" in doc.fields_to_overwrite_from_metadata:
+            doc_details.content_hash = doc.content_hash
         return provided_doc_details + doc_details

     # if we can't get metadata, just return the doc, but don't overwrite any fields

src/paperqa/docs.py

Lines changed: 6 additions & 3 deletions

@@ -265,18 +265,20 @@ async def aadd(  # noqa: PLR0912
         """Add a document to the collection."""
         all_settings = get_settings(settings)
         parse_config = all_settings.parsing
+        content_hash = md5sum(path)
         dockey_is_content_hash = False
         if dockey is None:
-            # md5 sum of file contents (not path!)
-            dockey = md5sum(path)
+            dockey = content_hash
             dockey_is_content_hash = True
         if llm_model is None:
             llm_model = all_settings.get_llm()
         if citation is None:
             # Peek first chunk
             texts = await read_doc(
                 path,
-                Doc(docname="", citation="", dockey=dockey),  # Fake doc
+                Doc(  # Fake doc
+                    docname="", citation="", dockey=dockey, content_hash=content_hash
+                ),
                 chunk_chars=parse_config.chunk_size,
                 overlap=parse_config.overlap,
                 page_size_limit=parse_config.page_size_limit,

@@ -306,6 +308,7 @@ async def aadd(  # noqa: PLR0912
             ),
             citation=citation,
             dockey=dockey,
+            content_hash=content_hash,
         )

         # try to extract DOI / title from the citation
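
For orientation, a hedged usage sketch of the new flow (the file path and DOI here are hypothetical): Docs.aadd now computes the file's content hash up front, uses it as the default dockey when none is given, and stores it on the resulting document:

    import asyncio

    from paperqa import Docs

    async def main() -> None:
        docs = Docs()
        # dockey defaults to md5sum(path), i.e. the content hash, when not given
        await docs.aadd("papers/flag_day.html", doi="10.1234/flag")  # hypothetical inputs
        (doc,) = docs.docs.values()
        print(doc.content_hash)  # MD5 hex digest of the file's bytes

    asyncio.run(main())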

src/paperqa/llms.py

Lines changed: 2 additions & 1 deletion

@@ -30,7 +30,7 @@
 )
 from typing_extensions import override

-from paperqa.types import Doc, Text
+from paperqa.types import AUTOPOPULATE_VALUE, Doc, Text

 if TYPE_CHECKING:
     from qdrant_client.http.models import Record

@@ -495,6 +495,7 @@ async def fetch_batch_with_semaphore(offset: int) -> None:
                 docname=doc_data.get("docname", ""),
                 citation=doc_data.get("citation", ""),
                 dockey=doc_data["dockey"],
+                content_hash=doc_data.get("content_hash", AUTOPOPULATE_VALUE),
             )
             docs.docnames.add(doc_data.get("docname", ""))

src/paperqa/types.py

Lines changed: 44 additions & 12 deletions

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import ast
+import contextlib
 import csv
 import logging
 import os

@@ -30,11 +31,13 @@
 )

 from paperqa.utils import (
+    compute_unique_doc_id,
     create_bibtex_key,
     encode_id,
     format_bibtex,
     get_citation_ids,
     maybe_get_date,
+    md5sum,
 )
 from paperqa.version import __version__ as pqa_version

@@ -53,7 +56,10 @@
     "docname",
     "dockey",
     "citation",
+    "content_hash",  # Metadata providers won't give this
 }
+# Sentinel to autopopulate a field within model_validator
+AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design


 class Doc(Embeddable):

@@ -62,6 +68,13 @@ class Doc(Embeddable):
     docname: str
     dockey: DocKey
     citation: str
+    content_hash: str | None = Field(
+        default=AUTOPOPULATE_VALUE,
+        description=(
+            "Optional hash of the document's contents (to reiterate, not a file path to"
+            " the document, but the document's contents itself)."
+        ),
+    )
     # Sort the serialization to minimize the diff of serialized objects
     fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
         Field(

@@ -160,10 +173,6 @@ def __hash__(self) -> int:
         return hash((self.name, self.text))


-# Sentinel to autopopulate a field within model_validator
-AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
-
-
 class Context(BaseModel):
     """A class to hold the context of a question."""

@@ -570,8 +579,8 @@ class DocDetails(Doc):
     doc_id: str | None = Field(
         default=None,
         description=(
-            "Unique ID for this document. Simple ways to acquire one include"
-            " hashing the DOI or a stringifying a UUID."
+            "Unique ID for this document. A simple and robust way to acquire one is"
+            " hashing the paper's content hash concatenated with the lowercased DOI."
         ),
     )
     file_location: str | os.PathLike | None = Field(

@@ -630,9 +639,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
                 doi = doi.replace(url_prefix_to_remove, "")
             data["doi"] = doi.lower()
             if not data.get("doc_id"):  # keep user defined doc_ids
-                data["doc_id"] = encode_id(doi.lower())
+                data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
         elif not data.get("doc_id"):  # keep user defined doc_ids
-            data["doc_id"] = encode_id(uuid4())
+            data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))

         if "dockey" in data.get(
             "fields_to_overwrite_from_metadata",

@@ -838,6 +847,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
             data["citation"] = data.get("title") or CITATION_FALLBACK_DATA["title"]
         return data

+    @classmethod
+    def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
+        if (  # Check for missing or autopopulate value, but preserve `None`
+            data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
+        ):
+            data["content_hash"] = None  # Assume we don't have it
+            if data.get("file_location"):  # Try to update it
+                with contextlib.suppress(FileNotFoundError):
+                    data["content_hash"] = md5sum(data["file_location"])
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:

@@ -857,6 +877,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
                 data[possibly_str_field], str
             ):
                 data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
+        data = cls.populate_content_hash(data)
         data = cls.lowercase_doi_and_populate_doc_id(data)
         data = cls.remove_invalid_authors(data)
         data = cls.misc_string_cleaning(data)

@@ -1017,6 +1038,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     )
                 else:
                     merged_data[field] = max(self_value, other_value)
+            elif field == "content_hash" and (  # noqa: PLR0916
+                # Hashes are both present but differ
+                (self_value and other_value and self_value != other_value)
+                # One hash is explicitly disabled (not autopopulated)
+                or (self_value is None or other_value is None)
+            ):
+                # We don't know which to pick, so just discard the value
+                merged_data[field] = None

             else:
                 # Prefer non-null values, default preference for 'other' object.

@@ -1031,10 +1060,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     else self_value
                 )

-        # Recalculate doc_id if doi has changed
-        if merged_data["doi"] != self.doi:
-            merged_data["doc_id"] = (
-                encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None
+        if (
+            merged_data["doi"] != self.doi
+            or merged_data["content_hash"] != self.content_hash
+        ):
+            # Recalculate doc_id if doi or content hash has changed
+            merged_data["doc_id"] = compute_unique_doc_id(
+                merged_data["doi"], merged_data.get("content_hash")
             )

         # Create and return new DocDetails instance
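
The content_hash branch added to DocDetails.__add__ deliberately refuses to guess between conflicting hashes. A standalone restatement of that merge rule (the helper name here is ours, not part of the codebase):

    def merge_content_hash(self_value: str | None, other_value: str | None) -> str | None:
        if (self_value and other_value and self_value != other_value) or (
            self_value is None or other_value is None
        ):
            # Conflicting or explicitly-disabled hashes: discard rather than guess
            return None
        # Otherwise the generic rule applies: prefer non-null, favoring 'other'
        return other_value or self_value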

src/paperqa/utils.py

Lines changed: 10 additions & 2 deletions

@@ -16,7 +16,7 @@
 from http import HTTPStatus
 from pathlib import Path
 from typing import Any, BinaryIO, ClassVar, TypeVar
-from uuid import UUID
+from uuid import UUID, uuid4

 import aiohttp
 import httpx

@@ -97,7 +97,7 @@ def strings_similarity(s1: str, s2: str, case_insensitive: bool = True) -> float:

 def hexdigest(data: str | bytes) -> str:
     if isinstance(data, str):
-        return hashlib.md5(data.encode("utf-8")).hexdigest()  # noqa: S324
+        data = data.encode("utf-8")
     return hashlib.md5(data).hexdigest()  # noqa: S324


@@ -210,6 +210,14 @@ def encode_id(value: str | bytes | UUID, maxsize: int | None = 16) -> str:
     return hashlib.md5(value).hexdigest()[:maxsize]  # noqa: S324


+def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
+    if doi:
+        value_to_encode: str = doi.lower() + (content_hash or "")
+    else:
+        value_to_encode = content_hash or str(uuid4())
+    return encode_id(value_to_encode)
+
+
 def get_year(ts: datetime | None = None) -> str:
     """Get the year from the input datetime, otherwise using the current datetime."""
     if ts is None:

tests/test_paperqa.py

Lines changed: 41 additions & 7 deletions

@@ -13,6 +13,7 @@
 from typing import cast
 from uuid import UUID

+import anyio
 import httpx
 import numpy as np
 import pytest

@@ -633,9 +634,11 @@ async def test_get_reasoning(docs_fixture: Docs, llm: str, llm_settings: dict) -> None:


 @pytest.mark.asyncio
-async def test_duplicate(stub_data_dir: Path) -> None:
+async def test_duplicate(stub_data_dir: Path, tmp_path) -> None:
     """Check Docs doesn't store duplicates, while checking nonduplicate docs are stored."""
     docs = Docs()
+
+    # First, check adding a straight-up duplicate doc
     assert await docs.aadd(
         stub_data_dir / "bates.txt",
         citation="WikiMedia Foundation, 2023, Accessed now",

@@ -648,16 +651,44 @@ async def test_duplicate(stub_data_dir: Path, tmp_path) -> None:
             dockey="test1",
         )
         is None
-    )
+    ), "Expected duplicate add to indicate no new doc was added"
     assert len(docs.docs) == 1, "Should have added only one document"
+
+    # Next, check adding a different doc works, and also check citation inference
+    common_doi = "10.1234/flag"
     assert await docs.aadd(
-        stub_data_dir / "flag_day.html",
-        citation="WikiMedia Foundation, 2023, Accessed now",
-        dockey="test2",
+        stub_data_dir / "flag_day.html", dockey="flag_day", doi=common_doi
     )
     assert (
-        len(set(docs.docs.values())) == 2
+        len(set(docs.docs.keys())) == 2
     ), "Unique documents should be hashed as unique"
+    flag_day = docs.docs["flag_day"]
+    assert isinstance(flag_day, DocDetails)
+    assert flag_day.doi == common_doi
+    assert all(
+        x in flag_day.citation.lower() for x in ("wikipedia", "flag")
+    ), "Expected citation to be inferred"
+    assert flag_day.content_hash
+
+    # Now, check adding a different file but same metadata
+    # (emulating main text vs supplemental information)
+    # will be seen as a different doc
+    flag_day_content = await anyio.Path(stub_data_dir / "flag_day.html").read_bytes()
+    assert len(flag_day_content) >= 1000, "Expected long file to test truncation"
+    await anyio.Path(tmp_path / "flag_day.html").write_bytes(flag_day_content[:-100])
+    assert await docs.aadd(
+        tmp_path / "flag_day.html", dockey="flag_day_shorter", doi=common_doi
+    )
+    assert len(set(docs.docs.keys())) == 3, "Expected a third document to be added"
+    shorter_flag_day = docs.docs["flag_day_shorter"]
+    assert isinstance(shorter_flag_day, DocDetails)
+    assert shorter_flag_day.doi == common_doi
+    assert all(
+        x in shorter_flag_day.citation.lower() for x in ("wikipedia", "flag")
+    ), "Expected citation to be inferred"
+    assert shorter_flag_day.content_hash
+    assert flag_day.content_hash != shorter_flag_day.content_hash
+    assert flag_day.doc_id != shorter_flag_day.doc_id


 @pytest.mark.asyncio

@@ -939,6 +970,7 @@ async def test_pdf_reader_w_no_match_doc_details(stub_data_dir: Path) -> None:
         "Wellawatte et al, XAI Review, 2023",
     )
     (doc_details,) = docs.docs.values()
+    assert doc_details.content_hash == "41f786fcc56d27ff0c1507153fae3774"
     assert doc_details.docname == docname, "Added name should match between details"
     # doc will be a DocDetails object, but nothing can be found
     # thus, we retain the prior citation data

@@ -1028,10 +1060,12 @@ async def test_pdf_reader_match_doc_details(stub_data_dir: Path) -> None:
         fields=["author", "journal", "citation_count"],
     )
     (doc_details,) = docs.docs.values()
+    assert doc_details.content_hash == "41f786fcc56d27ff0c1507153fae3774"
     assert doc_details.docname == docname, "Added name should match between details"
     # Crossref is non-deterministic in its ordering for results
+    # (it can give DOI '10.1021/acs.jctc.2c01235' or DOI '10.26434/chemrxiv-2022-qfv02')
     # thus we need to capture both possible dockeys
-    assert doc_details.dockey in {"d7763485f06aabde", "5300ef1d5fb960d7"}
+    assert doc_details.dockey in {"8ce7ddba9c9dcae6", "a353fa2478475c9c"}
     assert isinstance(doc_details, DocDetails)
     # note year is unknown because citation string is only parsed for authors/title/doi
     # AND we do not request it back from the metadata sources
