Skip to content

Commit 641f583

Browse files
authored
Fixed type problems from llmclient (#770)
Co-authored-by: James Braza <[email protected]> - Corrected typing in `DocDetails` - Corrected the typing of `Text.doc` from `Doc` to `Doc | DocDetails` and updated how `Text.doc` is created - Defined `AUTOPOPULATE_VALUE` to declare default values for `DocDetails` entries - Changed input data typing on `@model_validator` from `dict` to `Mapping` to ensure the immutability of the input. - Tests for serialization/deserialization and immutability of input data were included.
1 parent 7bb570c commit 641f583

File tree

14 files changed

+108
-33
lines changed

14 files changed

+108
-33
lines changed

paperqa/agents/helpers.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,13 @@ def table_formatter(
9292
table.add_column("Title", style="cyan")
9393
table.add_column("File", style="magenta")
9494
for obj, filename in objects:
95-
try:
96-
display_name = cast(DocDetails, cast(Docs, obj).texts[0].doc).title
97-
except AttributeError:
98-
display_name = cast(Docs, obj).texts[0].doc.formatted_citation
99-
table.add_row(cast(str, display_name)[:max_chars_per_column], filename)
95+
docs = cast(Docs, obj) # Assume homogeneous objects
96+
doc = docs.texts[0].doc
97+
if isinstance(doc, DocDetails) and doc.title:
98+
display_name: str = doc.title # Prefer title if available
99+
else:
100+
display_name = doc.formatted_citation
101+
table.add_row(display_name[:max_chars_per_column], filename)
100102
return table
101103
raise NotImplementedError(
102104
f"Object type {type(example_object)} can not be converted to table."

paperqa/agents/search.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import warnings
1111
import zlib
1212
from collections.abc import Callable, Collection, Sequence
13+
from datetime import datetime
1314
from enum import StrEnum, auto
1415
from typing import TYPE_CHECKING, Any, ClassVar
1516
from uuid import UUID
@@ -70,6 +71,8 @@ def default(self, o):
7071
return list(o)
7172
if isinstance(o, os.PathLike):
7273
return str(o)
74+
if isinstance(o, datetime):
75+
return o.isoformat()
7376
return json.JSONEncoder.default(self, o)
7477

7578

paperqa/clients/crossref.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ async def parse_crossref_to_doc_details(
197197
elif len(date_parts) == 1:
198198
publication_date = datetime(date_parts[0], 1, 1)
199199

200-
doc_details = DocDetails( # type: ignore[call-arg]
200+
doc_details = DocDetails(
201201
key=None if not bibtex else bibtex.split("{")[1].split(",")[0],
202202
bibtex_type=CROSSREF_CONTENT_TYPE_TO_BIBTEX_MAPPING.get(
203203
message.get("type", "other"), "misc"

paperqa/clients/journal_quality.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ async def _process(
4444
# docname can be blank since the validation will add it
4545
# remember, if both have docnames (i.e. key) they are
4646
# wiped and re-generated with resultant data
47-
return doc_details + DocDetails( # type: ignore[call-arg]
47+
return doc_details + DocDetails(
4848
source_quality=max(
4949
[
5050
self.data.get(query.journal.casefold(), DocDetails.UNDEFINED_JOURNAL_QUALITY), # type: ignore[union-attr]

paperqa/clients/openalex.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def parse_openalex_to_doc_details(message: dict[str, Any]) -> DocDetails:
178178

179179
bibtex_type = BIBTEX_MAPPING.get(message.get("type") or "other", "misc")
180180

181-
return DocDetails( # type: ignore[call-arg]
181+
return DocDetails(
182182
key=None,
183183
bibtex_type=bibtex_type,
184184
bibtex=None,

paperqa/clients/retractions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails
7171
if not self.doi_set:
7272
await self.load_data()
7373

74-
return doc_details + DocDetails(is_retracted=query.doi in self.doi_set) # type: ignore[call-arg]
74+
return doc_details + DocDetails(is_retracted=query.doi in self.doi_set)
7575

7676
def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
7777
try:

paperqa/clients/semantic_scholar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ async def parse_s2_to_doc_details(
182182

183183
journal_data = paper_data.get("journal") or {}
184184

185-
doc_details = DocDetails( # type: ignore[call-arg]
185+
doc_details = DocDetails(
186186
key=None if not bibtex else bibtex.split("{")[1].split(",")[0],
187187
bibtex_type="article", # s2 should be basically all articles
188188
bibtex=bibtex,

paperqa/clients/unpaywall.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def _create_doc_details(self, data: UnpaywallResponse) -> DocDetails:
166166
if data.best_oa_location:
167167
pdf_url = data.best_oa_location.url_for_pdf
168168
license = data.best_oa_location.license # noqa: A001
169-
return DocDetails( # type: ignore[call-arg]
169+
return DocDetails(
170170
authors=[
171171
f"{author.given} {author.family}" for author in (data.z_authors or [])
172172
],

paperqa/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ async def map_fxn_summary(
108108
text=Text(
109109
text=text.text,
110110
name=text.name,
111-
doc=text.doc.__class__(**text.doc.model_dump(exclude={"embedding"})),
111+
doc=text.doc.model_dump(exclude={"embedding"}),
112112
),
113113
score=score, # pylint: disable=possibly-used-before-assignment
114114
**extras,

paperqa/docs.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -470,10 +470,12 @@ async def aadd_texts(
470470
# 3. Update self
471471
# NOTE: we defer adding texts to the texts index to retrieval time
472472
# (e.g. `self.texts_index.add_texts_and_embeddings(texts)`)
473-
self.docs[doc.dockey] = doc
474-
self.texts += texts
475-
self.docnames.add(doc.docname)
476-
return True
473+
if doc.docname and doc.dockey:
474+
self.docs[doc.dockey] = doc
475+
self.texts += texts
476+
self.docnames.add(doc.docname)
477+
return True
478+
return False
477479

478480
def delete(
479481
self,
@@ -489,8 +491,9 @@ def delete(
489491
doc = next((doc for doc in self.docs.values() if doc.docname == name), None)
490492
if doc is None:
491493
return
492-
self.docnames.remove(doc.docname)
493-
dockey = doc.dockey
494+
if doc.docname and doc.dockey:
495+
self.docnames.remove(doc.docname)
496+
dockey = doc.dockey
494497
del self.docs[dockey]
495498
self.deleted_dockeys.add(dockey)
496499
self.texts = list(filter(lambda x: x.doc.dockey != dockey, self.texts))

0 commit comments

Comments (0)