Commit f1f7d5b

Ensuring manifest deserialization works for non-primitive fields (#1011)
1 parent 31e01bc commit f1f7d5b

4 files changed: 72 additions, 7 deletions

README.md

Lines changed: 5 additions & 1 deletion

```diff
@@ -760,7 +760,8 @@ The paper directory is not modified in any way, it's just read from.
 The indexing process attempts to infer paper metadata like title and DOI
 using LLM-powered text processing.
 You can avoid this point of uncertainty using a "manifest" file,
-which is a CSV containing three columns (order doesn't matter):
+which is a CSV containing `DocDetails` fields (order doesn't matter).
+For example:
 
 - `file_location`: relative path to the paper's PDF within the index directory
 - `doi`: DOI of the paper
@@ -769,6 +770,9 @@ which is a CSV containing three columns (order doesn't matter):
 By providing this information,
 we ensure queries to metadata providers like Crossref are accurate.
 
+To ease creating a manifest, there is a helper class method `Doc.to_csv`,
+which also works when called on `DocDetails`.
+
 ### Reusing Index
 
 The local search indexes are built based on a hash of the current `Settings` object.
```
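To make the helper concrete, here is a rough usage sketch (the import path and constructor fields are plausible assumptions, not verbatim from this commit):

```python
from paperqa.types import DocDetails

# Constructor fields below are illustrative; any `DocDetails` fields work
docs = [
    DocDetails(
        citation="Anderson, 2024",
        docname="anderson2024stub",
        dockey="stub",
        doi="10.1000/stub",  # Hypothetical DOI
        authors=["Thomas Anderson"],
    )
]

# Writes the `doi` and `file_location` columns first (CSV_FIELDS_UP_FRONT),
# then the remaining `DocDetails` fields in alphabetical order
DocDetails.to_csv(docs, target_csv_path="manifest.csv")
```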

paperqa/agents/search.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -8,6 +8,7 @@
 import pathlib
 import pickle
 import re
+import sys
 import warnings
 import zlib
 from collections import Counter
@@ -451,9 +452,12 @@ async def maybe_get_manifest(
     try:
         async with await anyio.open_file(filename, mode="r") as file:
             content = await file.read()
+            reader_kwargs: dict[str, Any] = {}
+            if sys.version_info >= (3, 12):  # Unlocks `bool | None` fields
+                reader_kwargs["quoting"] = csv.QUOTE_NOTNULL
             file_loc_to_records = {
                 str(r.get("file_location")): r
-                for r in csv.DictReader(content.splitlines())
+                for r in csv.DictReader(content.splitlines(), **reader_kwargs)
                 if r.get("file_location")
             }
             if not file_loc_to_records:
```
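For context on the version gate above, here is a minimal standalone sketch (not part of the commit; the column names are made up) of what `csv.QUOTE_NOTNULL` unlocks on Python 3.12+: unquoted empty fields deserialize as `None` instead of `""`, so `bool | None` manifest fields can survive the round trip.

```python
import csv
import io

# Requires Python 3.12+, where QUOTE_NOTNULL gained reader-side support
rows = io.StringIO("doi,flag\n10.1000/stub,\n")
record = next(csv.DictReader(rows, quoting=csv.QUOTE_NOTNULL))
assert record["flag"] is None  # Without QUOTE_NOTNULL, this would be ""
```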

paperqa/types.py

Lines changed: 46 additions & 4 deletions

```diff
@@ -1,14 +1,16 @@
 from __future__ import annotations
 
+import ast
+import csv
 import logging
 import os
 import re
 import warnings
-from collections.abc import Collection, Mapping
+from collections.abc import Collection, Iterable, Mapping, Sequence
 from copy import deepcopy
 from datetime import datetime
 from enum import StrEnum
-from typing import Annotated, Any, ClassVar, cast
+from typing import Annotated, Any, ClassVar, Self, cast
 from uuid import UUID, uuid4
 
 import tiktoken
@@ -99,6 +101,33 @@ def matches_filter_criteria(self, filter_criteria: Mapping[str, Any]) -> bool:
                 return False
         return True
 
+    FIELDS_TO_EXCLUDE_FROM_CSV: ClassVar[set[str]] = {
+        "embedding",  # Don't store to allow for configuration of embedding models
+    }
+    CSV_FIELDS_UP_FRONT: ClassVar[Sequence[str]] = ()
+
+    @classmethod
+    def to_csv(cls, values: Iterable[Self], target_csv_path: str | os.PathLike) -> None:
+        """Dump many instances into a CSV, for later use as a manifest."""
+        headers = set(cls.model_fields) - cls.FIELDS_TO_EXCLUDE_FROM_CSV
+        with open(target_csv_path, "w", encoding="utf-8") as f:
+            writer = csv.DictWriter(
+                f,
+                fieldnames=[
+                    *sorted(cls.CSV_FIELDS_UP_FRONT),  # Make easy reading
+                    *sorted(headers - set(cls.CSV_FIELDS_UP_FRONT)),
+                ],
+            )
+            writer.writeheader()
+            writer.writerows(
+                [
+                    v.model_dump(
+                        exclude={"formatted_citation"} | cls.FIELDS_TO_EXCLUDE_FROM_CSV
+                    )
+                    for v in values
+                ]
+            )
+
 
 class Text(Embeddable):
     """A text chunk ready for use in retrieval with a linked document."""
@@ -565,6 +594,11 @@ class DocDetails(Doc):
         "http://dx.doi.org/",
     }
     AUTHOR_NAMES_TO_REMOVE: ClassVar[Collection[str]] = {"et al", "et al."}
+    FIELDS_TO_EXCLUDE_FROM_CSV: ClassVar[set[str]] = {
+        "bibtex",  # Let this be autogenerated, to avoid dealing with newlines
+        "embedding",  # Don't store to allow for configuration of embedding models
+    }
+    CSV_FIELDS_UP_FRONT: ClassVar[Sequence[str]] = ("doi", "file_location")
 
     @field_validator("key")
     @classmethod
@@ -805,10 +839,18 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
         data = deepcopy(data)  # Avoid mutating input
         data = dict(data)
         if isinstance(data.get("fields_to_overwrite_from_metadata"), str):
+            raw_value = data["fields_to_overwrite_from_metadata"]
+            if (raw_value[0], raw_value[-1]) in {("[", "]"), ("{", "}")}:
+                # If string-ified set or list, remove brackets before split
+                raw_value = raw_value[1:-1]
             data["fields_to_overwrite_from_metadata"] = {
-                s.strip()
-                for s in data.get("fields_to_overwrite_from_metadata", "").split(",")
+                s.strip("\"' ") for s in raw_value.split(",")
             }
+        for possibly_str_field in ("authors", "other"):
+            if data.get(possibly_str_field) and isinstance(
+                data[possibly_str_field], str
+            ):
+                data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
         data = cls.lowercase_doi_and_populate_doc_id(data)
         data = cls.remove_invalid_authors(data)
         data = cls.misc_string_cleaning(data)
```
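A small standalone sketch (assumed values, not repo code) of the round trip `validate_all_fields` now handles: `csv.DictWriter` string-ifies non-primitive values into their `repr`, and the validator restores them.

```python
import ast

# A list-valued field like `authors` lands in a CSV cell as its repr...
authors_cell = str(["Thomas Anderson", "Agent Smith"])
# ...and ast.literal_eval safely turns it back into a list
assert ast.literal_eval(authors_cell) == ["Thomas Anderson", "Agent Smith"]

# `fields_to_overwrite_from_metadata` may arrive as a string-ified set
raw_value = "{'key', 'doc_id'}"
if (raw_value[0], raw_value[-1]) in {("[", "]"), ("{", "}")}:
    raw_value = raw_value[1:-1]  # Strip brackets before splitting on commas
assert {s.strip("\"' ") for s in raw_value.split(",")} == {"key", "doc_id"}
```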

tests/test_paperqa.py

Lines changed: 16 additions & 1 deletion

```diff
@@ -1,8 +1,10 @@
 import contextlib
+import csv
 import os
 import pathlib
 import pickle
 import re
+import sys
 from collections.abc import AsyncIterable, Sequence
 from copy import deepcopy
 from datetime import datetime, timedelta
@@ -1473,13 +1475,15 @@ def test_docdetails_merge_with_list_fields() -> None:
     assert isinstance(merged_doc, DocDetails), "Merged doc should also be DocDetails"
 
 
-def test_docdetails_deserialization() -> None:
+@pytest.mark.skipif(sys.version_info < (3, 12), reason="Uses `csv.QUOTE_NOTNULL`.")
+def test_docdetails_deserialization(tmp_path) -> None:
     deserialize_to_doc = {
         "citation": "stub",
         "dockey": "stub",
         "docname": "Stub",
         "embedding": None,
         "formatted_citation": "stub",
+        "fields_to_overwrite_from_metadata": {"key", "doc_id", "docname", "citation"},
     }
     deepcopy_deserialize_to_doc = deepcopy(deserialize_to_doc)
     doc = Doc(**deserialize_to_doc)
@@ -1510,6 +1514,17 @@ def test_docdetails_deserialization() -> None:
         deserialize_to_doc == deepcopy_deserialize_to_doc
     ), "Deserialization should not mutate input"
 
+    doc_details = DocDetails(
+        **deserialize_to_doc, other={"apple": "sauce"}, authors=["Thomas Anderson"]
+    )
+    DocDetails.to_csv([doc_details], target_csv_path=Path(tmp_path) / "manifest.csv")
+    with open(tmp_path / "manifest.csv", encoding="utf-8") as f:
+        csv_deserialized = DocDetails(
+            # type ignore comments are here since mypy can't recognize pytest skip
+            **next(csv.DictReader(f.readlines(), quoting=csv.QUOTE_NOTNULL))  # type: ignore[attr-defined,unused-ignore]
+        )
+    assert doc_details == csv_deserialized, "Round-trip CSV deserialization failed"
+
 
 def test_docdetails_doc_id_roundtrip() -> None:
     """Test that DocDetails can be initialized with doc_id or doi inputs."""
```
