
Commit f8d364a

Support parallel indexing (#849)

Authored by JoaquinPolonuer
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: James Braza <[email protected]>

1 parent 8fb3691 commit f8d364a

File tree

7 files changed: +131 -43 lines changed


paperqa/agents/search.py

Lines changed: 46 additions & 18 deletions
@@ -9,7 +9,8 @@
 import pickle
 import warnings
 import zlib
-from collections.abc import Callable, Collection, Sequence
+from collections import Counter
+from collections.abc import AsyncIterator, Callable, Collection, Sequence
 from datetime import datetime
 from enum import StrEnum, auto
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -152,6 +153,7 @@ def __init__(
         self._schema: Schema | None = None
         self._index: Index | None = None
         self._searcher: Searcher | None = None
+        self._writer: IndexWriter | None = None
         self._index_files: dict[str, str] = {}
         self.changed = False
         self.storage = storage
@@ -237,6 +239,15 @@ async def searcher(self) -> Searcher:
             self._searcher = index.searcher()
         return self._searcher

+    @contextlib.asynccontextmanager
+    async def writer(self, reset: bool = False) -> AsyncIterator[IndexWriter]:
+        if not self._writer:
+            index = await self.index
+            self._writer = index.writer()
+        yield self._writer
+        if reset:
+            self._writer = None
+
     @property
     async def count(self) -> int:
         return (await self.searcher).num_docs
@@ -295,10 +306,9 @@ async def add_document(
         async def _add_document() -> None:
             if not await self.filecheck(index_doc["file_location"], index_doc["body"]):
                 try:
-                    writer: IndexWriter = (await self.index).writer()
-                    writer.add_document(Document.from_dict(index_doc))  # type: ignore[call-arg]
-                    writer.commit()
-                    writer.wait_merging_threads()
+                    async with self.writer() as writer:
+                        # Let caller handle commit to allow for batching
+                        writer.add_document(Document.from_dict(index_doc))  # type: ignore[call-arg]

                     filehash = self.filehash(index_doc["body"])
                     (await self.index_files)[index_doc["file_location"]] = filehash
@@ -326,19 +336,17 @@ async def _add_document() -> None:
             )
             raise

-    @staticmethod
     @retry(
         stop=stop_after_attempt(1000),
         wait=wait_random_exponential(multiplier=0.25, max=60),
         retry=retry_if_exception_type(AsyncRetryError),
         reraise=True,
     )
-    def delete_document(index: Index, file_location: str) -> None:
+    async def delete_document(self, file_location: str) -> None:
         try:
-            writer: IndexWriter = index.writer()
-            writer.delete_documents("file_location", file_location)
-            writer.commit()
-            writer.wait_merging_threads()
+            async with self.writer() as writer:
+                writer.delete_documents("file_location", file_location)
+            await self.save_index()
         except ValueError as e:
             if "Failed to acquire Lockfile: LockBusy." in str(e):
                 raise AsyncRetryError("Failed to acquire lock") from e
@@ -347,7 +355,7 @@ def delete_document(index: Index, file_location: str) -> None:
     async def remove_from_index(self, file_location: str) -> None:
         index_files = await self.index_files
         if index_files.get(file_location):
-            self.delete_document(await self.index, file_location)
+            await self.delete_document(file_location)
             filehash = index_files.pop(file_location)
             docs_index_dir = await self.docs_index_directory
             # TODO: since the directory is part of the filehash these
@@ -359,6 +367,9 @@ async def remove_from_index(self, file_location: str) -> None:
         self.changed = True

     async def save_index(self) -> None:
+        async with self.writer(reset=True) as writer:
+            writer.commit()
+            writer.wait_merging_threads()
         file_index_path = await self.file_index_filename
         async with await anyio.open_file(file_index_path, "wb") as f:
             await f.write(zlib.compress(pickle.dumps(await self.index_files)))
@@ -461,8 +472,10 @@ async def process_file(
     manifest: dict[str, Any],
     semaphore: anyio.Semaphore,
     settings: Settings,
+    processed_counter: Counter[str],
     progress_bar_update: Callable[[], Any] | None = None,
 ) -> None:
+
     abs_file_path = (
         pathlib.Path(settings.agent.index.paper_directory).absolute() / rel_file_path
     )
@@ -496,16 +509,23 @@ async def process_file(
                 fields=["title", "author", "journal", "year"],
                 settings=settings,
             )
-        except (ValueError, ImpossibleParsingError):
+        except Exception as e:
+            # We handle any exception here because we want to save_index so we
+            # 1. can resume the build without rebuilding this file if a separate
+            #    process_file invocation leads to a segfault or crash.
+            # 2. don't have deadlock issues after.
             logger.exception(
                 f"Error parsing {file_location}, skipping index for this file."
             )
             await search_index.mark_failed_document(file_location)
-            # Save so we can resume the build without rebuilding this file if a
-            # separate process_file invocation leads to a segfault or crash
             await search_index.save_index()
             if progress_bar_update:
                 progress_bar_update()
+
+            if not isinstance(e, ValueError | ImpossibleParsingError):
+                # ImpossibleParsingError: parsing failure, don't retry
+                # ValueError: TODOC
+                raise
             return

         this_doc = next(iter(tmp_docs.docs.values()))
@@ -525,9 +545,15 @@ async def process_file(
                 },
                 document=tmp_docs,
             )
-            # Save so we can resume the build without rebuilding this file if a
-            # separate process_file invocation leads to a segfault or crash
-            await search_index.save_index()
+
+            processed_counter["batched_save_counter"] += 1
+            if (
+                processed_counter["batched_save_counter"]
+                == settings.agent.index.batch_size
+            ):
+                await search_index.save_index()
+                processed_counter["batched_save_counter"] = 0
+
             logger.info(f"Complete ({title}).")

         # Update progress bar for either a new or previously indexed file
@@ -674,6 +700,7 @@ async def get_directory_index(  # noqa: PLR0912
     )
     with progress_bar:
         async with anyio.create_task_group() as tg:
+            processed_counter: Counter[str] = Counter()
             for rel_file_path in valid_papers_rel_file_paths:
                 if index_settings.sync_with_paper_directory:
                     tg.start_soon(
@@ -683,6 +710,7 @@ async def get_directory_index(  # noqa: PLR0912
                         manifest,
                         semaphore,
                         _settings,
+                        processed_counter,
                         progress_bar_update_fn,
                     )
                 else:
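
The core change above is that SearchIndex now caches a single IndexWriter, reuses it across add/delete calls, and defers commit() to save_index(), so many documents are committed in one batch. Below is a minimal, self-contained sketch of that pattern; FakeWriter and BatchedIndex are hypothetical stand-ins (not paperqa or tantivy classes) used only to make the example runnable.

import asyncio
import contextlib
from collections.abc import AsyncIterator


class FakeWriter:
    """Hypothetical stand-in for tantivy's IndexWriter."""

    def __init__(self) -> None:
        self.pending: list[dict] = []
        self.committed: list[dict] = []

    def add_document(self, doc: dict) -> None:
        self.pending.append(doc)

    def commit(self) -> None:
        self.committed.extend(self.pending)
        self.pending.clear()


class BatchedIndex:
    def __init__(self) -> None:
        self._writer: FakeWriter | None = None

    @contextlib.asynccontextmanager
    async def writer(self, reset: bool = False) -> AsyncIterator[FakeWriter]:
        # Lazily create one writer and reuse it across add_document() calls,
        # so one commit can cover a whole batch instead of one file at a time.
        if not self._writer:
            self._writer = FakeWriter()
        yield self._writer
        if reset:
            # Dropping the cached writer forces the next batch to start fresh.
            self._writer = None

    async def add_document(self, doc: dict) -> None:
        async with self.writer() as writer:
            writer.add_document(doc)  # commit is deferred to save_index()

    async def save_index(self) -> None:
        async with self.writer(reset=True) as writer:
            writer.commit()


async def main() -> None:
    index = BatchedIndex()
    for i in range(3):
        await index.add_document({"file_location": f"paper_{i}.pdf"})
    await index.save_index()  # one commit covers all three adds
    print(len(index._writer.committed if index._writer else []))  # noqa: SLF001


asyncio.run(main())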

paperqa/docs.py

Lines changed: 2 additions & 2 deletions
@@ -285,7 +285,7 @@ async def aadd(  # noqa: PLR0912
             llm_model = all_settings.get_llm()
         if citation is None:
             # Peek first chunk
-            texts = read_doc(
+            texts = await read_doc(
                 path,
                 Doc(docname="", citation="", dockey=dockey),  # Fake doc
                 chunk_chars=parse_config.chunk_size,
@@ -370,7 +370,7 @@
                 doc, **(query_kwargs | kwargs)
             )

-        texts = read_doc(
+        texts = await read_doc(
             path,
             doc,
             chunk_chars=parse_config.chunk_size,
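
Because read_doc is now a coroutine, every caller has to await it, as Docs.aadd does above. Here is a minimal sketch of the new calling convention from outside an event loop; the file name, citation text, and chunking values are illustrative only, not paperqa defaults.

import asyncio

from paperqa import Doc
from paperqa.readers import read_doc


async def main() -> None:
    # Assumes a local example.pdf; the Doc fields here are placeholder metadata.
    texts = await read_doc(
        "example.pdf",
        Doc(docname="example", citation="Example et al., 2024", dockey="example"),
        chunk_chars=3000,
        overlap=100,
    )
    print(f"Parsed {len(texts)} chunks")


asyncio.run(main())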

paperqa/readers.py

Lines changed: 29 additions & 14 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 import os
 from math import ceil
 from pathlib import Path
@@ -252,7 +253,7 @@ def chunk_code_text(


 @overload
-def read_doc(
+async def read_doc(
     path: str | os.PathLike,
     doc: Doc,
     parsed_text_only: Literal[False],
@@ -264,7 +265,7 @@ def read_doc(


 @overload
-def read_doc(
+async def read_doc(
     path: str | os.PathLike,
     doc: Doc,
     parsed_text_only: Literal[False] = ...,
@@ -276,7 +277,7 @@ def read_doc(


 @overload
-def read_doc(
+async def read_doc(
     path: str | os.PathLike,
     doc: Doc,
     parsed_text_only: Literal[True],
@@ -288,7 +289,7 @@ def read_doc(


 @overload
-def read_doc(
+async def read_doc(
     path: str | os.PathLike,
     doc: Doc,
     parsed_text_only: Literal[False],
@@ -299,7 +300,7 @@ def read_doc(
 ) -> tuple[list[Text], ParsedMetadata]: ...


-def read_doc(
+async def read_doc(
     path: str | os.PathLike,
     doc: Doc,
     parsed_text_only: bool = False,
@@ -311,7 +312,6 @@ def read_doc(
     """Parse a document and split into chunks.

     Optionally can include just the parsing as well as metadata about the parsing/chunking
-
     Args:
         path: local document path
         doc: object with document metadata
@@ -322,18 +322,29 @@ def read_doc(
         page_size_limit: optional limit on the number of characters per page
     """
     str_path = str(path)
-    parsed_text = None

     # start with parsing -- users may want to store this separately
     if str_path.endswith(".pdf"):
-        parsed_text = parse_pdf_to_pages(path, page_size_limit=page_size_limit)
+        # TODO: Make parse_pdf_to_pages async
+        parsed_text = await asyncio.to_thread(
+            parse_pdf_to_pages, path, page_size_limit=page_size_limit
+        )
     elif str_path.endswith(".txt"):
-        parsed_text = parse_text(path, page_size_limit=page_size_limit)
+        # TODO: Make parse_text async
+        parsed_text = await asyncio.to_thread(
+            parse_text, path, page_size_limit=page_size_limit
+        )
     elif str_path.endswith(".html"):
-        parsed_text = parse_text(path, html=True, page_size_limit=page_size_limit)
+        parsed_text = await asyncio.to_thread(
+            parse_text, path, html=True, page_size_limit=page_size_limit
+        )
     else:
-        parsed_text = parse_text(
-            path, split_lines=True, use_tiktoken=False, page_size_limit=page_size_limit
+        parsed_text = await asyncio.to_thread(
+            parse_text,
+            path,
+            split_lines=True,
+            use_tiktoken=False,
+            page_size_limit=page_size_limit,
         )

     if parsed_text_only:
@@ -352,7 +363,9 @@ def read_doc(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
         )
         chunk_metadata = ChunkMetadata(
-            chunk_chars=chunk_chars, overlap=overlap, chunk_type="overlap_pdf_by_page"
+            chunk_chars=chunk_chars,
+            overlap=overlap,
+            chunk_type="overlap_pdf_by_page",
        )
     elif str_path.endswith((".txt", ".html")):
         chunked_text = chunk_text(
@@ -366,7 +379,9 @@ def read_doc(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
         )
         chunk_metadata = ChunkMetadata(
-            chunk_chars=chunk_chars, overlap=overlap, chunk_type="overlap_code_by_line"
+            chunk_chars=chunk_chars,
+            overlap=overlap,
+            chunk_type="overlap_code_by_line",
         )

     if include_metadata:
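
read_doc itself becomes async, but the underlying parsers (parse_pdf_to_pages, parse_text) stay blocking, so the diff pushes them onto worker threads with asyncio.to_thread to keep the event loop free during parallel indexing. A minimal sketch of that offloading pattern, using a hypothetical parse_blocking function rather than paperqa's real parsers:

import asyncio
import time


def parse_blocking(path: str) -> str:
    # Hypothetical stand-in for a slow, synchronous parser such as parse_pdf_to_pages.
    time.sleep(0.5)  # simulate blocking parse work
    return f"contents of {path}"


async def parse_many(paths: list[str]) -> list[str]:
    # Each blocking call runs in a worker thread, so the parses overlap
    # instead of serializing on the event loop.
    return await asyncio.gather(
        *(asyncio.to_thread(parse_blocking, p) for p in paths)
    )


async def main() -> None:
    start = time.perf_counter()
    results = await parse_many(["a.pdf", "b.pdf", "c.pdf"])
    # Roughly 0.5 s total rather than 1.5 s, since the three parses run concurrently.
    print(results, f"{time.perf_counter() - start:.2f}s")


asyncio.run(main())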

paperqa/settings.py

Lines changed: 5 additions & 0 deletions
@@ -408,6 +408,11 @@ class IndexSettings(BaseModel):
         default=5,  # low default for folks without S2/Crossref keys
         description="Number of concurrent filesystem reads for indexing",
     )
+    batch_size: int = Field(
+        default=1,
+        ge=1,
+        description="Number of files to process before committing to the index.",
+    )
     sync_with_paper_directory: bool = Field(
         default=True,
         description=(
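
The new IndexSettings.batch_size works together with the existing concurrency setting: files are read and parsed concurrently, and the index is committed once per batch_size processed files rather than after every file. A minimal sketch of opting into this from user code; the paper directory path and the value 3 are illustrative, not recommendations.

import asyncio

from paperqa import Settings
from paperqa.agents.search import get_directory_index


async def main() -> None:
    settings = Settings()
    settings.agent.index.paper_directory = "my_papers"  # illustrative path
    settings.agent.index.concurrency = 3  # parallel filesystem reads
    settings.agent.index.batch_size = 3  # commit once per 3 processed files
    index = await get_directory_index(settings=settings)
    print(len(await index.index_files), "files indexed")


asyncio.run(main())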

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -232,6 +232,7 @@ disable = [
   "too-many-lines", # Don't care to enforce this
   "too-many-locals", # Rely on ruff PLR0914 for this
   "too-many-positional-arguments", # Don't care to enforce this
+  "too-many-public-methods", # Rely on ruff PLR0904 for this
   "too-many-return-statements", # Rely on ruff PLR0911 for this
   "too-many-statements", # Rely on ruff PLR0915 for this
   "undefined-loop-variable", # Don't care to enforce this

tests/test_agents.py

Lines changed: 40 additions & 3 deletions
@@ -161,10 +161,11 @@ async def crashing_aadd(*args, **kwargs) -> str | None:
         ) as mock_aadd,
     ):
         index = await get_directory_index(settings=agent_test_settings)
+
+    assert len(await index.index_files) == num_source_files
     assert (
-        mock_aadd.await_count <= crash_threshold
-    ), "Should have been able to resume build"
-    assert len(await index.index_files) > crash_threshold
+        mock_aadd.await_count < num_source_files
+    ), "Should not rebuild the whole index"


 @pytest.mark.asyncio
@@ -1117,3 +1118,39 @@ async def test_continuation(self) -> None:
         # Check continuation of the search
         result = await tool.clinical_trials_search("Covid-19 vaccines", state)
         assert len(state.docs.docs) > trial_count, "Search was unable to continue"
+
+
+@pytest.mark.asyncio
+async def test_index_build_concurrency(agent_test_settings: Settings) -> None:
+
+    high_concurrency_settings = agent_test_settings.model_copy(deep=True)
+    high_concurrency_settings.agent.index.name = "high_concurrency"
+    high_concurrency_settings.agent.index.concurrency = 3
+    high_concurrency_settings.agent.index.batch_size = 3
+    with patch.object(
+        SearchIndex, "save_index", side_effect=SearchIndex.save_index, autospec=True
+    ) as mock_save_index:
+        start_time = time.perf_counter()
+        await get_directory_index(settings=high_concurrency_settings)
+        high_concurrency_duration = time.perf_counter() - start_time
+        high_batch_save_count = mock_save_index.call_count
+
+    low_concurrency_settings = agent_test_settings.model_copy(deep=True)
+    low_concurrency_settings.agent.index.name = "low_concurrency"
+    low_concurrency_settings.agent.index.concurrency = 1
+    low_concurrency_settings.agent.index.batch_size = 1
+    with patch.object(
+        SearchIndex, "save_index", side_effect=SearchIndex.save_index, autospec=True
+    ) as mock_save_index:
+        start_time = time.perf_counter()
+        await get_directory_index(settings=low_concurrency_settings)
+        low_concurrency_duration = time.perf_counter() - start_time
+        low_batch_save_count = mock_save_index.call_count
+
+    assert high_concurrency_duration * 1.1 < low_concurrency_duration, (
+        f"Expected high concurrency to be faster, but took {high_concurrency_duration:.2f}s "
+        f"compared to {low_concurrency_duration:.2f}s"
+    )
+    assert (
+        high_batch_save_count < low_batch_save_count
+    ), f"Expected fewer save_index with high batch size, but got {high_batch_save_count} vs {low_batch_save_count}"
