Future-House
diff --git a/‎.github/workflows/build.yml
Lines changed: 19 additions & 3 deletions b/‎.github/workflows/build.yml
Lines changed: 19 additions & 3 deletions
diff --git a/‎.github/workflows/tests.yml
Lines changed: 20 additions & 2 deletions b/‎.github/workflows/tests.yml
Lines changed: 20 additions & 2 deletions
diff --git a/‎.gitignore
Lines changed: 1 addition & 1 deletion b/‎.gitignore
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md
Lines changed: 67 additions & 67 deletions b/‎README.md
Lines changed: 67 additions & 67 deletions
diff --git a/‎packages/paper-qa-pymupdf/LICENSE
Lines changed: 661 additions & 0 deletions b/‎packages/paper-qa-pymupdf/LICENSE
Lines changed: 661 additions & 0 deletions
diff --git a/‎packages/paper-qa-pymupdf/README.md
Lines changed: 10 additions & 0 deletions b/‎packages/paper-qa-pymupdf/README.md
Lines changed: 10 additions & 0 deletions
diff --git a/‎packages/paper-qa-pymupdf/pyproject.toml
Lines changed: 44 additions & 0 deletions b/‎packages/paper-qa-pymupdf/pyproject.toml
Lines changed: 44 additions & 0 deletions
diff --git a/‎packages/paper-qa-pymupdf/src/paperqa_pymupdf/__init__.py
Lines changed: 7 additions & 0 deletions b/‎packages/paper-qa-pymupdf/src/paperqa_pymupdf/__init__.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Lines changed: 75 additions & 0 deletions b/‎packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Lines changed: 75 additions & 0 deletions
@@ -10,13 +10,29 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - id: build
+      - id: build-paper-qa-pymupdf
         uses: hynek/build-and-inspect-python-package@v2
-      - name: Download built artifact to dist/
+        with:
+          path: packages/paper-qa-pymupdf
+          upload-name-suffix: -paper-qa-pymupdf
+      - name: Download built paper-qa-pymupdf artifact to dist/
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ steps.build-paper-qa-pymupdf.outputs.artifact-name }}
+          path: dist
+      - name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
+      - id: build-paper-qa
+        uses: hynek/build-and-inspect-python-package@v2
+        with:
+          upload-name-suffix: -paper-qa
+      - name: Download built paper-qa artifact to dist/
         uses: actions/download-artifact@v4
         with:
-          name: ${{ steps.build.outputs.artifact-name }}
+          name: ${{ steps.build-paper-qa.outputs.artifact-name }}
           path: dist
+      - name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
       - uses: pypa/gh-action-pypi-publish@release/v1
         with:
           password: ${{ secrets.PYPI_API_TOKEN }}
@@ -34,9 +34,27 @@ jobs:
         with:
           enable-cache: true
       - run: uv python pin ${{ matrix.python-version }}
-      - uses: hynek/build-and-inspect-python-package@v2
+      - name: Check paper-qa-pymupdf build
+        id: build-paper-qa-pymupdf
+        if: matrix.python-version == '3.11'
+        uses: hynek/build-and-inspect-python-package@v2
+        with:
+          path: packages/paper-qa-pymupdf
+          upload-name-suffix: -paper-qa-pymupdf
+      - name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        if: matrix.python-version == '3.11'
+        run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
+      - name: Check paper-qa build
+        id: build-paper-qa
+        if: matrix.python-version == '3.11'
+        uses: hynek/build-and-inspect-python-package@v2
+        with:
+          upload-name-suffix: -paper-qa
+      - name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        if: matrix.python-version == '3.11'
+        run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
       - run: uv sync --python-preference=only-managed
-      - run: uv run pylint paperqa
+      - run: uv run pylint src packages
       - uses: suzuki-shunsuke/[email protected]
   test:
     runs-on: ubuntu-latest
 
@@ -311,4 +311,4 @@ tests/example2.*
 !tests/stub_data/.DS_Store
 
 # Client data
-paperqa/clients/client_data/retractions.csv
+src/paperqa/clients/client_data/retractions.csv
@@ -7,7 +7,7 @@ repos:
       - id: check-added-large-files
         exclude: |
           (?x)^(
-            paperqa/clients/client_data.*|
+            src/paperqa/clients/client_data.*|
             tests/stub_data.*
           )$
       - id: check-byte-order-marker
 
@@ -0,0 +1,44 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools>=64", "setuptools_scm>=8"]
+
+[project]
+authors = [
+    {email = "[email protected]", name = "FutureHouse technical staff"},
+]
+classifiers = [
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: GNU Affero General Public License v3",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "PyMuPDF>=1.24.12",  # For pymupdf.set_messages addition
+    "paper-qa",
+]
+description = "PaperQA readers implemented using PyMuPDF"
+dynamic = ["version"]  # Version is supplied at build time by setuptools_scm
+license = {file = "LICENSE"}
+maintainers = [
+    {email = "[email protected]", name = "James Braza"},
+    {email = "[email protected]", name = "Michael Skarlinski"},
+    {email = "[email protected]", name = "Andrew White"},
+]
+name = "paper-qa-pymupdf"
+readme = "README.md"
+requires-python = ">=3.11"
+
+[tool.ruff]
+extend = "../../pyproject.toml"  # Inherit the repository-root Ruff configuration
+
+[tool.setuptools.packages.find]
+where = ["src"]  # Packages are discovered under src/ (src layout)
+
+[tool.setuptools_scm]
+root = "../.."  # Repository root, two levels above this package directory
+version_file = "src/paperqa_pymupdf/version.py"  # Written by setuptools_scm at build time
@@ -0,0 +1,7 @@
+from .reader import BLOCK_TEXT_INDEX, parse_pdf_to_pages, setup_pymupdf_python_logging
+
+__all__ = [
+    "BLOCK_TEXT_INDEX",
+    "parse_pdf_to_pages",
+    "setup_pymupdf_python_logging",
+]
@@ -0,0 +1,75 @@
+import os
+
+import pymupdf
+from paperqa.types import ParsedMetadata, ParsedText
+from paperqa.utils import ImpossibleParsingError
+from paperqa.version import __version__ as pqa_version
+
+
+def setup_pymupdf_python_logging() -> None:
+    """
+    Configure PyMuPDF to use Python logging.
+
+    SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics
+    """
+    pymupdf.set_messages(pylogging=True)
+
+
+BLOCK_TEXT_INDEX = 4  # Index of the text entry in each item from page.get_text("blocks")
+
+
+def parse_pdf_to_pages(
+    path: str | os.PathLike,
+    page_size_limit: int | None = None,
+    use_block_parsing: bool = False,
+    **_,
+) -> ParsedText:
+
+    with pymupdf.open(path) as file:
+        pages: dict[str, str] = {}
+        total_length = 0
+
+        for i in range(file.page_count):
+            try:
+                page = file.load_page(i)
+            except pymupdf.mupdf.FzErrorFormat as exc:
+                raise ImpossibleParsingError(
+                    f"Page loading via {pymupdf.__name__} failed on page {i} of"
+                    f" {file.page_count} for the PDF at path {path}, likely this PDF"
+                    " file is corrupt."
+                ) from exc
+
+            if use_block_parsing:
+                # NOTE: this block-based parsing appears to be better, but until
+                # fully validated on 1+ benchmarks, it's considered experimental
+
+                # Extract text blocks from the page
+                # Note: sort=False is important to preserve the order of text blocks
+                # as they appear in the PDF
+                blocks = page.get_text("blocks", sort=False)
+
+                # Concatenate text blocks into a single string
+                text = "\n".join(
+                    block[BLOCK_TEXT_INDEX]
+                    for block in blocks
+                    if len(block) > BLOCK_TEXT_INDEX
+                )
+            else:
+                text = page.get_text("text", sort=True)
+
+            if page_size_limit and len(text) > page_size_limit:
+                raise ImpossibleParsingError(
+                    f"The text in page {i} of {file.page_count} was {len(text)} chars"
+                    f" long, which exceeds the {page_size_limit} char limit for the PDF"
+                    f" at path {path}."
+                )
+            pages[str(i + 1)] = text
+            total_length += len(text)
+
+    metadata = ParsedMetadata(
+        parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
+        paperqa_version=pqa_version,
+        total_parsed_text_length=total_length,
+        parse_type="pdf",
+    )
+    return ParsedText(content=pages, metadata=metadata)