Future-House
diff --git a/‎README.md
Lines changed: 24 additions & 0 deletions b/‎README.md
Lines changed: 24 additions & 0 deletions
diff --git a/‎packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Lines changed: 96 additions & 6 deletions b/‎packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Lines changed: 96 additions & 6 deletions
diff --git a/‎packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
Lines changed: 126 additions & 9 deletions b/‎packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
Lines changed: 126 additions & 9 deletions
diff --git a/‎packages/paper-qa-pypdf/pyproject.toml
Lines changed: 5 additions & 0 deletions b/‎packages/paper-qa-pypdf/pyproject.toml
Lines changed: 5 additions & 0 deletions
@@ -35,6 +35,7 @@ question answering, summarization, and contradiction detection.
     - [Local Embedding Models (Sentence Transformers)](#local-embedding-models-sentence-transformers)
   - [Adjusting number of sources](#adjusting-number-of-sources)
   - [Using Code or HTML](#using-code-or-html)
+  - [Multimodal Support](#multimodal-support)
   - [Using External DB/Vector DB and Caching](#using-external-dbvector-db-and-caching)
   - [Creating Index](#creating-index)
     - [Manifest Files](#manifest-files)
@@ -726,6 +727,28 @@ session = await docs.aquery("Where is the search bar in the header defined?")
 print(session)
 ```
 
+### Multimodal Support
+
+Multimodal support centers on:
+
+- Standalone images
+- Images or tables in PDFs
+
+The `Docs` object stores media via a `ParsedMedia` object.
+When chunking a document, media are not split at chunk boundaries,
+so it's possible 2+ chunks can correspond with the same media.
+This means that within PaperQA, each `ParsedMedia`
+has a one-to-many relationship with chunks.
+
+Depending on the source document, the same image can appear multiple times
+(e.g. each page of a PDF has a logo in the margins).
+Thus, clients should consider media databases
+to have a many-to-many relationship with chunks.
+
+When creating contextual summaries on a given chunk (a `Text`),
+the summary LLM is passed both the chunk's text and the chunk's associated media,
+but the output contextual summary itself remains text-only.
+
 ### Using External DB/Vector DB and Caching
 
 You may want to cache parsed texts and embeddings in an external database or file.
@@ -895,6 +918,7 @@ will return much faster than the first query and we'll be certain the authors ma
 | `parsing.pdfs_use_block_parsing`             | `False`                                | Opt-in flag for block-based PDF parsing over text-based PDF parsing.                                    |
 | `parsing.use_doc_details`                    | `True`                                 | Whether to get metadata details for docs.                                                               |
 | `parsing.overlap`                            | `250`                                  | Characters to overlap chunks.                                                                           |
+| `parsing.multimodal`                         | `True`                                 | Flag to parse both text and images from applicable documents.                                           |
 | `parsing.defer_embedding`                    | `False`                                | Whether to defer embedding until summarization.                                                         |
 | `parsing.parse_pdf`                          | `paperqa_pypdf.parse_pdf_to_pages`     | Function to parse PDF files.                                                                            |
 | `parsing.configure_pdf_parser`               | No-op                                  | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
 
@@ -1,7 +1,7 @@
 import os
 
 import pymupdf
-from paperqa.types import ParsedMetadata, ParsedText
+from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
 from paperqa.utils import ImpossibleParsingError
 from paperqa.version import __version__ as pqa_version
 
@@ -16,18 +16,61 @@ def setup_pymupdf_python_logging() -> None:
 
 
 BLOCK_TEXT_INDEX = 4
+# Attributes of pymupdf.Pixmap that contain useful metadata
+PYMUPDF_PIXMAP_ATTRS = {
+    "alpha",
+    # YAGNI on "digest" because it's not JSON serializable
+    "height",
+    "irect",  # NOTE(review): presumably a pymupdf.IRect; confirm it is JSON serializable like the rest
+    "is_monochrome",
+    "is_unicolor",
+    "n",
+    "size",
+    "stride",
+    "width",
+    "x",
+    "xres",
+    "y",
+    "yres",
+}
 
 
 def parse_pdf_to_pages(
     path: str | os.PathLike,
     page_size_limit: int | None = None,
     use_block_parsing: bool = False,
+    parse_media: bool = True,
+    full_page: bool = False,
+    image_cluster_tolerance: float | tuple[float, float] = 25,
+    image_dpi: float | None = 150,
     **_,
 ) -> ParsedText:
+    """Parse a PDF.
+
+    Args:
+        path: Path to the PDF file to parse.
+        page_size_limit: Sensible character limit for one page's text,
+            used to catch bad PDF reads.
+        use_block_parsing: Opt-in flag to parse text block-wise.
+        parse_media: Flag to also parse media (e.g. images, tables).
+        full_page: Set True to screenshot the entire page as one image,
+            instead of parsing individual images or tables.
+        image_cluster_tolerance: Tolerance (points) passed to `Page.cluster_drawings`.
+            Can be a single value to apply to both X and Y directions,
+            or a two-tuple to specify X and Y directions separately.
+            The default was chosen to perform well on image extraction from LitQA2 PDFs.
+        image_dpi: Dots per inch for images captured from the PDF.
+        **_: Thrown away kwargs.
+    """
+    x_tol, y_tol = (
+        image_cluster_tolerance
+        if isinstance(image_cluster_tolerance, tuple)
+        else (image_cluster_tolerance, image_cluster_tolerance)
+    )
 
     with pymupdf.open(path) as file:
-        pages: dict[str, str] = {}
-        total_length = 0
+        content: dict[str, str | tuple[str, list[ParsedMedia]]] = {}
+        total_length = count_media = 0
 
         for i in range(file.page_count):
             try:
@@ -63,13 +106,60 @@ def parse_pdf_to_pages(
                     f" long, which exceeds the {page_size_limit} char limit for the PDF"
                     f" at path {path}."
                 )
-            pages[str(i + 1)] = text
+            media: list[ParsedMedia] = []
+            if parse_media:
+                if full_page:  # Capture the entire page as one image
+                    pix = page.get_pixmap(dpi=image_dpi)
+                    media.append(
+                        ParsedMedia(
+                            index=0,
+                            data=pix.tobytes(),
+                            info={"type": "screenshot"}
+                            | {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
+                        )
+                    )
+                else:
+                    # Capture drawings/figures
+                    for box_i, box in enumerate(
+                        page.cluster_drawings(
+                            drawings=page.get_drawings(),
+                            x_tolerance=x_tol,
+                            y_tolerance=y_tol,
+                        )
+                    ):
+                        pix = page.get_pixmap(clip=box, dpi=image_dpi)
+                        media.append(
+                            ParsedMedia(
+                                index=box_i,
+                                data=pix.tobytes(),
+                                info={"bbox": tuple(box), "type": "drawing"}
+                                | {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
+                            )
+                        )
+
+                    # Capture tables
+                    for table_i, table in enumerate(t for t in page.find_tables()):
+                        pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
+                        media.append(
+                            ParsedMedia(
+                                index=table_i,
+                                data=pix.tobytes(),
+                                text=table.to_markdown().strip(),
+                                info={"bbox": tuple(table.bbox), "type": "table"}
+                                | {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
+                            )
+                        )
+                content[str(i + 1)] = text, media
+            else:
+                content[str(i + 1)] = text
             total_length += len(text)
+            count_media += len(media)
 
     metadata = ParsedMetadata(
-        parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
+        parsing_libraries=[f"{pymupdf.__name__} ({pymupdf.__version__})"],
         paperqa_version=pqa_version,
         total_parsed_text_length=total_length,
+        count_parsed_media=count_media,
         parse_type="pdf",
     )
-    return ParsedText(content=pages, metadata=metadata)
+    return ParsedText(content=content, metadata=metadata)
@@ -1,17 +1,22 @@
+import base64
+import json
 from pathlib import Path
+from typing import cast
 
 import pymupdf
 import pytest
-from paperqa.readers import PDFParserFn
-from paperqa.utils import ImpossibleParsingError
+from paperqa import Doc, Docs
+from paperqa.readers import PDFParserFn, chunk_pdf
+from paperqa.utils import ImpossibleParsingError, bytes_to_string
 
 from paperqa_pymupdf import parse_pdf_to_pages
 
 REPO_ROOT = Path(__file__).parents[3]
 STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
 
 
-def test_parse_pdf_to_pages() -> None:
+@pytest.mark.asyncio
+async def test_parse_pdf_to_pages() -> None:
     assert isinstance(parse_pdf_to_pages, PDFParserFn)
 
     filepath = STUB_DATA_DIR / "pasa.pdf"
@@ -21,19 +26,131 @@ def test_parse_pdf_to_pages() -> None:
     assert (
         "Abstract\n\nWe introduce PaSa, an advanced Paper Search"
         "\nagent powered by large language models."
-    ) in parsed_text.content["1"], "Block parsing failed to handle abstract"
+    ) in parsed_text.content["1"][0], "Block parsing failed to handle abstract"
 
-    # Check Figure 1
-    p2_text = parsed_text.content["2"]
+    # Check the images in Figure 1
+    assert not isinstance(parsed_text.content["2"], str)
+    p2_text, p2_media = parsed_text.content["2"]
     assert "Figure 1" in p2_text, "Expected Figure 1 title"
     assert "Crawler" in p2_text, "Expected Figure 1 contents"
+    (p2_image,) = [m for m in p2_media if m.info["type"] == "drawing"]
+    assert p2_image.index == 0
+    assert isinstance(p2_image.data, bytes)
+
+    # Check the image is valid base64
+    base64_data = bytes_to_string(p2_image.data)
+    assert base64_data
+    assert base64.b64decode(base64_data, validate=True) == p2_image.data
+
+    # Check we can round-trip serialize the image
+    serde_p2_image = type(p2_image).model_validate_json(p2_image.model_dump_json())
+    assert serde_p2_image == p2_image
+
+    # Check useful attributes are present and are JSON serializable
+    json.dumps(p2_image.info)
+    for attr in ("width", "height"):
+        dim = p2_image.info[attr]
+        assert isinstance(dim, int | float)
+        assert dim > 0, "Edge length should be positive"
+
+    # Check Figure 1 can be used to answer questions
+    doc = Doc(
+        docname="He2025",
+        dockey="stub",
+        citation=(
+            'He, Yichen, et al. "PaSa: An LLM Agent for Comprehensive Academic Paper'
+            ' Search." *arXiv*, 2025, arXiv:2501.10120v1. Accessed 2025.'
+        ),
+    )
+    texts = chunk_pdf(parsed_text, doc=doc, chunk_chars=3000, overlap=100)
+    # pylint: disable=duplicate-code
+    fig_1_text = texts[1]
+    assert (
+        "Figure 1: Architecture of PaSa" in fig_1_text.text
+    ), "Expecting Figure 1 for the test to work"
+    assert fig_1_text.media, "Expecting media to test multimodality"
+    fig_1_text.text = "stub"  # Replace text to confirm multimodality works
+    docs = Docs()
+    assert await docs.aadd_texts(texts=[fig_1_text], doc=doc)
+    for query, substrings_min_counts in [
+        ("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
+        ("What actions can the Selector take?", [(("select", "drop"), 2)]),
+        (
+            "How many User Query are there, and what do they do?",
+            [(("two", "2"), 2), (("crawler", "selector"), 2)],
+        ),
+    ]:
+        session = await docs.aquery(query=query)
+        assert session.contexts, "Expected contexts to be generated"
+        assert all(
+            c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
+            for c in session.contexts
+        ), "Expected context to reuse Figure 1's text and media"
+        for substrings, min_count in cast(
+            list[tuple[tuple[str, ...], int]], substrings_min_counts
+        ):
+            assert (
+                sum(x in session.answer.lower() for x in substrings) >= min_count
+            ), f"Expected {session.answer=} to have at {substrings} present"
+
+    # Let's check the full page parsing behavior
+    parsed_text_full_page = parse_pdf_to_pages(filepath, full_page=True)
+    assert isinstance(parsed_text_full_page.content, dict)
+    assert "1" in parsed_text_full_page.content, "Parsed text should contain page 1"
+    assert "2" in parsed_text_full_page.content, "Parsed text should contain page 2"
+    for page_num in ("1", "2"):
+        page_content = parsed_text_full_page.content[page_num]
+        assert not isinstance(page_content, str), f"Page {page_num} should have images"
+        # Check each page has exactly one image
+        page_text, (full_page_image,) = page_content
+        assert page_text
+        assert full_page_image.index == 0, "Full page image should have index 0"
+        assert isinstance(full_page_image.data, bytes)
+        assert len(full_page_image.data) > 0, "Full page image should have data"
+        # Check useful attributes are present and are JSON serializable
+        json.dumps(p2_image.info)
+        for attr in ("width", "height"):
+            dim = full_page_image.info[attr]
+            assert isinstance(dim, int | float)
+            assert dim > 0, "Edge length should be positive"
+
+    # Check the no-media behavior
+    parsed_text_no_media = parse_pdf_to_pages(filepath, parse_media=False)
+    assert isinstance(parsed_text_no_media.content, dict)
+    assert all(isinstance(c, str) for c in parsed_text_no_media.content.values())
 
     # Check metadata
-    (parsing_library,) = parsed_text.metadata.parsing_libraries
-    assert pymupdf.__name__ in parsing_library
-    assert parsed_text.metadata.parse_type == "pdf"
+    for pt in (parsed_text, parsed_text_full_page, parsed_text_no_media):
+        (parsing_library,) = pt.metadata.parsing_libraries
+        assert pymupdf.__name__ in parsing_library
+        assert pt.metadata.parse_type == "pdf"
+
+    # Check commonalities across all modes
+    assert (
+        len(parsed_text.content)
+        == len(parsed_text_full_page.content)
+        == len(parsed_text_no_media.content)
+    ), "All modes should parse the same number of pages"
 
 
 def test_page_size_limit_denial() -> None:
     with pytest.raises(ImpossibleParsingError, match="char limit"):
         parse_pdf_to_pages(STUB_DATA_DIR / "paper.pdf", page_size_limit=10)  # chars
+
+
+def test_table_parsing() -> None:
+    filepath = STUB_DATA_DIR / "influence.pdf"
+    parsed_text = parse_pdf_to_pages(filepath)
+    assert isinstance(parsed_text.content, dict)
+    assert all(
+        t and t[0] != "\n" and t[-1] != "\n" for t in parsed_text.content.values()
+    ), "Expected no leading/trailing newlines in parsed text"
+    assert "1" in parsed_text.content, "Parsed text should contain page 1"
+    all_tables = {
+        i: [m for m in pagenum_media[1] if m.info["type"] == "table"]
+        for i, pagenum_media in parsed_text.content.items()
+        if isinstance(pagenum_media, tuple)
+    }
+    assert (
+        sum(len(tables) for tables in all_tables.values()) >= 2
+    ), "Expected a few tables to be parsed"
@@ -33,6 +33,11 @@ name = "paper-qa-pypdf"
 readme = "README.md"
 requires-python = ">=3.11"
 
+[project.optional-dependencies]
+media = [
+    "pypdfium2>=4.22.0",  # Pin for PYPDFIUM_INFO addition
+]
+
 [tool.ruff]
 extend = "../../pyproject.toml"