Skip to content

Commit f71d023

Browse files
authored
Multimodal PDF support (#1047)
1 parent 0caa926 commit f71d023

File tree

16 files changed

+8164
-7672
lines changed

16 files changed

+8164
-7672
lines changed

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ question answering, summarization, and contradiction detection.
3535
- [Local Embedding Models (Sentence Transformers)](#local-embedding-models-sentence-transformers)
3636
- [Adjusting number of sources](#adjusting-number-of-sources)
3737
- [Using Code or HTML](#using-code-or-html)
38+
- [Multimodal Support](#multimodal-support)
3839
- [Using External DB/Vector DB and Caching](#using-external-dbvector-db-and-caching)
3940
- [Creating Index](#creating-index)
4041
- [Manifest Files](#manifest-files)
@@ -726,6 +727,28 @@ session = await docs.aquery("Where is the search bar in the header defined?")
726727
print(session)
727728
```
728729

730+
### Multimodal Support
731+
732+
Multimodal support centers on:
733+
734+
- Standalone images
735+
- Images or tables in PDFs
736+
737+
The `Docs` object stores media via a `ParsedMedia` object.
738+
When chunking a document, media are not split at chunk boundaries,
739+
so two or more chunks can correspond to the same media.
740+
This means within PaperQA there is
740+
a one-to-many relationship between `ParsedMedia` and chunks.
742+
743+
Depending on the source document, the same image can appear multiple times
744+
(e.g. each page of a PDF has a logo in the margins).
745+
Thus, clients should consider media databases
746+
to have a many-to-many relationship with chunks.
747+
748+
When creating contextual summaries on a given chunk (a `Text`),
749+
the summary LLM is passed both the chunk's text and the chunk's associated media,
750+
but the output contextual summary itself remains text-only.
751+
729752
### Using External DB/Vector DB and Caching
730753

731754
You may want to cache parsed texts and embeddings in an external database or file.
@@ -895,6 +918,7 @@ will return much faster than the first query and we'll be certain the authors ma
895918
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
896919
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
897920
| `parsing.overlap` | `250` | Characters to overlap chunks. |
921+
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
898922
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
899923
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
900924
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |

packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py

Lines changed: 96 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22

33
import pymupdf
4-
from paperqa.types import ParsedMetadata, ParsedText
4+
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
55
from paperqa.utils import ImpossibleParsingError
66
from paperqa.version import __version__ as pqa_version
77

@@ -16,18 +16,61 @@ def setup_pymupdf_python_logging() -> None:
1616

1717

1818
BLOCK_TEXT_INDEX = 4
19+
# Attributes of pymupdf.Pixmap that contain useful metadata
20+
PYMUPDF_PIXMAP_ATTRS = {
21+
"alpha",
22+
# YAGNI on "digest" because it's not JSON serializable
23+
"height",
24+
"irect",
25+
"is_monochrome",
26+
"is_unicolor",
27+
"n",
28+
"size",
29+
"stride",
30+
"width",
31+
"x",
32+
"xres",
33+
"y",
34+
"yres",
35+
}
1936

2037

2138
def parse_pdf_to_pages(
2239
path: str | os.PathLike,
2340
page_size_limit: int | None = None,
2441
use_block_parsing: bool = False,
42+
parse_media: bool = True,
43+
full_page: bool = False,
44+
image_cluster_tolerance: float | tuple[float, float] = 25,
45+
image_dpi: float | None = 150,
2546
**_,
2647
) -> ParsedText:
48+
"""Parse a PDF.
49+
50+
Args:
51+
path: Path to the PDF file to parse.
52+
page_size_limit: Sensible character limit for one page's text,
53+
used to catch bad PDF reads.
54+
use_block_parsing: Opt-in flag to parse text block-wise.
55+
parse_media: Flag to also parse media (e.g. images, tables).
56+
full_page: Set True to screenshot the entire page as one image,
57+
instead of parsing individual images or tables.
58+
image_cluster_tolerance: Tolerance (points) passed to `Page.cluster_drawings`.
59+
Can be a single value to apply to both X and Y directions,
60+
or a two-tuple to specify X and Y directions separately.
61+
The default was chosen to perform well on image extraction from LitQA2 PDFs.
62+
image_dpi: Dots per inch for images captured from the PDF.
63+
**_: Thrown away kwargs.
64+
"""
65+
x_tol, y_tol = (
66+
image_cluster_tolerance
67+
if isinstance(image_cluster_tolerance, tuple)
68+
else (image_cluster_tolerance, image_cluster_tolerance)
69+
)
2770

2871
with pymupdf.open(path) as file:
29-
pages: dict[str, str] = {}
30-
total_length = 0
72+
content: dict[str, str | tuple[str, list[ParsedMedia]]] = {}
73+
total_length = count_media = 0
3174

3275
for i in range(file.page_count):
3376
try:
@@ -63,13 +106,60 @@ def parse_pdf_to_pages(
63106
f" long, which exceeds the {page_size_limit} char limit for the PDF"
64107
f" at path {path}."
65108
)
66-
pages[str(i + 1)] = text
109+
media: list[ParsedMedia] = []
110+
if parse_media:
111+
if full_page: # Capture the entire page as one image
112+
pix = page.get_pixmap(dpi=image_dpi)
113+
media.append(
114+
ParsedMedia(
115+
index=0,
116+
data=pix.tobytes(),
117+
info={"type": "screenshot"}
118+
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
119+
)
120+
)
121+
else:
122+
# Capture drawings/figures
123+
for box_i, box in enumerate(
124+
page.cluster_drawings(
125+
drawings=page.get_drawings(),
126+
x_tolerance=x_tol,
127+
y_tolerance=y_tol,
128+
)
129+
):
130+
pix = page.get_pixmap(clip=box, dpi=image_dpi)
131+
media.append(
132+
ParsedMedia(
133+
index=box_i,
134+
data=pix.tobytes(),
135+
info={"bbox": tuple(box), "type": "drawing"}
136+
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
137+
)
138+
)
139+
140+
# Capture tables
141+
for table_i, table in enumerate(t for t in page.find_tables()):
142+
pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
143+
media.append(
144+
ParsedMedia(
145+
index=table_i,
146+
data=pix.tobytes(),
147+
text=table.to_markdown().strip(),
148+
info={"bbox": tuple(table.bbox), "type": "table"}
149+
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
150+
)
151+
)
152+
content[str(i + 1)] = text, media
153+
else:
154+
content[str(i + 1)] = text
67155
total_length += len(text)
156+
count_media += len(media)
68157

69158
metadata = ParsedMetadata(
70-
parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
159+
parsing_libraries=[f"{pymupdf.__name__} ({pymupdf.__version__})"],
71160
paperqa_version=pqa_version,
72161
total_parsed_text_length=total_length,
162+
count_parsed_media=count_media,
73163
parse_type="pdf",
74164
)
75-
return ParsedText(content=pages, metadata=metadata)
165+
return ParsedText(content=content, metadata=metadata)
Lines changed: 126 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
1+
import base64
2+
import json
13
from pathlib import Path
4+
from typing import cast
25

36
import pymupdf
47
import pytest
5-
from paperqa.readers import PDFParserFn
6-
from paperqa.utils import ImpossibleParsingError
8+
from paperqa import Doc, Docs
9+
from paperqa.readers import PDFParserFn, chunk_pdf
10+
from paperqa.utils import ImpossibleParsingError, bytes_to_string
711

812
from paperqa_pymupdf import parse_pdf_to_pages
913

1014
REPO_ROOT = Path(__file__).parents[3]
1115
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
1216

1317

14-
def test_parse_pdf_to_pages() -> None:
18+
@pytest.mark.asyncio
19+
async def test_parse_pdf_to_pages() -> None:
1520
assert isinstance(parse_pdf_to_pages, PDFParserFn)
1621

1722
filepath = STUB_DATA_DIR / "pasa.pdf"
@@ -21,19 +26,131 @@ def test_parse_pdf_to_pages() -> None:
2126
assert (
2227
"Abstract\n\nWe introduce PaSa, an advanced Paper Search"
2328
"\nagent powered by large language models."
24-
) in parsed_text.content["1"], "Block parsing failed to handle abstract"
29+
) in parsed_text.content["1"][0], "Block parsing failed to handle abstract"
2530

26-
# Check Figure 1
27-
p2_text = parsed_text.content["2"]
31+
# Check the images in Figure 1
32+
assert not isinstance(parsed_text.content["2"], str)
33+
p2_text, p2_media = parsed_text.content["2"]
2834
assert "Figure 1" in p2_text, "Expected Figure 1 title"
2935
assert "Crawler" in p2_text, "Expected Figure 1 contents"
36+
(p2_image,) = [m for m in p2_media if m.info["type"] == "drawing"]
37+
assert p2_image.index == 0
38+
assert isinstance(p2_image.data, bytes)
39+
40+
# Check the image is valid base64
41+
base64_data = bytes_to_string(p2_image.data)
42+
assert base64_data
43+
assert base64.b64decode(base64_data, validate=True) == p2_image.data
44+
45+
# Check we can round-trip serialize the image
46+
serde_p2_image = type(p2_image).model_validate_json(p2_image.model_dump_json())
47+
assert serde_p2_image == p2_image
48+
49+
# Check useful attributes are present and are JSON serializable
50+
json.dumps(p2_image.info)
51+
for attr in ("width", "height"):
52+
dim = p2_image.info[attr]
53+
assert isinstance(dim, int | float)
54+
assert dim > 0, "Edge length should be positive"
55+
56+
# Check Figure 1 can be used to answer questions
57+
doc = Doc(
58+
docname="He2025",
59+
dockey="stub",
60+
citation=(
61+
'He, Yichen, et al. "PaSa: An LLM Agent for Comprehensive Academic Paper'
62+
' Search." *arXiv*, 2025, arXiv:2501.10120v1. Accessed 2025.'
63+
),
64+
)
65+
texts = chunk_pdf(parsed_text, doc=doc, chunk_chars=3000, overlap=100)
66+
# pylint: disable=duplicate-code
67+
fig_1_text = texts[1]
68+
assert (
69+
"Figure 1: Architecture of PaSa" in fig_1_text.text
70+
), "Expecting Figure 1 for the test to work"
71+
assert fig_1_text.media, "Expecting media to test multimodality"
72+
fig_1_text.text = "stub" # Replace text to confirm multimodality works
73+
docs = Docs()
74+
assert await docs.aadd_texts(texts=[fig_1_text], doc=doc)
75+
for query, substrings_min_counts in [
76+
("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
77+
("What actions can the Selector take?", [(("select", "drop"), 2)]),
78+
(
79+
"How many User Query are there, and what do they do?",
80+
[(("two", "2"), 2), (("crawler", "selector"), 2)],
81+
),
82+
]:
83+
session = await docs.aquery(query=query)
84+
assert session.contexts, "Expected contexts to be generated"
85+
assert all(
86+
c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
87+
for c in session.contexts
88+
), "Expected context to reuse Figure 1's text and media"
89+
for substrings, min_count in cast(
90+
list[tuple[tuple[str, ...], int]], substrings_min_counts
91+
):
92+
assert (
93+
sum(x in session.answer.lower() for x in substrings) >= min_count
94+
), f"Expected {session.answer=} to have at {substrings} present"
95+
96+
# Let's check the full page parsing behavior
97+
parsed_text_full_page = parse_pdf_to_pages(filepath, full_page=True)
98+
assert isinstance(parsed_text_full_page.content, dict)
99+
assert "1" in parsed_text_full_page.content, "Parsed text should contain page 1"
100+
assert "2" in parsed_text_full_page.content, "Parsed text should contain page 2"
101+
for page_num in ("1", "2"):
102+
page_content = parsed_text_full_page.content[page_num]
103+
assert not isinstance(page_content, str), f"Page {page_num} should have images"
104+
# Check each page has exactly one image
105+
page_text, (full_page_image,) = page_content
106+
assert page_text
107+
assert full_page_image.index == 0, "Full page image should have index 0"
108+
assert isinstance(full_page_image.data, bytes)
109+
assert len(full_page_image.data) > 0, "Full page image should have data"
110+
# Check useful attributes are present and are JSON serializable
111+
json.dumps(p2_image.info)
112+
for attr in ("width", "height"):
113+
dim = full_page_image.info[attr]
114+
assert isinstance(dim, int | float)
115+
assert dim > 0, "Edge length should be positive"
116+
117+
# Check the no-media behavior
118+
parsed_text_no_media = parse_pdf_to_pages(filepath, parse_media=False)
119+
assert isinstance(parsed_text_no_media.content, dict)
120+
assert all(isinstance(c, str) for c in parsed_text_no_media.content.values())
30121

31122
# Check metadata
32-
(parsing_library,) = parsed_text.metadata.parsing_libraries
33-
assert pymupdf.__name__ in parsing_library
34-
assert parsed_text.metadata.parse_type == "pdf"
123+
for pt in (parsed_text, parsed_text_full_page, parsed_text_no_media):
124+
(parsing_library,) = pt.metadata.parsing_libraries
125+
assert pymupdf.__name__ in parsing_library
126+
assert pt.metadata.parse_type == "pdf"
127+
128+
# Check commonalities across all modes
129+
assert (
130+
len(parsed_text.content)
131+
== len(parsed_text_full_page.content)
132+
== len(parsed_text_no_media.content)
133+
), "All modes should parse the same number of pages"
35134

36135

37136
def test_page_size_limit_denial() -> None:
    """Check that an overly small page size limit is rejected.

    Parsing the ``paper.pdf`` stub with a 10-character page limit is expected
    to raise ``ImpossibleParsingError`` with a message mentioning the
    "char limit".
    """
    tiny_limit = 10  # chars
    stub_pdf = STUB_DATA_DIR / "paper.pdf"
    with pytest.raises(ImpossibleParsingError, match="char limit"):
        parse_pdf_to_pages(stub_pdf, page_size_limit=tiny_limit)
139+
140+
141+
def test_table_parsing() -> None:
    """Check that tables are picked up as media when parsing ``influence.pdf``.

    The test asserts that the parsed content is a per-page dict containing
    page "1", and that at least two media entries of type "table" are found
    across all pages.
    """
    parsed = parse_pdf_to_pages(STUB_DATA_DIR / "influence.pdf")
    pages = parsed.content
    assert isinstance(pages, dict)

    # NOTE(review): with media parsing enabled, a page value appears to be a
    # (text, media) tuple, so the indexing below inspects the tuple's first and
    # last elements rather than the first and last characters of the page
    # text -- confirm this is the intended check.
    for page_content in pages.values():
        assert (
            page_content and page_content[0] != "\n" and page_content[-1] != "\n"
        ), "Expected no leading/trailing newlines in parsed text"
    assert "1" in pages, "Parsed text should contain page 1"

    # Collect the table-type media per page, skipping text-only pages
    tables_by_page = {}
    for page_num, page_content in pages.items():
        if not isinstance(page_content, tuple):
            continue
        tables_by_page[page_num] = [
            m for m in page_content[1] if m.info["type"] == "table"
        ]
    table_count = sum(map(len, tables_by_page.values()))
    assert table_count >= 2, "Expected a few tables to be parsed"

packages/paper-qa-pypdf/pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ name = "paper-qa-pypdf"
3333
readme = "README.md"
3434
requires-python = ">=3.11"
3535

36+
[project.optional-dependencies]
37+
media = [
38+
"pypdfium2>=4.22.0", # Pin for PYPDFIUM_INFO addition
39+
]
40+
3641
[tool.ruff]
3742
extend = "../../pyproject.toml"
3843

0 commit comments

Comments
 (0)