1
+ import base64
2
+ import json
1
3
from pathlib import Path
4
+ from typing import cast
2
5
3
6
import pymupdf
4
7
import pytest
5
- from paperqa .readers import PDFParserFn
6
- from paperqa .utils import ImpossibleParsingError
8
+ from paperqa import Doc , Docs
9
+ from paperqa .readers import PDFParserFn , chunk_pdf
10
+ from paperqa .utils import ImpossibleParsingError , bytes_to_string
7
11
8
12
from paperqa_pymupdf import parse_pdf_to_pages
9
13
10
14
REPO_ROOT = Path (__file__ ).parents [3 ]
11
15
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
12
16
13
17
14
- def test_parse_pdf_to_pages () -> None :
18
+ @pytest .mark .asyncio
19
+ async def test_parse_pdf_to_pages () -> None :
15
20
assert isinstance (parse_pdf_to_pages , PDFParserFn )
16
21
17
22
filepath = STUB_DATA_DIR / "pasa.pdf"
@@ -21,19 +26,131 @@ def test_parse_pdf_to_pages() -> None:
21
26
assert (
22
27
"Abstract\n \n We introduce PaSa, an advanced Paper Search"
23
28
"\n agent powered by large language models."
24
- ) in parsed_text .content ["1" ], "Block parsing failed to handle abstract"
29
+ ) in parsed_text .content ["1" ][ 0 ] , "Block parsing failed to handle abstract"
25
30
26
- # Check Figure 1
27
- p2_text = parsed_text .content ["2" ]
31
+ # Check the images in Figure 1
32
+ assert not isinstance (parsed_text .content ["2" ], str )
33
+ p2_text , p2_media = parsed_text .content ["2" ]
28
34
assert "Figure 1" in p2_text , "Expected Figure 1 title"
29
35
assert "Crawler" in p2_text , "Expected Figure 1 contents"
36
+ (p2_image ,) = [m for m in p2_media if m .info ["type" ] == "drawing" ]
37
+ assert p2_image .index == 0
38
+ assert isinstance (p2_image .data , bytes )
39
+
40
+ # Check the image is valid base64
41
+ base64_data = bytes_to_string (p2_image .data )
42
+ assert base64_data
43
+ assert base64 .b64decode (base64_data , validate = True ) == p2_image .data
44
+
45
+ # Check we can round-trip serialize the image
46
+ serde_p2_image = type (p2_image ).model_validate_json (p2_image .model_dump_json ())
47
+ assert serde_p2_image == p2_image
48
+
49
+ # Check useful attributes are present and are JSON serializable
50
+ json .dumps (p2_image .info )
51
+ for attr in ("width" , "height" ):
52
+ dim = p2_image .info [attr ]
53
+ assert isinstance (dim , int | float )
54
+ assert dim > 0 , "Edge length should be positive"
55
+
56
+ # Check Figure 1 can be used to answer questions
57
+ doc = Doc (
58
+ docname = "He2025" ,
59
+ dockey = "stub" ,
60
+ citation = (
61
+ 'He, Yichen, et al. "PaSa: An LLM Agent for Comprehensive Academic Paper'
62
+ ' Search." *arXiv*, 2025, arXiv:2501.10120v1. Accessed 2025.'
63
+ ),
64
+ )
65
+ texts = chunk_pdf (parsed_text , doc = doc , chunk_chars = 3000 , overlap = 100 )
66
+ # pylint: disable=duplicate-code
67
+ fig_1_text = texts [1 ]
68
+ assert (
69
+ "Figure 1: Architecture of PaSa" in fig_1_text .text
70
+ ), "Expecting Figure 1 for the test to work"
71
+ assert fig_1_text .media , "Expecting media to test multimodality"
72
+ fig_1_text .text = "stub" # Replace text to confirm multimodality works
73
+ docs = Docs ()
74
+ assert await docs .aadd_texts (texts = [fig_1_text ], doc = doc )
75
+ for query , substrings_min_counts in [
76
+ ("What actions can the Crawler take?" , [(("search" , "expand" , "stop" ), 2 )]),
77
+ ("What actions can the Selector take?" , [(("select" , "drop" ), 2 )]),
78
+ (
79
+ "How many User Query are there, and what do they do?" ,
80
+ [(("two" , "2" ), 2 ), (("crawler" , "selector" ), 2 )],
81
+ ),
82
+ ]:
83
+ session = await docs .aquery (query = query )
84
+ assert session .contexts , "Expected contexts to be generated"
85
+ assert all (
86
+ c .text .text == fig_1_text .text and c .text .media == fig_1_text .media
87
+ for c in session .contexts
88
+ ), "Expected context to reuse Figure 1's text and media"
89
+ for substrings , min_count in cast (
90
+ list [tuple [tuple [str , ...], int ]], substrings_min_counts
91
+ ):
92
+ assert (
93
+ sum (x in session .answer .lower () for x in substrings ) >= min_count
94
+ ), f"Expected { session .answer = } to have at { substrings } present"
95
+
96
+ # Let's check the full page parsing behavior
97
+ parsed_text_full_page = parse_pdf_to_pages (filepath , full_page = True )
98
+ assert isinstance (parsed_text_full_page .content , dict )
99
+ assert "1" in parsed_text_full_page .content , "Parsed text should contain page 1"
100
+ assert "2" in parsed_text_full_page .content , "Parsed text should contain page 2"
101
+ for page_num in ("1" , "2" ):
102
+ page_content = parsed_text_full_page .content [page_num ]
103
+ assert not isinstance (page_content , str ), f"Page { page_num } should have images"
104
+ # Check each page has exactly one image
105
+ page_text , (full_page_image ,) = page_content
106
+ assert page_text
107
+ assert full_page_image .index == 0 , "Full page image should have index 0"
108
+ assert isinstance (full_page_image .data , bytes )
109
+ assert len (full_page_image .data ) > 0 , "Full page image should have data"
110
+ # Check useful attributes are present and are JSON serializable
111
+ json .dumps (p2_image .info )
112
+ for attr in ("width" , "height" ):
113
+ dim = full_page_image .info [attr ]
114
+ assert isinstance (dim , int | float )
115
+ assert dim > 0 , "Edge length should be positive"
116
+
117
+ # Check the no-media behavior
118
+ parsed_text_no_media = parse_pdf_to_pages (filepath , parse_media = False )
119
+ assert isinstance (parsed_text_no_media .content , dict )
120
+ assert all (isinstance (c , str ) for c in parsed_text_no_media .content .values ())
30
121
31
122
# Check metadata
32
- (parsing_library ,) = parsed_text .metadata .parsing_libraries
33
- assert pymupdf .__name__ in parsing_library
34
- assert parsed_text .metadata .parse_type == "pdf"
123
+ for pt in (parsed_text , parsed_text_full_page , parsed_text_no_media ):
124
+ (parsing_library ,) = pt .metadata .parsing_libraries
125
+ assert pymupdf .__name__ in parsing_library
126
+ assert pt .metadata .parse_type == "pdf"
127
+
128
+ # Check commonalities across all modes
129
+ assert (
130
+ len (parsed_text .content )
131
+ == len (parsed_text_full_page .content )
132
+ == len (parsed_text_no_media .content )
133
+ ), "All modes should parse the same number of pages"
35
134
36
135
37
136
def test_page_size_limit_denial() -> None:
    """An absurdly small per-page character budget should abort parsing."""
    tiny_limit = 10  # chars -- far smaller than any real page of text
    with pytest.raises(ImpossibleParsingError, match="char limit"):
        parse_pdf_to_pages(STUB_DATA_DIR / "paper.pdf", page_size_limit=tiny_limit)
139
+
140
+
141
def test_table_parsing() -> None:
    """Tables in influence.pdf should be extracted as 'table' media entries."""
    parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "influence.pdf")
    assert isinstance(parsed_text.content, dict)
    assert all(
        t and t[0] != "\n" and t[-1] != "\n" for t in parsed_text.content.values()
    ), "Expected no leading/trailing newlines in parsed text"
    assert "1" in parsed_text.content, "Parsed text should contain page 1"
    # Collect, per page, the media entries tagged as tables. Pages without
    # media parse to plain strings, so only tuple-valued pages are inspected.
    all_tables = {}
    for page_num, page_value in parsed_text.content.items():
        if not isinstance(page_value, tuple):
            continue
        page_media = page_value[1]
        all_tables[page_num] = [m for m in page_media if m.info["type"] == "table"]
    total_tables = sum(len(tables) for tables in all_tables.values())
    assert total_tables >= 2, "Expected a few tables to be parsed"
0 commit comments