Skip to content

Commit fdbe0c7

Browse files
committed
Add logic to split pages that are too large to process
1 parent a9b7b0b commit fdbe0c7

File tree

3 files changed

+62
-0
lines changed

3 files changed

+62
-0
lines changed

_sample_docs/super_long_pages.pdf

1.38 MB
Binary file not shown.

_test_unstructured_client/integration/test_decorators.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,21 @@ def test_integration_split_pdf_with_caching(
185185
if cache_dir:
186186
assert not Path(cache_dir).exists()
187187

188+
@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
    """Partition a PDF containing extremely tall pages with the hi_res
    strategy; the split-PDF hook is expected to slice oversized pages so
    the API can process them.

    Fix over the original: the file handle is opened via a context manager
    instead of an inline ``open(...)`` that was never closed (leaked handle).
    """
    with open(filename, "rb") as pdf_file:
        req = operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=pdf_file,
                    file_name=filename,
                ),
                strategy=shared.Strategy.HI_RES,
                split_pdf_page=True,
                split_pdf_allow_failed=True,
                split_pdf_concurrency_level=15,
            ),
        )

        # NOTE(review): FAKE_KEY and a localhost server are used — this test
        # requires a local API instance on port 8000 to be running.
        client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

        response = client.general.partition(request=req)
        assert response.status_code == 200
        assert len(response.elements)
189203

190204
def test_integration_split_pdf_for_file_with_no_name():
191205
"""

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,8 @@ def before_request(
317317
fallback_value=DEFAULT_CACHE_TMP_DATA_DIR,
318318
)
319319

320+
pdf = self._clean_large_pages(pdf)
321+
320322
page_range_start, page_range_end = form_utils.get_page_range(
321323
form_data,
322324
key=PARTITION_FORM_PAGE_RANGE_KEY.replace("[]", ""),
@@ -423,6 +425,52 @@ async def call_api_partial(
423425

424426
return response
425427

428+
def _clean_large_pages(
    self,
    pdf: PdfReader,
    max_page_length: float = 4000,
) -> PdfReader:
    """Split any page taller than ``max_page_length`` into multiple pages.

    Each oversized page is added to the output once per vertical band, and
    each copy's mediabox is then cropped to its band, so downstream
    processing sees several normal-height pages instead of one huge page.
    Pages already within the limit are passed through unchanged.

    Args:
        pdf: Source document.
        max_page_length: Maximum allowed page height in PDF user-space
            units. Defaults to 4000 (the original hard-coded limit).

    Returns:
        The original reader untouched when no page exceeds the limit,
        otherwise a new ``PdfReader`` built from the cropped copies.
    """
    # Early exit in the common case — avoids rewriting the whole document.
    # Uses the same `<=` comparison as the split loop below (the original
    # flagged pages with `>=` but split only pages with `>`, so a page of
    # exactly max_page_length forced a pointless rewrite).
    if all(page.mediabox.height <= max_page_length for page in pdf.pages):
        return pdf

    writer = PdfWriter()
    # Maps 1-based output page number -> vertical band to crop to.
    crop_bands: dict[int, dict[str, float]] = {}
    out_page_num = 0

    for page in pdf.pages:
        height = page.mediabox.height
        if height <= max_page_length:
            out_page_num += 1
            writer.add_page(page)
            continue

        num_bands = math.ceil(height / max_page_length)

        # Walk down the page one band at a time.
        # NOTE(review): treating `height` as the top y-coordinate assumes
        # the mediabox bottom sits at y == 0 — confirm for documents with
        # a non-zero mediabox origin.
        band_top = height
        for _ in range(num_bands):
            out_page_num += 1
            # Clamp the final band so the mediabox never extends below 0
            # (the original let the last band's bottom go negative when
            # the height was not an exact multiple of max_page_length).
            band_bottom = max(band_top - max_page_length, 0)
            crop_bands[out_page_num] = {"top": band_top, "bottom": band_bottom}
            writer.add_page(page)
            band_top = band_bottom

    # Crop after all pages are added; look each band up exactly once.
    for page_num, page in enumerate(writer.pages, start=1):
        band = crop_bands.get(page_num)
        if band is None:
            continue
        page.mediabox.top = band["top"]
        page.mediabox.bottom = band["bottom"]

    chunk_buffer = io.BytesIO()
    writer.write(chunk_buffer)
    chunk_buffer.seek(0)
    return PdfReader(chunk_buffer)
473+
426474
def _get_pdf_chunks_in_memory(
427475
self,
428476
pdf: PdfReader,

0 commit comments

Comments
 (0)