Skip to content

Commit e2c5b3f

Browse files
committed
feat: add support for tall pages in pdfs by splitting them horizontally
1 parent c1653da commit e2c5b3f

File tree

1 file changed

+207
-3
lines changed

1 file changed

+207
-3
lines changed

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 207 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import asyncio
4+
import copy
45
import io
56
import json
67
import logging
@@ -13,6 +14,8 @@
1314
from pathlib import Path
1415
from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO
1516

17+
from PIL import Image
18+
1619
import aiofiles
1720
import httpx
1821
import nest_asyncio # type: ignore
@@ -55,6 +58,7 @@
5558
MAX_PAGES_PER_SPLIT = 20
5659
HI_RES_STRATEGY = 'hi_res'
5760
MAX_PAGE_LENGTH = 4000
61+
TALL_PAGE_ASPECT_RATIO_THRESHOLD = 1.5
5862

5963

6064
async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
@@ -304,6 +308,10 @@ def before_request(
304308
return request
305309

306310
pdf = pdf_utils.check_pdf(pdf)
311+
312+
original_page_count = len(pdf.pages)
313+
pdf = self._split_tall_pages(pdf)
314+
image_processing_performed = (len(pdf.pages) != original_page_count)
307315

308316
starting_page_number = form_utils.get_starting_page_number(
309317
form_data,
@@ -349,10 +357,28 @@ def before_request(
349357
num_pages=page_count, concurrency_level=concurrency_level
350358
)
351359

352-
# If the doc is small enough, and we aren't slicing it with a page range:
353-
# do not split, just continue with the original request
354-
if split_size >= page_count and page_count == len(pdf.pages):
360+
# If the doc is small enough, and we aren't slicing it with a page range,
361+
# AND no image processing was performed: do not split, just continue with the original request
362+
if split_size >= page_count and page_count == len(pdf.pages) and not image_processing_performed:
355363
return request
364+
365+
# If image processing was performed, we need to send the processed PDF even for single pages
366+
if image_processing_performed and len(pdf.pages) == 1:
367+
# Create a single chunk with the processed PDF
368+
processed_pdf_data = io.BytesIO()
369+
pdf_writer = PdfWriter()
370+
pdf_writer.add_page(pdf.pages[0])
371+
pdf_writer.write(processed_pdf_data)
372+
processed_pdf_data.seek(0)
373+
374+
# Create new request with processed PDF
375+
processed_request = request_utils.create_pdf_chunk_request(
376+
form_data=form_data,
377+
pdf_chunk=(processed_pdf_data, 1),
378+
filename=pdf_file_meta["filename"],
379+
original_request=request,
380+
)
381+
return processed_request
356382

357383
pdf = self._trim_large_pages(pdf, form_data)
358384

@@ -445,6 +471,184 @@ async def call_api_partial(
445471

446472
return response
447473

474+
def _split_tall_pages(self, pdf: PdfReader) -> PdfReader:
    """Replace every disproportionately tall page of `pdf` with shorter pages.

    A page is considered tall when its media-box height divided by its
    media-box width exceeds TALL_PAGE_ASPECT_RATIO_THRESHOLD. Such a page is
    replaced by ceil(ratio / threshold) pages. They come from
    `_split_page_with_image_processing` when it returns more than one part,
    and from `_add_media_box_split_pages` otherwise, including when anything
    inside the image path raises.

    Args:
        pdf: The reader whose pages are inspected.

    Returns:
        `pdf` itself when no page was tall; otherwise a new PdfReader over an
        in-memory document containing the untouched and the split pages.
    """
    output = PdfWriter()
    modified = False

    for page in pdf.pages:
        page_height = float(page.mediabox.height)
        page_width = float(page.mediabox.width)

        # A zero-width page has no defined ratio; pass it through unchanged.
        if page_width == 0:
            output.add_page(page)
            continue

        aspect_ratio = page_height / page_width
        logger.info(f"Page aspect ratio: {aspect_ratio:.2f} (threshold: {TALL_PAGE_ASPECT_RATIO_THRESHOLD})")

        if aspect_ratio <= TALL_PAGE_ASPECT_RATIO_THRESHOLD:
            output.add_page(page)
            continue

        modified = True
        num_splits = math.ceil(aspect_ratio / TALL_PAGE_ASPECT_RATIO_THRESHOLD)
        logger.info(f"Target splits: {num_splits} parts")

        # Everything below stays inside the try on purpose: any failure in
        # the image path ends in the media-box fallback.
        try:
            split_pages = self._split_page_with_image_processing(page, num_splits)
            if not split_pages or len(split_pages) <= 1:
                logger.warning("Image processing failed - no valid splits returned")
                self._add_media_box_split_pages(output, page, num_splits, page_height)
            else:
                logger.info(f"Image processing succeeded: {len(split_pages)} parts")
                for part in split_pages:
                    output.add_page(part)
        except Exception as e:
            logger.error(f"Image processing exception: {e}")
            self._add_media_box_split_pages(output, page, num_splits, page_height)

    if not modified:
        return pdf

    # Serialise the rebuilt document and hand back a reader over it.
    stream = io.BytesIO()
    output.write(stream)
    stream.seek(0)
    return PdfReader(stream)
520+
521+
def _split_page_with_image_processing(self, page, num_splits):
522+
"""Split a page by extracting and processing its images."""
523+
if "/Resources" not in page or "/XObject" not in page["/Resources"]:
524+
return None
525+
526+
xobjects = page["/Resources"]["/XObject"]
527+
528+
for obj_name, obj in xobjects.items():
529+
if hasattr(obj, 'get_object'):
530+
obj = obj.get_object()
531+
532+
if obj.get("/Subtype") == "/Image":
533+
width = int(obj.get("/Width", 0))
534+
height = int(obj.get("/Height", 0))
535+
original_pixels = width * height
536+
537+
image_data = self._extract_image_data(obj)
538+
if not image_data:
539+
continue
540+
541+
try:
542+
pil_image = Image.open(io.BytesIO(image_data))
543+
except Exception as e:
544+
continue
545+
546+
# Calculate target resolution to stay under API limits
547+
# API limit is ~179M pixels, let's target 80M pixels total for safety margin
548+
target_pixels_total = 80_000_000
549+
target_pixels_per_split = target_pixels_total // num_splits
550+
551+
# Calculate scale factor if we need to reduce resolution
552+
scale_factor = 1.0
553+
if original_pixels > target_pixels_per_split:
554+
scale_factor = (target_pixels_per_split / original_pixels) ** 0.5
555+
556+
# Apply scaling
557+
new_width = int(pil_image.width * scale_factor)
558+
new_height = int(pil_image.height * scale_factor)
559+
pil_image = pil_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
560+
561+
strip_height = pil_image.height // num_splits
562+
total_split_pixels = 0
563+
split_pages = []
564+
565+
for i in range(num_splits):
566+
top = i * strip_height
567+
bottom = min((i + 1) * strip_height, pil_image.height)
568+
569+
cropped_image = pil_image.crop((0, top, pil_image.width, bottom))
570+
strip_pixels = cropped_image.width * cropped_image.height
571+
total_split_pixels += strip_pixels
572+
573+
new_page = self._create_page_with_image(cropped_image, page)
574+
if new_page:
575+
split_pages.append(new_page)
576+
else:
577+
return None
578+
579+
if split_pages and len(split_pages) == num_splits:
580+
return split_pages
581+
582+
return None
583+
584+
def _extract_image_data(self, image_obj):
585+
"""Extract raw image data from a PDF image object."""
586+
try:
587+
if "/Filter" in image_obj:
588+
filter_type = image_obj["/Filter"]
589+
590+
if filter_type in ["/DCTDecode", "/JPXDecode"]:
591+
# JPEG or JPEG2000 - data is already compressed
592+
data = image_obj._data
593+
return data
594+
elif filter_type == "/FlateDecode":
595+
# PNG-like compression
596+
import zlib
597+
compressed_data = image_obj._data
598+
data = zlib.decompress(compressed_data)
599+
return data
600+
601+
# Fallback to raw data
602+
data = image_obj._data
603+
return data
604+
605+
except Exception as e:
606+
return None
607+
608+
def _create_page_with_image(self, pil_image, original_page):
609+
"""Create a new PDF page containing the given PIL image."""
610+
try:
611+
img_buffer = io.BytesIO()
612+
613+
# Convert to RGB if necessary
614+
if pil_image.mode != 'RGB':
615+
pil_image = pil_image.convert('RGB')
616+
617+
# Save the image as PDF
618+
pil_image.save(img_buffer, format='PDF')
619+
img_buffer.seek(0)
620+
621+
# Create a new PDF reader from the image
622+
img_pdf = PdfReader(img_buffer)
623+
if not img_pdf.pages:
624+
return None
625+
626+
new_page = img_pdf.pages[0]
627+
return new_page
628+
629+
except Exception as e:
630+
return None
631+
632+
def _add_media_box_split_pages(self, writer, page, num_splits, page_height):
633+
"""Fallback method to add pages with media box splitting (original approach)."""
634+
split_height = page_height / num_splits
635+
636+
for i in range(num_splits):
637+
# Create a deep copy to modify the media box independently
638+
new_page = copy.deepcopy(page)
639+
640+
# Calculate new coordinates for the crop
641+
top_coord = page.mediabox.top - (i * split_height)
642+
bottom_coord = page.mediabox.top - ((i + 1) * split_height)
643+
644+
# Set the new media box to crop the page
645+
new_page.mediabox.lower_left = (page.mediabox.left, bottom_coord)
646+
new_page.mediabox.lower_right = (page.mediabox.right, bottom_coord)
647+
new_page.mediabox.upper_left = (page.mediabox.left, top_coord)
648+
new_page.mediabox.upper_right = (page.mediabox.right, top_coord)
649+
650+
writer.add_page(new_page)
651+
448652
def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
449653
if form_data['strategy'] != HI_RES_STRATEGY:
450654
return pdf

0 commit comments

Comments
 (0)