@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
    """Regression test: hi_res split-page partitioning succeeds on a PDF
    whose pages are taller than the maximum supported page height.

    Sends the sample document through the local API with page splitting
    enabled and asserts the request round-trips successfully and yields
    at least one element.
    """
    # Open inside a context manager so the handle is closed even if the
    # request or the assertions fail (the original leaked the file object).
    with open(filename, "rb") as pdf_file:
        req = operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=pdf_file,
                    file_name=filename,
                ),
                strategy=shared.Strategy.HI_RES,
                split_pdf_page=True,
                split_pdf_allow_failed=True,
                split_pdf_concurrency_level=15,
            ),
        )

        client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
        response = client.general.partition(request=req)

    assert response.status_code == 200
    # Explicit comparison instead of bare truthiness on len().
    assert len(response.elements) > 0
self._get_pdf_chunk_paths( pdf, @@ -423,6 +427,34 @@ async def call_api_partial( return response + def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader: + if form_data['strategy'] != HI_RES_STRATEGY: + return pdf + + max_page_length = MAX_PAGE_LENGTH + any_page_over_maximum_length = False + for page in pdf.pages: + if page.mediabox.height >= max_page_length: + any_page_over_maximum_length = True + + # early exit if all pages are safely under the max page length + if not any_page_over_maximum_length: + return pdf + + w = PdfWriter() + + # trims large pages that exceed the maximum supported height for processing + for page in pdf.pages: + if page.mediabox.height >= max_page_length: + page.mediabox.top = page.mediabox.height + page.mediabox.bottom = page.mediabox.top - max_page_length + w.add_page(page) + + chunk_buffer = io.BytesIO() + w.write(chunk_buffer) + chunk_buffer.seek(0) + return PdfReader(chunk_buffer) + def _get_pdf_chunks_in_memory( self, pdf: PdfReader,