Skip to content

Commit fdbe0c7

Browse files
committed
Add logic to split pages that are too large to process
1 parent a9b7b0b commit fdbe0c7

File tree

3 files changed

+62
-0
lines changed

3 files changed

+62
-0
lines changed

_sample_docs/super_long_pages.pdf

1.38 MB
Binary file not shown.

_test_unstructured_client/integration/test_decorators.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,21 @@ def test_integration_split_pdf_with_caching(
185185
if cache_dir:
186186
assert not Path(cache_dir).exists()
187187

188+
@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
    """Partition a PDF containing extremely tall pages with the hi_res
    strategy; the split-PDF hook is expected to slice oversized pages so
    the API can process them.

    Fix over the original: the file handle is opened via a context manager
    instead of an inline ``open(...)`` that was never closed (leaked handle).
    """
    with open(filename, "rb") as pdf_file:
        req = operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=pdf_file,
                    file_name=filename,
                ),
                strategy=shared.Strategy.HI_RES,
                split_pdf_page=True,
                split_pdf_allow_failed=True,
                split_pdf_concurrency_level=15,
            ),
        )

        # NOTE(review): FAKE_KEY and a localhost server are used — this test
        # requires a local API instance on port 8000 to be running.
        client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

        response = client.general.partition(request=req)
        assert response.status_code == 200
        assert len(response.elements)
189203

190204
def test_integration_split_pdf_for_file_with_no_name():
191205
"""

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,8 @@ def before_request(
317317
fallback_value=DEFAULT_CACHE_TMP_DATA_DIR,
318318
)
319319

320+
pdf = self._clean_large_pages(pdf)
321+
320322
page_range_start, page_range_end = form_utils.get_page_range(
321323
form_data,
322324
key=PARTITION_FORM_PAGE_RANGE_KEY.replace("[]", ""),
@@ -423,6 +425,52 @@ async def call_api_partial(
423425

424426
return response
425427

428+
def _clean_large_pages(
    self,
    pdf: PdfReader,
    max_page_length: float = 4000,
) -> PdfReader:
    """Split any page taller than ``max_page_length`` into multiple pages.

    Each oversized page is added to the output once per vertical band, and
    each copy's mediabox is then cropped to its band, so downstream
    processing sees several normal-height pages instead of one huge page.
    Pages already within the limit are passed through unchanged.

    Args:
        pdf: Source document.
        max_page_length: Maximum allowed page height in PDF user-space
            units. Defaults to 4000 (the original hard-coded limit).

    Returns:
        The original reader untouched when no page exceeds the limit,
        otherwise a new ``PdfReader`` built from the cropped copies.
    """
    # Early exit in the common case — avoids rewriting the whole document.
    # Uses the same `<=` comparison as the split loop below (the original
    # flagged pages with `>=` but split only pages with `>`, so a page of
    # exactly max_page_length forced a pointless rewrite).
    if all(page.mediabox.height <= max_page_length for page in pdf.pages):
        return pdf

    writer = PdfWriter()
    # Maps 1-based output page number -> vertical band to crop to.
    crop_bands: dict[int, dict[str, float]] = {}
    out_page_num = 0

    for page in pdf.pages:
        height = page.mediabox.height
        if height <= max_page_length:
            out_page_num += 1
            writer.add_page(page)
            continue

        num_bands = math.ceil(height / max_page_length)

        # Walk down the page one band at a time.
        # NOTE(review): treating `height` as the top y-coordinate assumes
        # the mediabox bottom sits at y == 0 — confirm for documents with
        # a non-zero mediabox origin.
        band_top = height
        for _ in range(num_bands):
            out_page_num += 1
            # Clamp the final band so the mediabox never extends below 0
            # (the original let the last band's bottom go negative when
            # the height was not an exact multiple of max_page_length).
            band_bottom = max(band_top - max_page_length, 0)
            crop_bands[out_page_num] = {"top": band_top, "bottom": band_bottom}
            writer.add_page(page)
            band_top = band_bottom

    # Crop after all pages are added; look each band up exactly once.
    for page_num, page in enumerate(writer.pages, start=1):
        band = crop_bands.get(page_num)
        if band is None:
            continue
        page.mediabox.top = band["top"]
        page.mediabox.bottom = band["bottom"]

    chunk_buffer = io.BytesIO()
    writer.write(chunk_buffer)
    chunk_buffer.seek(0)
    return PdfReader(chunk_buffer)
473+
426474
def _get_pdf_chunks_in_memory(
427475
self,
428476
pdf: PdfReader,

0 commit comments

Comments
 (0)