feat: introduce feature flag for caching split-PDF temporary data on disk

pawel-kmiecik · pawel-kmiecik · commit 7fcc9644af8c · 2024-11-05T15:39:18.000+01:00
diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from pathlib import Path
 from typing import TYPE_CHECKING
 from typing_extensions import TypeAlias
 
@@ -19,6 +20,8 @@
 PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
 PARTITION_FORM_PAGE_RANGE_KEY = "split_pdf_page_range[]"
 PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY = "split_pdf_allow_failed"
+PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY = "split_pdf_cache_tmp_data"
+PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY = "split_pdf_cache_tmp_data_dir"
 PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
 PARTITION_FORM_CONCURRENCY_LEVEL_KEY = "split_pdf_concurrency_level"
 
@@ -126,6 +129,71 @@ def get_split_pdf_allow_failed_param(
 
     return allow_failed.lower() == "true"
 
+def get_split_pdf_cache_tmp_data(
+    form_data: FormData, key: str, fallback_value: bool,
+) -> bool:
+    """Retrieves the 'cache tmp data' flag that should be used for splitting pdf.
+
+    The form value is accepted only when it is the string "true" or "false"
+    (compared case-insensitively). A value that is not a string yields the
+    default silently; any other string yields the default after a warning
+    is logged.
+
+    Args:
+        form_data: The form data containing the desired flag value.
+        key: The key to look for in the form data.
+        fallback_value: The default value to use in case of an error.
+
+    Returns:
+        The flag value for 'cache tmp data' feature after validation.
+    """
+    raw_flag = form_data.get(key)
+    if not isinstance(raw_flag, str):
+        return fallback_value
+
+    normalized_flag = raw_flag.lower()
+    if normalized_flag == "true":
+        return True
+    if normalized_flag == "false":
+        return False
+
+    # Any other string is rejected; tell the caller which key was ignored.
+    logger.warning(
+        "'%s' is not a valid boolean. Using default value '%s'.",
+        key,
+        fallback_value,
+    )
+    return fallback_value
+
+def get_split_pdf_cache_tmp_data_dir(
+    form_data: FormData, key: str, fallback_value: Path | str,
+) -> Path | str:
+    """Retrieves the directory in which temporary split-pdf data should be cached.
+
+    In case the given value is not a path to an existing directory, it will
+    use the default value.
+
+    Args:
+        form_data: The form data containing the desired directory.
+        key: The key to look for in the form data.
+        fallback_value: The default value to use in case of an error.
+
+    Returns:
+        The resolved directory path after validation, or `fallback_value`
+        unchanged when the form value is neither a `str` nor a `Path`, or
+        does not point to an existing directory.
+    """
+    cache_tmp_data_dir = form_data.get(key)
+
+    if not isinstance(cache_tmp_data_dir, (str, Path)):
+        return fallback_value
+
+    cache_tmp_data_path = Path(cache_tmp_data_dir)
+
+    # `exists()` alone would also accept a regular file, which is not usable
+    # as a directory to create temporary data in.
+    if not cache_tmp_data_path.is_dir():
+        logger.warning(
+            "'%s' is not an existing directory. Using default value '%s'.",
+            key,
+            fallback_value,
+        )
+        return fallback_value
+
+    return cache_tmp_data_path.resolve()
+
 
 def get_split_pdf_concurrency_level_param(
     form_data: FormData, key: str, fallback_value: int, max_allowed: int
diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py
@@ -4,7 +4,7 @@
 import io
 import json
 import logging
-from typing import Tuple, Any, BinaryIO
+from typing import Tuple, Any, BinaryIO, cast, IO
 
 import httpx
 from httpx._multipart import DataField, FileField
@@ -15,6 +15,8 @@
     PARTITION_FORM_FILES_KEY,
     PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
     PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
+    PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
+    PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY,
     PARTITION_FORM_PAGE_RANGE_KEY,
     PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
     FormData,
@@ -82,6 +84,8 @@ def create_pdf_chunk_request_params(
         PARTITION_FORM_PAGE_RANGE_KEY,
         PARTITION_FORM_PAGE_RANGE_KEY.replace("[]", ""),
         PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
+        PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
+        PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY,
     ]
     chunk_payload = {key: form_data[key] for key in form_data if key not in fields_to_drop}
     chunk_payload[PARTITION_FORM_SPLIT_PDF_PAGE_KEY] = "false"
@@ -110,9 +114,15 @@ def create_pdf_chunk_request(
     data = create_pdf_chunk_request_params(form_data, page_number)
     original_headers = prepare_request_headers(original_request.headers)
 
+    pdf_chunk_content = (
+        pdf_chunk_file.getvalue()
+        if isinstance(pdf_chunk_file, io.BytesIO)
+        else pdf_chunk_file
+    )
+
     pdf_chunk_partition_params = shared.PartitionParameters(
         files=shared.Files(
-            content=pdf_chunk_file,
+            content=pdf_chunk_content,
             file_name=filename,
             content_type="application/pdf",
         ),
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import io
 import json
 import logging
 import math
@@ -18,6 +19,7 @@
 from httpx import AsyncClient
 from pypdf import PdfReader, PdfWriter
 from requests_toolbelt.multipart.decoder import MultipartDecoder  # type: ignore
+from unstructured.chunking.dispatch import chunk
 
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
@@ -27,7 +29,7 @@
     PARTITION_FORM_PAGE_RANGE_KEY,
     PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
     PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
-    PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
+    PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
 )
 from unstructured_client._hooks.types import (
     AfterErrorContext,
@@ -45,6 +47,8 @@
 DEFAULT_STARTING_PAGE_NUMBER = 1
 DEFAULT_ALLOW_FAILED = False
 DEFAULT_CONCURRENCY_LEVEL = 10
+DEFAULT_CACHE_TMP_DATA = False
+DEFAULT_CACHE_TMP_DATA_DIR = tempfile.gettempdir()
 MAX_CONCURRENCY_LEVEL = 50
 MIN_PAGES_PER_SPLIT = 2
 MAX_PAGES_PER_SPLIT = 20
@@ -309,6 +313,17 @@ def before_request(
         )
         limiter = asyncio.Semaphore(concurrency_level)
 
+        self.cache_tmp_data_feature = form_utils.get_split_pdf_cache_tmp_data(
+            form_data,
+            key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
+            fallback_value=DEFAULT_CACHE_TMP_DATA,
+        )
+
+        self.cache_tmp_data_dir = form_utils.get_split_pdf_cache_tmp_data_dir(
+            form_data,
+            key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
+            fallback_value=DEFAULT_CACHE_TMP_DATA_DIR,
+        )
 
         page_range_start, page_range_end = form_utils.get_page_range(
             form_data,
@@ -327,16 +342,24 @@ def before_request(
         if split_size >= page_count and page_count == len(pdf.pages):
             return request
 
-        pdf_chunk_paths = self._get_pdf_chunk_paths(
-            pdf,
-            operation_id=operation_id,
-            split_size=split_size,
-            page_start=page_range_start,
-            page_end=page_range_end
-        )
-        # force free PDF object memory
-        del pdf
-        pdf_chunks = self._get_pdf_chunk_files(pdf_chunk_paths)
+        if self.cache_tmp_data_feature:
+            pdf_chunk_paths = self._get_pdf_chunk_paths(
+                pdf,
+                operation_id=operation_id,
+                split_size=split_size,
+                page_start=page_range_start,
+                page_end=page_range_end
+            )
+            # force free PDF object memory
+            del pdf
+            pdf_chunks = self._get_pdf_chunk_files(pdf_chunk_paths)
+        else:
+            pdf_chunks = self._get_pdf_chunks_in_memory(
+                pdf,
+                split_size=split_size,
+                page_start=page_range_start,
+                page_end=page_range_end
+            )
 
         self.coroutines_to_execute[operation_id] = []
         set_index = 1
@@ -393,19 +416,62 @@ async def call_api_partial(
         del response._request  # pylint: disable=protected-access
         response._request = None  # pylint: disable=protected-access
 
-        # If we get 200, dump the contents to a file and return the path
-        temp_dir = self.tempdirs[operation_id]
+
         if response.status_code == 200:
-            temp_file_name = f"{temp_dir.name}/{uuid.uuid4()}.json"
-            async with aiofiles.open(temp_file_name, mode='wb') as temp_file:
-                # Avoid reading the entire response into memory
-                async for bytes_chunk in response.aiter_bytes():
-                    await temp_file.write(bytes_chunk)
-            # we save the path in content attribute to be used in after_success
-            response._content = temp_file_name.encode()  # pylint: disable=protected-access
+            if self.cache_tmp_data_feature:
+                # If we get 200, dump the contents to a file and return the path
+                temp_dir = self.tempdirs[operation_id]
+                temp_file_name = f"{temp_dir.name}/{uuid.uuid4()}.json"
+                async with aiofiles.open(temp_file_name, mode='wb') as temp_file:
+                    # Avoid reading the entire response into memory
+                    async for bytes_chunk in response.aiter_bytes():
+                        await temp_file.write(bytes_chunk)
+                # we save the path in content attribute to be used in after_success
+                response._content = temp_file_name.encode()  # pylint: disable=protected-access
 
         return response
 
+    def _get_pdf_chunks_in_memory(
+        self,
+        pdf: PdfReader,
+        split_size: int = 1,
+        page_start: int = 1,
+        page_end: Optional[int] = None
+    ) -> Generator[Tuple[BinaryIO, int], None, None]:
+        """Splits the given pdf into n pdf-chunks, each with up to `split_size`
+        pages, keeping every chunk in memory. Each chunk is serialized into an
+        `io.BytesIO` buffer; chunks are produced lazily, one per iteration.
+
+        Args:
+            pdf: Reader of the PDF file to split.
+            split_size: Split size, e.g. if the given file has 10 pages
+                and this value is set to 2 it will yield 5 documents, each containing 2 pages
+                of the original document. By default it will split each page to a separate chunk.
+            page_start: Begin splitting at this page number
+            page_end: If provided, split up to and including this page number
+
+        Yields:
+            Tuples of the chunk buffer (rewound to its beginning) and the
+            zero-based offset of the chunk's first page in the original pdf.
+        """
+
+        offset = page_start - 1
+        offset_end = page_end or len(pdf.pages)
+
+        while offset < offset_end:
+            new_pdf = PdfWriter()
+            chunk_buffer = io.BytesIO()
+
+            # The last chunk may hold fewer than `split_size` pages.
+            end = min(offset + split_size, offset_end)
+
+            for page in pdf.pages[offset:end]:
+                new_pdf.add_page(page)
+            new_pdf.write(chunk_buffer)
+            # Rewind so that consumers read the chunk from its first byte.
+            chunk_buffer.seek(0)
+            yield chunk_buffer, offset
+            offset += split_size
+
     def _get_pdf_chunk_paths(
         self,
         pdf: PdfReader,
@@ -434,7 +500,8 @@ def _get_pdf_chunk_paths(
         offset_end = page_end or len(pdf.pages)
 
         tempdir = tempfile.TemporaryDirectory(  # pylint: disable=consider-using-with
-            suffix="unstructured_client"
+            dir=self.cache_tmp_data_dir,
+            prefix="unstructured_client_"
         )
         self.tempdirs[operation_id] = tempdir
         tempdir_path = Path(tempdir.name)
@@ -517,7 +584,10 @@ def _await_elements(
                     response_number,
                 )
                 successful_responses.append(res)
-                elements.append(load_elements_from_response(res))
+                if self.cache_tmp_data_feature:
+                    elements.append(load_elements_from_response(res))
+                else:
+                    elements.append(res.json())
             else:
                 error_message = f"Failed to partition set {response_number}."
 
diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py
@@ -124,6 +124,10 @@ class PartitionParametersTypedDict(TypedDict):
     r"""This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend."""
     split_pdf_page_range: NotRequired[List[int]]
     r"""When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend."""
+    split_pdf_cache_tmp_data: NotRequired[bool]
+    r"""When `split_pdf_page` is set to `True`, this parameter determines if the temporary data used for splitting the PDF should be cached into disc - if enabled should save significant amount of RAM memory when processing big files. It's an internal parameter for the Python client and is not sent to the backend."""
+    split_pdf_cache_tmp_data_dir: NotRequired[str]
+    r"""When `split_pdf_page` is set to `True` and `split_pdf_cache_tmp_data` feature is used, this parameter specifies the directory where the temporary data used for splitting the PDF should be cached into disc. It's an internal parameter for the Python client and is not sent to the backend."""
     starting_page_number: NotRequired[Nullable[int]]
     r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27."""
     strategy: NotRequired[Strategy]