Merge branch 'main' into pawel/fix-split-pdf-memory-usage

pawel-kmiecik · pawel-kmiecik · commit f2582a005ebf · 2024-10-30T13:01:56.000+01:00
diff --git a/.speakeasy/gen.lock b/.speakeasy/gen.lock
@@ -3,10 +3,10 @@ id: 8b5fa338-9106-4734-abf0-e30d67044a90
 management:
   docChecksum: 21f469b38bb72725739ee9d9d0fc8780
   docVersion: 1.0.51
-  speakeasyVersion: 1.418.1
-  generationVersion: 2.438.3
-  releaseVersion: 0.26.1
-  configChecksum: 55ded3ef4f1b052725cdab6587da0ea4
+  speakeasyVersion: 1.422.1
+  generationVersion: 2.438.15
+  releaseVersion: 0.26.2
+  configChecksum: c46fa7f108a08d4565530aa29da677b5
   repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
   repoSubDirectory: .
   installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
diff --git a/.speakeasy/workflow.lock b/.speakeasy/workflow.lock
@@ -1,8 +1,8 @@
-speakeasyVersion: 1.418.1
+speakeasyVersion: 1.422.1
 sources:
     my-source:
         sourceNamespace: my-source
-        sourceRevisionDigest: sha256:a820d523af3e56f0dd1fc84f4f29e15330edb26cc253e93981bddb4a5176ac3c
+        sourceRevisionDigest: sha256:31c94056ebc941cdfcf3fd4ba5e04880e978740963f7ce79169ba66cd033d74d
         sourceBlobDigest: sha256:27e4879df402e924f9f65d336ea6d2fc8b16a00b87b4a802866238f7e9f639d3
         tags:
             - latest
@@ -11,7 +11,7 @@ targets:
     unstructured-python:
         source: my-source
         sourceNamespace: my-source
-        sourceRevisionDigest: sha256:a820d523af3e56f0dd1fc84f4f29e15330edb26cc253e93981bddb4a5176ac3c
+        sourceRevisionDigest: sha256:31c94056ebc941cdfcf3fd4ba5e04880e978740963f7ce79169ba66cd033d74d
         sourceBlobDigest: sha256:27e4879df402e924f9f65d336ea6d2fc8b16a00b87b4a802866238f7e9f639d3
 workflow:
     workflowVersion: 1.0.0
diff --git a/RELEASES.md b/RELEASES.md
@@ -704,4 +704,14 @@ Based on:
 ### Generated
 - [python v0.26.1] .
 ### Releases
-- [PyPI v0.26.1] https://pypi.org/project/unstructured-client/0.26.1 - .
+- [PyPI v0.26.1] https://pypi.org/project/unstructured-client/0.26.1 - .
+
+## 2024-10-28 00:09:56
+### Changes
+Based on:
+- OpenAPI Doc  
+- Speakeasy CLI 1.422.1 (2.438.15) https://github.com/speakeasy-api/speakeasy
+### Generated
+- [python v0.26.2] .
+### Releases
+- [PyPI v0.26.2] https://pypi.org/project/unstructured-client/0.26.2 - .
diff --git a/gen.yaml b/gen.yaml
@@ -10,7 +10,7 @@ generation:
   auth:
     oAuth2ClientCredentialsEnabled: false
 python:
-  version: 0.26.1
+  version: 0.26.2
   additionalDependencies:
     dev:
       deepdiff: '>=6.0'
diff --git a/src/unstructured_client/_hooks/custom/logger_hook.py b/src/unstructured_client/_hooks/custom/logger_hook.py
@@ -77,13 +77,10 @@ def after_error(
         if response and response.status_code == 200:
             # NOTE: Even though this is an after_error method, due to split_pdf_hook logic we may get
             # a success here when one of the split requests was partitioned successfully
-            logger.info("Successfully partitioned the document.")
-        
-        else:
-            logger.error("Failed to partition the document.")
-            if response:
-                logger.error("Server responded with %d - %s", response.status_code, response.text)
-            if error is not None:
-                logger.error("Following error occurred - %s", error)
-        
+            return response, error
+        logger.error("Failed to partition the document.")
+        if response:
+            logger.error("Server responded with %d - %s", response.status_code, response.text)
+        if error is not None:
+            logger.error("Following error occurred - %s", error)
         return response, error
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -146,7 +146,6 @@ def sdk_init(
         Returns:
             Tuple[str, HttpClient]: The initialized SDK options.
         """
-
         class DummyTransport(httpx.BaseTransport):
             def __init__(self, base_transport: httpx.BaseTransport):
                 self.base_transport = base_transport
@@ -238,39 +237,32 @@ def before_request(
         if split_pdf_page is None or split_pdf_page == "false":
             return request
 
-        logger.info("Preparing to split document for partition.")
         file = form_data.get(PARTITION_FORM_FILES_KEY)
         if (
                 file is None
                 or not isinstance(file, shared.Files)
                 or not pdf_utils.is_pdf(file)
         ):
-            logger.info("Partitioning without split.")
             return request
 
         starting_page_number = form_utils.get_starting_page_number(
             form_data,
             key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
             fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
         )
-        if starting_page_number > 1:
-            logger.info("Starting page number set to %d", starting_page_number)
-        logger.info("Starting page number set to %d", starting_page_number)
 
         self.allow_failed = form_utils.get_split_pdf_allow_failed_param(
             form_data,
             key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
             fallback_value=DEFAULT_ALLOW_FAILED,
         )
-        logger.info("Allow failed set to %d", self.allow_failed)
 
         concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
             form_data,
             key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
             fallback_value=DEFAULT_CONCURRENCY_LEVEL,
             max_allowed=MAX_CONCURRENCY_LEVEL,
         )
-        logger.info("Concurrency level set to %d", concurrency_level)
         limiter = asyncio.Semaphore(concurrency_level)
 
         content = cast(bytes, file.content)
@@ -283,25 +275,14 @@ def before_request(
         )
 
         page_count = page_range_end - page_range_start + 1
-        logger.info(
-            "Splitting pages %d to %d (%d total)",
-            page_range_start,
-            page_range_end,
-            page_count,
-        )
 
         split_size = get_optimal_split_size(
             num_pages=page_count, concurrency_level=concurrency_level
         )
-        logger.info("Determined optimal split size of %d pages.", split_size)
 
         # If the doc is small enough, and we aren't slicing it with a page range:
         # do not split, just continue with the original request
         if split_size >= page_count and page_count == len(pdf.pages):
-            logger.info(
-                "Document has too few pages (%d) to be split efficiently. Partitioning without split.",
-                page_count,
-            )
             return request
 
         pages = self._get_pdf_pages(
@@ -329,7 +310,7 @@ def before_request(
         # Use a variable to adjust the httpx client timeout, or default to 60 minutes
         # When we're able to reuse the SDK to make these calls, we can remove this var
         # The SDK timeout will be controlled by parameter
-        client_timeout_minutes = 30
+        client_timeout_minutes = 60
         if timeout_var := os.getenv("UNSTRUCTURED_CLIENT_TIMEOUT_MINUTES"):
             client_timeout_minutes = int(timeout_var)
 
@@ -365,14 +346,8 @@ async def call_api_partial(pdf_chunk: Tuple[BinaryIO, int]):
 
         self.coroutines_to_execute[operation_id] = []
         set_index = 1
-        for pdf_chunk_file, page_index, all_pages_number in pages:
+        for pdf_chunk_file, page_index in pages:
             page_number = page_index + starting_page_number
-            logger.info(
-                "Partitioning set #%d (pages %d-%d).",
-                set_index,
-                page_number,
-                min(page_number + split_size - 1, all_pages_number),
-            )
 
             coroutine = call_api_partial((pdf_chunk_file, page_number))
             self.coroutines_to_execute[operation_id].append(coroutine)
diff --git a/src/unstructured_client/_version.py b/src/unstructured_client/_version.py
@@ -3,7 +3,7 @@
 import importlib.metadata
 
 __title__: str = "unstructured-client"
-__version__: str = "0.26.1"
+__version__: str = "0.26.2"
 
 try:
     if __package__ is not None:
diff --git a/src/unstructured_client/general.py b/src/unstructured_client/general.py
@@ -99,18 +99,20 @@ def partition(
             data = utils.unmarshal_json(http_res.text, errors.HTTPValidationErrorData)
             raise errors.HTTPValidationError(data=data)
         if utils.match_response(http_res, "4XX", "*"):
+            http_res_text = utils.stream_to_text(http_res)
             raise errors.SDKError(
-                "API error occurred", http_res.status_code, http_res.text, http_res
+                "API error occurred", http_res.status_code, http_res_text, http_res
             )
         if utils.match_response(http_res, "5XX", "application/json"):
             data = utils.unmarshal_json(http_res.text, errors.ServerErrorData)
             raise errors.ServerError(data=data)
 
         content_type = http_res.headers.get("Content-Type")
+        http_res_text = utils.stream_to_text(http_res)
         raise errors.SDKError(
             f"Unexpected response received (code: {http_res.status_code}, type: {content_type})",
             http_res.status_code,
-            http_res.text,
+            http_res_text,
             http_res,
         )
 
@@ -204,17 +206,19 @@ async def partition_async(
             data = utils.unmarshal_json(http_res.text, errors.HTTPValidationErrorData)
             raise errors.HTTPValidationError(data=data)
         if utils.match_response(http_res, "4XX", "*"):
+            http_res_text = await utils.stream_to_text_async(http_res)
             raise errors.SDKError(
-                "API error occurred", http_res.status_code, http_res.text, http_res
+                "API error occurred", http_res.status_code, http_res_text, http_res
             )
         if utils.match_response(http_res, "5XX", "application/json"):
             data = utils.unmarshal_json(http_res.text, errors.ServerErrorData)
             raise errors.ServerError(data=data)
 
         content_type = http_res.headers.get("Content-Type")
+        http_res_text = await utils.stream_to_text_async(http_res)
         raise errors.SDKError(
             f"Unexpected response received (code: {http_res.status_code}, type: {content_type})",
             http_res.status_code,
-            http_res.text,
+            http_res_text,
             http_res,
         )
diff --git a/src/unstructured_client/sdkconfiguration.py b/src/unstructured_client/sdkconfiguration.py
@@ -34,9 +34,9 @@ class SDKConfiguration:
     server: Optional[str] = ""
     language: str = "python"
     openapi_doc_version: str = "1.0.51"
-    sdk_version: str = "0.26.1"
-    gen_version: str = "2.438.3"
-    user_agent: str = "speakeasy-sdk/python 0.26.1 2.438.3 1.0.51 unstructured-client"
+    sdk_version: str = "0.26.2"
+    gen_version: str = "2.438.15"
+    user_agent: str = "speakeasy-sdk/python 0.26.2 2.438.15 1.0.51 unstructured-client"
     retry_config: OptionalNullable[RetryConfig] = Field(default_factory=lambda: UNSET)
     timeout_ms: Optional[int] = None
 
diff --git a/src/unstructured_client/utils/__init__.py b/src/unstructured_client/utils/__init__.py
@@ -27,6 +27,9 @@
     serialize_float,
     serialize_int,
     stream_to_text,
+    stream_to_text_async,
+    stream_to_bytes,
+    stream_to_bytes_async,
     validate_const,
     validate_decimal,
     validate_float,
@@ -80,6 +83,9 @@
     "serialize_request_body",
     "SerializedRequestBody",
     "stream_to_text",
+    "stream_to_text_async",
+    "stream_to_bytes",
+    "stream_to_bytes_async",
     "template_url",
     "unmarshal",
     "unmarshal_json",
diff --git a/src/unstructured_client/utils/serializers.py b/src/unstructured_client/utils/serializers.py
@@ -185,6 +185,18 @@ def stream_to_text(stream: httpx.Response) -> str:
     return "".join(stream.iter_text())
 
 
+async def stream_to_text_async(stream: httpx.Response) -> str:
+    return "".join([chunk async for chunk in stream.aiter_text()])
+
+
+def stream_to_bytes(stream: httpx.Response) -> bytes:
+    return stream.content
+
+
+async def stream_to_bytes_async(stream: httpx.Response) -> bytes:
+    return await stream.aread()
+
+
 def get_pydantic_model(data: Any, typ: Any) -> Any:
     if not _contains_pydantic_model(data):
         return unmarshal(data, typ)