@@ -9,13 +9,13 @@
 import tempfile
 import uuid
 from collections.abc import Awaitable
+from concurrent import futures
 from functools import partial
 from pathlib import Path
 from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO
 
 import aiofiles
 import httpx
-import nest_asyncio  # type: ignore
 from httpx import AsyncClient
 from pypdf import PdfReader, PdfWriter
 
@@ -56,6 +56,11 @@
 HI_RES_STRATEGY = 'hi_res'
 MAX_PAGE_LENGTH = 4000
 
+def _run_coroutines_in_separate_thread(
+    coroutines_task: Coroutine[Any, Any, list[tuple[int, httpx.Response]]],
+) -> list[tuple[int, httpx.Response]]:
+    return asyncio.run(coroutines_task)
+
 
 async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
     response = await coro
@@ -64,7 +69,8 @@ async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
 
 async def run_tasks(
     coroutines: list[partial[Coroutine[Any, Any, httpx.Response]]],
-    allow_failed: bool = False
+    allow_failed: bool = False,
+    concurrency_level: int = 10,
 ) -> list[tuple[int, httpx.Response]]:
     """Run a list of coroutines in parallel and return the results in order.
 
@@ -80,13 +86,14 @@ async def run_tasks(
     # Use a variable to adjust the httpx client timeout, or default to 30 minutes
     # When we're able to reuse the SDK to make these calls, we can remove this var
     # The SDK timeout will be controlled by parameter
+    limiter = asyncio.Semaphore(concurrency_level)
     client_timeout_minutes = 60
     if timeout_var := os.getenv("UNSTRUCTURED_CLIENT_TIMEOUT_MINUTES"):
         client_timeout_minutes = int(timeout_var)
     client_timeout = httpx.Timeout(60 * client_timeout_minutes)
 
     async with httpx.AsyncClient(timeout=client_timeout) as client:
-        armed_coroutines = [coro(async_client=client) for coro in coroutines]  # type: ignore
+        armed_coroutines = [coro(async_client=client, limiter=limiter) for coro in coroutines]  # type: ignore
         if allow_failed:
             responses = await asyncio.gather(*armed_coroutines, return_exceptions=False)
             return list(enumerate(responses, 1))
@@ -110,16 +117,6 @@ async def run_tasks(
         return sorted(results, key=lambda x: x[0])
 
 
-def context_is_uvloop():
-    """Return true if uvloop is installed and we're currently in a uvloop context. Our asyncio splitting code currently doesn't work under uvloop."""
-    try:
-        import uvloop  # type: ignore[import]  # pylint: disable=import-outside-toplevel
-        loop = asyncio.get_event_loop()
-        return isinstance(loop, uvloop.Loop)
-    except (ImportError, RuntimeError):
-        return False
-
-
 def get_optimal_split_size(num_pages: int, concurrency_level: int) -> int:
     """Distributes pages to workers evenly based on the number of pages and desired concurrency level."""
     if num_pages < MAX_PAGES_PER_SPLIT * concurrency_level:
@@ -163,8 +160,10 @@ def __init__(self) -> None:
         self.coroutines_to_execute: dict[
             str, list[partial[Coroutine[Any, Any, httpx.Response]]]
         ] = {}
+        self.concurrency_level: dict[str, int] = {}
         self.api_successful_responses: dict[str, list[httpx.Response]] = {}
         self.api_failed_responses: dict[str, list[httpx.Response]] = {}
+        self.executors: dict[str, futures.ThreadPoolExecutor] = {}
        self.tempdirs: dict[str, tempfile.TemporaryDirectory] = {}
         self.allow_failed: bool = DEFAULT_ALLOW_FAILED
         self.cache_tmp_data_feature: bool = DEFAULT_CACHE_TMP_DATA
@@ -264,14 +263,6 @@ def before_request(
             logger.warning("HTTP client not accessible! Continuing without splitting.")
             return request
 
-        if context_is_uvloop():
-            logger.warning("Splitting is currently incompatible with uvloop. Continuing without splitting.")
-            return request
-
-        # This allows us to use an event loop in an env with an existing loop
-        # Temporary fix until we can improve the async splitting behavior
-        nest_asyncio.apply()
-
         # This is our key into coroutines_to_execute
         # We need to pass it on to after_success so
         # we know which results are ours
@@ -317,13 +308,15 @@ def before_request(
             fallback_value=DEFAULT_ALLOW_FAILED,
         )
 
-        concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
+        self.concurrency_level[operation_id] = form_utils.get_split_pdf_concurrency_level_param(
             form_data,
             key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
             fallback_value=DEFAULT_CONCURRENCY_LEVEL,
             max_allowed=MAX_CONCURRENCY_LEVEL,
         )
-        limiter = asyncio.Semaphore(concurrency_level)
+
+        executor = futures.ThreadPoolExecutor(max_workers=1)
+        self.executors[operation_id] = executor
 
         self.cache_tmp_data_feature = form_utils.get_split_pdf_cache_tmp_data(
             form_data,
@@ -346,7 +339,7 @@ def before_request(
         page_count = page_range_end - page_range_start + 1
 
         split_size = get_optimal_split_size(
-            num_pages=page_count, concurrency_level=concurrency_level
+            num_pages=page_count, concurrency_level=self.concurrency_level[operation_id]
         )
 
         # If the doc is small enough, and we aren't slicing it with a page range:
@@ -389,7 +382,6 @@ def before_request(
             # in `after_success`.
             coroutine = partial(
                 self.call_api_partial,
-                limiter=limiter,
                 operation_id=operation_id,
                 pdf_chunk_request=pdf_chunk_request,
                 pdf_chunk_file=pdf_chunk_file,
@@ -607,10 +599,16 @@ def _await_elements(self, operation_id: str) -> Optional[list]:
         if tasks is None:
             return None
 
-        ioloop = asyncio.get_event_loop()
-        task_responses: list[tuple[int, httpx.Response]] = ioloop.run_until_complete(
-            run_tasks(tasks, allow_failed=self.allow_failed)
-        )
+        concurrency_level = self.concurrency_level.get(operation_id, DEFAULT_CONCURRENCY_LEVEL)
+        coroutines = run_tasks(tasks, allow_failed=self.allow_failed, concurrency_level=concurrency_level)
+
+        # sending the coroutines to a separate thread to avoid blocking the current event loop
+        # this operation should be removed when the SDK is updated to support async hooks
+        executor = self.executors.get(operation_id)
+        if executor is None:
+            raise RuntimeError("Executor not found for operation_id")
+        task_responses_future = executor.submit(_run_coroutines_in_separate_thread, coroutines)
+        task_responses = task_responses_future.result()
 
         if task_responses is None:
             return None
@@ -714,6 +712,10 @@ def _clear_operation(self, operation_id: str) -> None:
         """
         self.coroutines_to_execute.pop(operation_id, None)
         self.api_successful_responses.pop(operation_id, None)
+        self.concurrency_level.pop(operation_id, None)
+        executor = self.executors.pop(operation_id, None)
+        if executor is not None:
+            executor.shutdown(wait=True)
         tempdir = self.tempdirs.pop(operation_id, None)
         if tempdir:
             tempdir.cleanup()
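
Why this change: the removed code called loop.run_until_complete() from inside the caller's already-running event loop, which is exactly the case nest_asyncio monkey-patched and uvloop never supported. A minimal sketch of that failure mode, using only the standard library (hook_caller is an illustrative name, not SDK code):

import asyncio

async def hook_caller() -> None:
    loop = asyncio.get_running_loop()
    try:
        # The removed pattern: re-entering a loop that is already running
        # raises RuntimeError, which nest_asyncio used to patch away.
        loop.run_until_complete(asyncio.sleep(0))
    except RuntimeError as err:
        print(err)  # e.g. "This event loop is already running"

asyncio.run(hook_caller())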
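And a minimal sketch of the replacement pattern this diff adopts: throttle the coroutines with an asyncio.Semaphore and run them via asyncio.run() on a single-worker ThreadPoolExecutor, so a fresh event loop lives in its own thread and never collides with the caller's loop (uvloop or otherwise). fake_request, run_all, and run_in_thread are hypothetical stand-ins, not SDK functions:

import asyncio
from concurrent import futures

async def fake_request(i: int) -> str:
    # Stand-in for an httpx call; sleep simulates network latency.
    await asyncio.sleep(0.1)
    return f"response {i}"

async def run_all(n: int, concurrency_level: int = 10) -> list[str]:
    # Caps in-flight coroutines, mirroring `limiter` in run_tasks().
    limiter = asyncio.Semaphore(concurrency_level)

    async def limited(i: int) -> str:
        async with limiter:
            return await fake_request(i)

    # gather() preserves submission order in its results.
    return list(await asyncio.gather(*(limited(i) for i in range(n))))

def run_in_thread(coro) -> list[str]:
    # asyncio.run() builds a brand-new event loop in the worker thread,
    # so it cannot collide with a loop already running in the caller.
    return asyncio.run(coro)

with futures.ThreadPoolExecutor(max_workers=1) as executor:
    results = executor.submit(run_in_thread, run_all(25)).result()
    print(results[:3])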