1111from collections .abc import Awaitable
1212from functools import partial
1313from pathlib import Path
14- from typing import Any , Coroutine , Optional , Tuple , Union , cast , Generator , BinaryIO , Callable
14+ from typing import Any , Coroutine , Optional , Tuple , Union , cast , Generator , BinaryIO
1515
1616import aiofiles
1717import httpx
1818import nest_asyncio # type: ignore
1919from httpx import AsyncClient
2020from pypdf import PdfReader , PdfWriter
21- from requests_toolbelt .multipart .decoder import MultipartDecoder # type: ignore
22- from unstructured .chunking .dispatch import chunk
2321
2422from unstructured_client ._hooks .custom import form_utils , pdf_utils , request_utils
2523from unstructured_client ._hooks .custom .common import UNSTRUCTURED_CLIENT_LOGGER_NAME
@@ -60,7 +58,7 @@ async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Respons
6058
6159
6260async def run_tasks (
63- coroutines : list [Callable [[ AsyncClient ], Coroutine ]],
61+ coroutines : list [partial [ Coroutine [ Any , Any , httpx . Response ] ]],
6462 allow_failed : bool = False
6563) -> list [tuple [int , httpx .Response ]]:
6664 """Run a list of coroutines in parallel and return the results in order.
@@ -83,7 +81,7 @@ async def run_tasks(
8381 client_timeout = httpx .Timeout (60 * client_timeout_minutes )
8482
8583 async with httpx .AsyncClient (timeout = client_timeout ) as client :
86- armed_coroutines = [coro (async_client = client ) for coro in coroutines ]
84+ armed_coroutines = [coro (async_client = client ) for coro in coroutines ] # type: ignore
8785 if allow_failed :
8886 responses = await asyncio .gather (* armed_coroutines , return_exceptions = False )
8987 return list (enumerate (responses , 1 ))
@@ -157,12 +155,14 @@ def __init__(self) -> None:
157155 self .base_url : Optional [str ] = None
158156 self .async_client : Optional [AsyncHttpClient ] = None
159157 self .coroutines_to_execute : dict [
160- str , list [Coroutine [Any , Any , httpx .Response ]]
158+ str , list [partial [ Coroutine [Any , Any , httpx .Response ] ]]
161159 ] = {}
162160 self .api_successful_responses : dict [str , list [httpx .Response ]] = {}
163161 self .api_failed_responses : dict [str , list [httpx .Response ]] = {}
164162 self .tempdirs : dict [str , tempfile .TemporaryDirectory ] = {}
165163 self .allow_failed : bool = DEFAULT_ALLOW_FAILED
164+ self .cache_tmp_data_feature : bool = DEFAULT_CACHE_TMP_DATA
165+ self .cache_tmp_data_dir : str = DEFAULT_CACHE_TMP_DATA_DIR
166166
167167 def sdk_init (
168168 self , base_url : str , client : HttpClient
@@ -266,15 +266,7 @@ def before_request(
266266 form_data = request_utils .get_multipart_stream_fields (request )
267267 if not form_data :
268268 return request
269- # For future - avoid reading the request content as it might issue
270- # OOM errors for large files. Instead, the `stream` (MultipartStream) parameter
271- # should be used which contains the list of DataField or FileField objects
272- # request_content = request.read()
273- # request_body = request_content
274269
275-
276- # decoded_body = MultipartDecoder(request_body, content_type)
277- # form_data = form_utils.parse_form_data(decoded_body)
278270 split_pdf_page = form_data .get (PARTITION_FORM_SPLIT_PDF_PAGE_KEY )
279271 if split_pdf_page is None or split_pdf_page == "false" :
280272 return request
@@ -505,7 +497,7 @@ def _get_pdf_chunk_paths(
505497 )
506498 self .tempdirs [operation_id ] = tempdir
507499 tempdir_path = Path (tempdir .name )
508- pdf_chunk_paths = []
500+ pdf_chunk_paths : list [ Tuple [ Path , int ]] = []
509501 chunk_no = 0
510502 while offset < offset_end :
511503 chunk_no += 1
@@ -517,7 +509,7 @@ def _get_pdf_chunk_paths(
517509 new_pdf .add_page (page )
518510 with open (tempdir_path / f"chunk_{ chunk_no } .pdf" , "wb" ) as pdf_chunk :
519511 new_pdf .write (pdf_chunk )
520- pdf_chunk_paths .append ((pdf_chunk .name , offset ))
512+ pdf_chunk_paths .append ((Path ( pdf_chunk .name ) , offset ))
521513 offset += split_size
522514 return pdf_chunk_paths
523515
0 commit comments