chore: fix lint and type-check errors (unused imports, type annotations)

pawel-kmiecik · pawel-kmiecik · commit 49e3c8e76302 · 2024-11-05T16:05:01.000+01:00
diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py
@@ -161,8 +161,8 @@ def get_split_pdf_cache_tmp_data(
     return cache_tmp_data.lower() == "true"
 
 def get_split_pdf_cache_tmp_data_dir(
-    form_data: FormData, key: str, fallback_value: Path | str,
-) -> Path | str:
+    form_data: FormData, key: str, fallback_value: str,
+) -> str:
     """Retrieves the value for cache tmp data dir that should be used for splitting pdf.
 
     In case given the number is not a "false" or "true" literal, it will use the
@@ -178,21 +178,19 @@ def get_split_pdf_cache_tmp_data_dir(
     """
     cache_tmp_data_dir = form_data.get(key)
 
-    if not isinstance(cache_tmp_data_dir, str) and not isinstance(cache_tmp_data_dir, Path):
+    if not isinstance(cache_tmp_data_dir, str):
         return fallback_value
+    cache_tmp_data_path = Path(cache_tmp_data_dir)
 
-    if isinstance(cache_tmp_data_dir, str):
-        cache_tmp_data_dir = Path(cache_tmp_data_dir)
-
-    if not cache_tmp_data_dir.exists():
+    if not cache_tmp_data_path.exists():
         logger.warning(
             "'%s' does not exist. Using default value '%s'.",
             key,
             fallback_value,
         )
         return fallback_value
 
-    return cache_tmp_data_dir.resolve()
+    return str(cache_tmp_data_path.resolve())
 
 
 def get_split_pdf_concurrency_level_param(
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -8,7 +8,6 @@
 from pypdf.errors import PdfReadError
 
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
-from unstructured_client.models import shared
 
 logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
 
diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py
@@ -4,11 +4,10 @@
 import io
 import json
 import logging
-from typing import Tuple, Any, BinaryIO, cast, IO
+from typing import Tuple, Any, BinaryIO
 
 import httpx
 from httpx._multipart import DataField, FileField
-from requests_toolbelt.multipart.encoder import MultipartEncoder  # type: ignore
 
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
 from unstructured_client._hooks.custom.form_utils import (
@@ -45,7 +44,7 @@ def get_multipart_stream_fields(request: httpx.Request) -> dict[str, Any]:
         return {}
     fields = request.stream.fields
 
-    mapped_fields = {}
+    mapped_fields: dict[str, Any] = {}
     for field in fields:
         if isinstance(field, DataField):
             if "[]" in field.name:
@@ -114,7 +113,7 @@ def create_pdf_chunk_request(
     data = create_pdf_chunk_request_params(form_data, page_number)
     original_headers = prepare_request_headers(original_request.headers)
 
-    pdf_chunk_content = (
+    pdf_chunk_content: BinaryIO | bytes = (
         pdf_chunk_file.getvalue()
         if isinstance(pdf_chunk_file, io.BytesIO)
         else pdf_chunk_file
@@ -135,6 +134,8 @@ def create_pdf_chunk_request(
         "multipart",
         shared.PartitionParameters,
     )
+    if serialized_body is None:
+        raise ValueError("Failed to serialize the request body.")
     return httpx.Request(
         method="POST",
         url=original_request.url or "",
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -11,15 +11,13 @@
 from collections.abc import Awaitable
 from functools import partial
 from pathlib import Path
-from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO, Callable
+from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO
 
 import aiofiles
 import httpx
 import nest_asyncio  # type: ignore
 from httpx import AsyncClient
 from pypdf import PdfReader, PdfWriter
-from requests_toolbelt.multipart.decoder import MultipartDecoder  # type: ignore
-from unstructured.chunking.dispatch import chunk
 
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
@@ -60,7 +58,7 @@ async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Respons
 
 
 async def run_tasks(
-    coroutines: list[Callable[[AsyncClient], Coroutine]],
+    coroutines: list[partial[Coroutine[Any, Any, httpx.Response]]],
     allow_failed: bool = False
 ) -> list[tuple[int, httpx.Response]]:
     """Run a list of coroutines in parallel and return the results in order.
@@ -83,7 +81,7 @@ async def run_tasks(
     client_timeout = httpx.Timeout(60 * client_timeout_minutes)
 
     async with httpx.AsyncClient(timeout=client_timeout) as client:
-        armed_coroutines = [coro(async_client=client) for coro in coroutines]
+        armed_coroutines = [coro(async_client=client) for coro in coroutines] # type: ignore
         if allow_failed:
             responses = await asyncio.gather(*armed_coroutines, return_exceptions=False)
             return list(enumerate(responses, 1))
@@ -157,12 +155,14 @@ def __init__(self) -> None:
         self.base_url: Optional[str] = None
         self.async_client: Optional[AsyncHttpClient] = None
         self.coroutines_to_execute: dict[
-            str, list[Coroutine[Any, Any, httpx.Response]]
+            str, list[partial[Coroutine[Any, Any, httpx.Response]]]
         ] = {}
         self.api_successful_responses: dict[str, list[httpx.Response]] = {}
         self.api_failed_responses: dict[str, list[httpx.Response]] = {}
         self.tempdirs: dict[str, tempfile.TemporaryDirectory] = {}
         self.allow_failed: bool = DEFAULT_ALLOW_FAILED
+        self.cache_tmp_data_feature: bool = DEFAULT_CACHE_TMP_DATA
+        self.cache_tmp_data_dir: str = DEFAULT_CACHE_TMP_DATA_DIR
 
     def sdk_init(
             self, base_url: str, client: HttpClient
@@ -266,15 +266,7 @@ def before_request(
         form_data = request_utils.get_multipart_stream_fields(request)
         if not form_data:
             return request
-        # For future - avoid reading the request content as it might issue
-        # OOM errors for large files. Instead, the `stream` (MultipartStream) parameter
-        # should be used which contains the list of DataField or FileField objects
-        # request_content = request.read()
-        # request_body = request_content
 
-
-        # decoded_body = MultipartDecoder(request_body, content_type)
-        # form_data = form_utils.parse_form_data(decoded_body)
         split_pdf_page = form_data.get(PARTITION_FORM_SPLIT_PDF_PAGE_KEY)
         if split_pdf_page is None or split_pdf_page == "false":
             return request
@@ -505,7 +497,7 @@ def _get_pdf_chunk_paths(
         )
         self.tempdirs[operation_id] = tempdir
         tempdir_path = Path(tempdir.name)
-        pdf_chunk_paths = []
+        pdf_chunk_paths: list[Tuple[Path, int]] = []
         chunk_no = 0
         while offset < offset_end:
             chunk_no += 1
@@ -517,7 +509,7 @@ def _get_pdf_chunk_paths(
                 new_pdf.add_page(page)
             with open(tempdir_path / f"chunk_{chunk_no}.pdf", "wb") as pdf_chunk:
                 new_pdf.write(pdf_chunk)
-                pdf_chunk_paths.append((pdf_chunk.name, offset))
+                pdf_chunk_paths.append((Path(pdf_chunk.name), offset))
             offset += split_size
         return pdf_chunk_paths