diff --git a/.gitignore b/.gitignore index 6de83ed8..353e453e 100755 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,12 @@ __pycache__/ .idea/ openapi.json openapi_client.json + +# Environments +.env +.envrc +.venv* +venv* +env/ +ENV/ +env.bak/ diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index e1cc73e0..fdfd8b8e 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -2,6 +2,7 @@ import tempfile from pathlib import Path +from typing import Literal import httpx import json @@ -15,6 +16,7 @@ from unstructured_client import UnstructuredClient from unstructured_client.models import shared, operations from unstructured_client.models.errors import HTTPValidationError +from unstructured_client.models.shared.partition_parameters import Strategy from unstructured_client.utils.retries import BackoffStrategy, RetryConfig from unstructured_client._hooks.custom import form_utils from unstructured_client._hooks.custom import split_pdf_hook @@ -105,19 +107,22 @@ def test_integration_split_pdf_has_same_output_as_non_split( ) assert len(diff) == 0 -@pytest.mark.parametrize( ("filename", "expected_ok", "strategy"), [ - ("_sample_docs/layout-parser-paper.pdf", True, "hi_res"), # 16 -]# pages -) -@pytest.mark.parametrize( ("use_caching", "cache_dir"), [ + +@pytest.mark.parametrize(("filename", "expected_ok", "strategy"), [ + ("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES), # 16 pages +]) +@pytest.mark.parametrize(("use_caching", "cache_dir"), [ (True, None), # Use default cache dir (True, Path(tempfile.gettempdir()) / "test_integration_unstructured_client1"), # Use custom cache dir (False, None), # Don't use caching (False, Path(tempfile.gettempdir()) / "test_integration_unstructured_client2"), # Don't use caching, use custom cache dir ]) def test_integration_split_pdf_with_caching( - filename: str, expected_ok: bool, strategy: str, use_caching: bool, - cache_dir: Path | None + filename: str, + expected_ok: bool, + strategy: Literal[Strategy.HI_RES], + use_caching: bool, + cache_dir: Path | None, ): try: response = requests.get("http://localhost:8000/general/docs") @@ -140,10 +145,9 @@ def test_integration_split_pdf_with_caching( parameters = shared.PartitionParameters( files=files, strategy=strategy, - languages=["eng"], split_pdf_page=True, split_pdf_cache_tmp_data=use_caching, - split_pdf_cache_dir=cache_dir, + split_pdf_cache_tmp_data_dir=str(cache_dir), ) req = operations.PartitionRequest( @@ -185,6 +189,7 @@ def test_integration_split_pdf_with_caching( if cache_dir: assert not Path(cache_dir).exists() + @pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"]) def test_long_pages_hi_res(filename): req = operations.PartitionRequest(partition_parameters=shared.PartitionParameters( diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index adc743a8..12792c69 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -1,25 +1,26 @@ from __future__ import annotations import asyncio -import io -import logging from asyncio import Task from collections import Counter from functools import partial -from typing import Coroutine +from pathlib import Path +from unittest.mock import MagicMock, patch import httpx import pytest import requests -from requests_toolbelt import MultipartDecoder, MultipartEncoder +from requests_toolbelt import MultipartDecoder from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.form_utils import ( + FormData, PARTITION_FORM_CONCURRENCY_LEVEL_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, PARTITION_FORM_PAGE_RANGE_KEY, ) from unstructured_client._hooks.custom.split_pdf_hook import ( + DEFAULT_CACHE_TMP_DATA_DIR, DEFAULT_CONCURRENCY_LEVEL, DEFAULT_STARTING_PAGE_NUMBER, MAX_CONCURRENCY_LEVEL, @@ -434,3 +435,30 @@ async def test_remaining_tasks_cancelled_when_fails_disallowed(): await asyncio.sleep(1) print("Cancelled amount: ", cancelled_counter["cancelled"]) assert len(tasks) > cancelled_counter["cancelled"] > 0 + + +@patch("unstructured_client._hooks.custom.form_utils.Path") +def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path: MagicMock): + """Test get_split_pdf_cache_tmp_data_dir uses the directory from the form data.""" + # -- Create the form_data + dir_key = form_utils.PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY # -- "split_pdf_cache_tmp_data_dir" + mock_dir = "/mock/dir" + form_data: FormData = {dir_key: mock_dir} + + # -- Mock the Path object in form_utils + mock_path_instance = MagicMock() + mock_path.return_value = mock_path_instance + mock_path_instance.exists.return_value = True + mock_path_instance.resolve.return_value = Path(mock_dir) + + result = form_utils.get_split_pdf_cache_tmp_data_dir( + form_data = form_data, + key=dir_key, + fallback_value=DEFAULT_CACHE_TMP_DATA_DIR # -- tempfile.gettempdir() + ) + + assert dir_key == "split_pdf_cache_tmp_data_dir" + assert form_data.get(dir_key) == "/mock/dir" + mock_path.assert_called_once_with(mock_dir) + mock_path_instance.exists.assert_called_once() + assert result == str(Path(mock_dir).resolve()) diff --git a/_test_unstructured_client/unit_utils.py b/_test_unstructured_client/unit_utils.py index b85ad8c5..b1b00da4 100644 --- a/_test_unstructured_client/unit_utils.py +++ b/_test_unstructured_client/unit_utils.py @@ -2,6 +2,7 @@ from __future__ import annotations +import pathlib from typing import Any from unittest.mock import ( ANY, @@ -31,6 +32,12 @@ "property_mock", ) +def sample_docs_path(file_name: str) -> str: + """Resolve the absolute-path to `file_name` in the sample-docs directory.""" + sample_docs_dir = pathlib.Path(__file__).parent.parent / "_sample_docs" + file_path = sample_docs_dir / file_name + return str(file_path.resolve()) + # ------------------------------------------------------------------------------------------------ # MOCKING FIXTURES diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 1f1e536b..f79e9e24 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -25,9 +25,11 @@ PARTITION_FORM_CONCURRENCY_LEVEL_KEY, PARTITION_FORM_FILES_KEY, PARTITION_FORM_PAGE_RANGE_KEY, - PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY, + PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY, PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, - PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY, + PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, ) from unstructured_client._hooks.types import ( AfterErrorContext, @@ -315,7 +317,7 @@ def before_request( self.cache_tmp_data_dir = form_utils.get_split_pdf_cache_tmp_data_dir( form_data, - key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY, + key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY, fallback_value=DEFAULT_CACHE_TMP_DATA_DIR, ) @@ -546,7 +548,7 @@ def _get_pdf_chunk_paths( return pdf_chunk_paths def _get_pdf_chunk_files( - self, pdf_chunks: list[Tuple[Path, int]] + self, pdf_chunks: list[Tuple[Path, int]] ) -> Generator[Tuple[BinaryIO, int], None, None]: """Yields the file objects for the given pdf chunk paths. @@ -573,8 +575,7 @@ def _get_pdf_chunk_files( raise yield pdf_chunk_file, offset - def _await_elements( - self, operation_id: str) -> Optional[list]: + def _await_elements(self, operation_id: str) -> Optional[list]: """ Waits for the partition requests to complete and returns the flattened elements.