|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
| 3 | +import tempfile |
| 4 | +from pathlib import Path |
| 5 | + |
3 | 6 | import httpx |
4 | 7 | import json |
5 | 8 | import pytest |
@@ -102,6 +105,87 @@ def test_integration_split_pdf_has_same_output_as_non_split( |
102 | 105 | ) |
103 | 106 | assert len(diff) == 0 |
104 | 107 |
|
@pytest.mark.parametrize(
    ("filename", "expected_ok", "strategy"),
    [
        ("_sample_docs/layout-parser-paper.pdf", True, "hi_res"),  # 16 pages
    ],
)
@pytest.mark.parametrize(
    ("use_caching", "cache_dir"),
    [
        (True, None),  # Use default cache dir
        (True, Path(tempfile.gettempdir()) / "test_integration_unstructured_client1"),  # Use custom cache dir
        (False, None),  # Don't use caching
        (False, Path(tempfile.gettempdir()) / "test_integration_unstructured_client2"),  # Don't use caching, use custom cache dir
    ],
)
def test_integration_split_pdf_with_caching(
    filename: str,
    expected_ok: bool,
    strategy: str,
    use_caching: bool,
    cache_dir: Path | None,
):
    """Check that split-PDF partitioning matches non-split output for each caching setup.

    The same document is partitioned twice against a local API at
    ``localhost:8000``: first with ``split_pdf_page=True`` (together with the
    parametrized ``split_pdf_cache_tmp_data`` / ``split_pdf_cache_dir``
    values), then with ``split_pdf_page=False``. The two responses must agree
    on element count, content type, status code and element contents, ignoring
    ``element_id`` and ``metadata.parent_id``.

    Args:
        filename: Path of the document to upload.
        expected_ok: Whether partitioning is expected to succeed. When False,
            ``.pdf`` is appended to the uploaded file name and an
            "invalid PDF" error is accepted as the passing outcome.
        strategy: Partition strategy forwarded to the API.
        use_caching: Value passed as ``split_pdf_cache_tmp_data``.
        cache_dir: Value passed as ``split_pdf_cache_dir``; when given, the
            directory must not exist once the test has finished.
    """
    api_down_message = "The unstructured-api is not running on localhost:8000"
    try:
        response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        # pytest.fail is explicit and, unlike `assert False`, survives `python -O`.
        pytest.fail(api_down_message)
    assert response.status_code == 200, api_down_message

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    if not expected_ok:
        # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
        files.file_name += ".pdf"

    parameters = shared.PartitionParameters(
        files=files,
        strategy=strategy,
        languages=["eng"],
        split_pdf_page=True,
        split_pdf_cache_tmp_data=use_caching,
        split_pdf_cache_dir=cache_dir,
    )

    req = operations.PartitionRequest(partition_parameters=parameters)

    try:
        resp_split = client.general.partition(request=req)
    except (HTTPValidationError, AttributeError) as exc:
        if expected_ok:
            # Unexpected failure: re-raise so the report shows the real error
            # and traceback instead of an opaque `assert exc is None`.
            raise
        assert "File does not appear to be a valid PDF" in str(exc)
        return

    # Second pass: same parameters, but without client-side page splitting.
    parameters.split_pdf_page = False

    req = operations.PartitionRequest(partition_parameters=parameters)

    resp_single = client.general.partition(request=req)

    assert len(resp_split.elements) == len(resp_single.elements)
    assert resp_split.content_type == resp_single.content_type
    assert resp_split.status_code == resp_single.status_code

    diff = DeepDiff(
        t1=resp_split.elements,
        t2=resp_single.elements,
        # These ids are excluded from the comparison between the two runs.
        exclude_regex_paths=[
            r"root\[\d+\]\['metadata'\]\['parent_id'\]",
            r"root\[\d+\]\['element_id'\]",
        ],
    )
    assert len(diff) == 0

    # make sure the cache dir was cleaned if passed explicitly
    if cache_dir is not None:
        assert not cache_dir.exists(), f"cache dir {cache_dir} was not cleaned up"
| 187 | + |
| 188 | + |
105 | 189 |
|
106 | 190 | def test_integration_split_pdf_for_file_with_no_name(): |
107 | 191 | """ |
|
0 commit comments