Skip to content

Commit ee8b0f9

Browse files
feat: pass list type parameters via client sdk (#2567)
The purpose of this PR is to support using the same type of parameters as `partition_*()` when using `partition_via_api()`. This PR works together with `unstructured-api` [PR #368](Unstructured-IO/unstructured-api#368). **Note:** This PR will support extracting image blocks ("Image", "Table") via `partition_via_api()`. ### Summary - update `partition_via_api()` to convert all list type parameters to JSON formatted strings before passing them to the unstructured client SDK - add a unit test function to test extracting image blocks via `partition_via_api()` - add a unit test function to test list type parameters passed to API via unstructured client sdk ### Testing ``` from unstructured.partition.api import partition_via_api elements = partition_via_api( filename="example-docs/embedded-images-tables.pdf", api_key="YOUR-API-KEY", strategy="hi_res", extract_image_block_types=["image", "table"], ) image_block_elements = [el for el in elements if el.category == "Image" or el.category == "Table"] print("\n\n".join([el.metadata.image_mime_type for el in image_block_elements])) print("\n\n".join([el.metadata.image_base64 for el in image_block_elements])) ```
1 parent 8f78538 commit ee8b0f9

File tree

3 files changed

+57
-1
lines changed

3 files changed

+57
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
### Fixes
1414

15+
* **Fix passing list type parameters when calling unstructured API via `partition_via_api()`** Update `partition_via_api()` to convert all list type parameters to JSON formatted strings before calling the unstructured client SDK. This will support image block extraction via `partition_via_api()`.
1516
* **Add OctoAI embedder** Adds support for embeddings via OctoAI.
1617
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors**
1718
* **Fix: don't treat plain text files with double quotes as JSON** If a file can be deserialized as JSON but deserializes to a string, treat it as plain text even though it's valid JSON.

test_unstructured/partition/test_api.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import base64
12
import contextlib
23
import json
34
import os
@@ -8,7 +9,7 @@
89
import requests
910
from unstructured_client.general import General
1011

11-
from unstructured.documents.elements import NarrativeText
12+
from unstructured.documents.elements import ElementType, NarrativeText
1213
from unstructured.partition.api import partition_multiple_via_api, partition_via_api
1314

1415
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -210,6 +211,52 @@ def test_partition_via_api_valid_request_data_kwargs():
210211
assert isinstance(elements, list)
211212

212213

214+
def test_partition_via_api_image_block_extraction():
215+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf")
216+
elements = partition_via_api(
217+
filename=filename,
218+
strategy="hi_res",
219+
extract_image_block_types=["image", "table"],
220+
api_key=get_api_key(),
221+
)
222+
image_elements = [el for el in elements if el.category == ElementType.IMAGE]
223+
for el in image_elements:
224+
assert el.metadata.image_base64 is not None
225+
assert el.metadata.image_mime_type is not None
226+
image_data = base64.b64decode(el.metadata.image_base64)
227+
assert isinstance(image_data, bytes)
228+
229+
230+
def test_partition_via_api_pass_list_type_parameters(monkeypatch):
231+
mock_request = Mock(return_value=MockResponse(status_code=200))
232+
monkeypatch.setattr(requests.Session, "request", mock_request)
233+
234+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf")
235+
236+
partition_via_api(
237+
filename=filename,
238+
strategy="hi_res",
239+
extract_image_block_types=["image", "table"],
240+
skip_infer_table_types=["pdf", "docx"],
241+
languages=["eng"],
242+
)
243+
244+
mock_request.assert_called_with(
245+
"POST",
246+
ANY,
247+
data=ANY,
248+
files=[
249+
["extract_image_block_types", [None, '["image", "table"]']],
250+
["files", ANY],
251+
["languages", [None, '["eng"]']],
252+
["skip_infer_table_types", [None, '["pdf", "docx"]']],
253+
["strategy", [None, "hi_res"]],
254+
],
255+
headers=ANY,
256+
params=ANY,
257+
)
258+
259+
213260
# Note(austin) - This test is way too noisy against the hosted api
214261
# def test_partition_via_api_invalid_request_data_kwargs():
215262
# filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")

unstructured/partition/api.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import contextlib
2+
import json
23
from typing import (
34
IO,
45
List,
@@ -88,6 +89,13 @@ def partition_via_api(
8889
file_name=metadata_filename,
8990
)
9091

92+
# NOTE(christine): Converts all list type parameters to JSON formatted strings
93+
# (e.g. ["image", "table"] -> '["image", "table"]')
94+
# This can be removed if "speakeasy" supports passing list type parameters to FastAPI.
95+
for k, v in request_kwargs.items():
96+
if isinstance(v, list):
97+
request_kwargs[k] = json.dumps(v)
98+
9199
req = shared.PartitionParameters(
92100
files=files,
93101
**request_kwargs,

0 commit comments

Comments
 (0)