Skip to content

Commit 023307f

Browse files
feat: support list type parameters (#368)
The purpose of this PR is to support parsing all list type parameters, including `extract_image_block_types`, when calling the unstructured API via the unstructured client SDK (Python/JS) generated by `speakeasy`. Currently, `speakeasy` doesn't generate proper client code to pass list type parameters to the unstructured API because it does not intend to support client code specific to `FastAPI`, which the unstructured API relies on. To address this issue, I updated the unstructured API code to parse all list type parameters passed as JSON-formatted lists (e.g. `'["image", "table"]'`). **NOTE:** You must pass the list type parameter as a JSON-formatted list when calling the unstructured API via the unstructured client SDK (e.g. `extract_image_block_types = '["image", "table"]'`, `skip_infer_table_types='["docx", "xlsx"]'`...). ### Summary - update `SmartValueParser.value_or_first_element()` to parse a JSON-formatted string (e.g. `'["image", "table"]'`) that is convertible to a list - apply `SmartValueParser.value_or_first_element()` to all list type parameters - remove existing `extract_image_block_types` parsing logic ### Testing - via unstructured_client_sdk (Python) ``` s = UnstructuredClient( server_url="http://localhost:8000/general/v0/general", api_key_auth="YOUR-API-KEY" ) filename = "sample-docs/embedded-images-tables.pdf" with open(filename, "rb") as f: # Note that this currently only supports a single file files = shared.Files( content=f.read(), file_name=filename, ) req = shared.PartitionParameters( files=files, # Other partition params strategy="hi_res", extract_image_block_types='["image", "table"]', languages=["pdf"], ) try: resp = s.general.partition(req) print([el.get("metadata").get("image_mime_type") for el in resp.elements if el.get("metadata").get("image_mime_type")]) except SDKError as e: print(e) ``` - via unstructured_client_sdk (JS) ``` import { UnstructuredClient } from "unstructured-client"; import * as fs from "fs"; const key = "YOUR-API-KEY"; const client = new 
UnstructuredClient({ serverURL: "http://localhost:8000", security: { apiKeyAuth: key, }, }); const filename = "sample-docs/embedded-images-tables.pdf"; const data = fs.readFileSync(filename); client.general.partition({ // Note that this currently only supports a single file files: { content: data, fileName: filename, }, // Other partition params strategy: "hi_res", extractImageBlockTypes: '["image", "table"]', }).then((res) => { if (res.statusCode == 200) { console.log(res.elements); } }).catch((e) => { console.log(e.statusCode); console.log(e.body); }); ``` - via default `requests` client (Python) ``` url = "http://localhost:8000/general/v0/general" headers = { 'accept': 'application/json', 'unstructured-api-key': "YOUR-API-KEY" } data = { "strategy": "hi_res", "extract_image_block_types": ["Image", "Table"], } filename = "sample-docs/embedded-images-tables.pdf" file_data = {'files': open(filename, 'rb')} response = requests.post(url, headers=headers, data=data, files=file_data) file_data['files'].close() elements = response.json() print([el.get("metadata").get("image_mime_type") for el in elements if el.get("metadata").get("image_mime_type")]) ``` - via `curl` command ``` curl -X 'POST' \ 'http://localhost:8000/general/v0/general' \ -H 'accept: application/json' \ -H 'Content-Type: multipart/form-data' \ -F 'files=@sample-docs/embedded-images-tables.pdf' \ -F 'strategy=hi_res' \ -F 'extract_image_block_types=["image", "table"]' \ | jq -C . | less -R ```
1 parent db7780c commit 023307f

File tree

3 files changed

+21
-11
lines changed

3 files changed

+21
-11
lines changed

prepline_general/api/general.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -359,16 +359,6 @@ def pipeline_api(
359359

360360
ocr_languages_str = "+".join(ocr_languages) if ocr_languages and len(ocr_languages) else None
361361

362-
if extract_image_block_types:
363-
try:
364-
# Handle the case when the user passes the table of strings as a json inside the
365-
# first element of the array
366-
loaded_array = json.loads(extract_image_block_types[0])
367-
if isinstance(loaded_array, list):
368-
extract_image_block_types = loaded_array
369-
except (json.JSONDecodeError, IndexError):
370-
pass # noqa
371-
372362
extract_image_block_to_payload = bool(extract_image_block_types)
373363

374364
try:

prepline_general/api/models/form_params.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def as_form(
5252
description="The languages present in the document, for use in partitioning and/or OCR",
5353
example="[eng]",
5454
),
55+
BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
5556
] = [], # noqa
5657
ocr_languages: Annotated[
5758
List[str],
@@ -149,6 +150,7 @@ def as_form(
149150
description="The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields",
150151
example="""["image", "table"]""",
151152
),
153+
BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
152154
] = [], # noqa
153155
# -- chunking options --
154156
chunking_strategy: Annotated[

prepline_general/api/utils.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import TypeVar, Union, List, Optional, Generic, get_origin, get_args, Type, Any
1+
import json
2+
from typing import TypeVar, Union, List, Optional, Generic, get_origin, get_args, Type, Any, Tuple
23

34
T = TypeVar("T")
45
E = TypeVar("E")
@@ -37,6 +38,19 @@ def _return_cast_first_element(values: list[E], origin_class: type) -> E | None:
3738
return value
3839

3940

41+
def is_convertible_to_list(s: str) -> Tuple[bool, Union[List, str]]:
    """Report whether ``s`` parses as a JSON array, returning the parsed list if it does.

    Args:
        s: Text to parse, e.g. ``'["image", "table"]'``.

    Returns:
        ``(True, parsed_list)`` when ``s`` is valid JSON whose top-level value is a list;
        otherwise ``(False, reason)`` where ``reason`` is a short explanatory message.
        The function never raises for malformed input.
    """
    try:
        parsed = json.loads(s)
    except (json.JSONDecodeError, TypeError, RecursionError):
        # JSONDecodeError: the text is not valid JSON.
        # TypeError: ``s`` is not str/bytes/bytearray (e.g. None).
        # RecursionError: pathologically deep nesting such as "[" * 100_000.
        # All three mean "not convertible"; none should escape to the caller.
        return False, "Input is not valid JSON."

    if isinstance(parsed, list):
        return True, parsed  # Hand back the parsed list so callers need not parse twice
    return False, "Input is valid JSON but not a list."
52+
53+
4054
class SmartValueParser(Generic[T]):
4155
"""Class handle api parameters that are passed in form of a specific value or as a list of strings from which
4256
the first element is used, cast to a proper type
@@ -58,6 +72,10 @@ def value_or_first_element(self, value: Union[T, list[T]]) -> list[T] | T | None
5872
extracted_value: T | None = _return_cast_first_element(value, origin_class)
5973
return extracted_value
6074
elif isinstance(value, list) and origin_class == list and container_elems_class:
75+
if len(value) == 1:
76+
is_list, result = is_convertible_to_list(str(value[0]))
77+
new_value = result if is_list else value
78+
return [_cast_to_type(elem, container_elems_class) for elem in new_value]
6179
return [_cast_to_type(elem, container_elems_class) for elem in value]
6280
return _cast_to_type(value, origin_class) # noqa
6381

0 commit comments

Comments
 (0)