feat: support list type parameters (#368)

christinestraub · web-flow · commit 023307f28310 · 2024-02-20T22:22:30.000-08:00
The purpose of this PR is to support parsing all list type parameters,
including `extract_image_block_types`, when calling the unstructured API via
the unstructured client SDK (Python/JS) generated by `speakeasy`.

Currently, `speakeasy` doesn't generate proper client code for passing
list type parameters to the unstructured API, because `speakeasy` does not
intend to support client code specific to `FastAPI`, which the unstructured
API relies on. To address this issue, I updated the unstructured API code to
parse all list type parameters passed as JSON-formatted lists (e.g.
`'["image", "table"]'`).


**NOTE:** You must pass the list type parameter as a JSON-formatted list
when calling the unstructured API via the unstructured client SDK
(e.g. `extract_image_block_types='["image", "table"]'`,
`skip_infer_table_types='["docx", "xlsx"]'`, ...).


### Summary
- update `SmartValueParser.value_or_first_element()` to parse a
JSON-formatted string (e.g. `'["image", "table"]'`) that is convertible to a
list
- apply `SmartValueParser.value_or_first_element()` to all list type
parameters
- remove existing `extract_image_block_types` parsing logic

### Testing
- via unstructured_client_sdk (Python)
```
s = UnstructuredClient(
    server_url="http://localhost:8000/general/v0/general",
    api_key_auth="YOUR-API-KEY"
)

filename = "sample-docs/embedded-images-tables.pdf"

with open(filename, "rb") as f:
    # Note that this currently only supports a single file
    files = shared.Files(
        content=f.read(),
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    # Other partition params
    strategy="hi_res",
    extract_image_block_types='["image", "table"]',
    languages=["pdf"],
)

try:
    resp = s.general.partition(req)
    print([el.get("metadata").get("image_mime_type") for el in resp.elements if el.get("metadata").get("image_mime_type")])
except SDKError as e:
    print(e)
```

- via unstructured_client_sdk (JS)
```
import { UnstructuredClient } from "unstructured-client";
import * as fs from "fs";

const key = "YOUR-API-KEY";

const client = new UnstructuredClient({
    serverURL: "http://localhost:8000",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "sample-docs/embedded-images-tables.pdf";
const data = fs.readFileSync(filename);

client.general.partition({
    // Note that this currently only supports a single file
    files: {
        content: data,
        fileName: filename,
    },
    // Other partition params
    strategy: "hi_res",
    extractImageBlockTypes: '["image", "table"]',
}).then((res) => {
    if (res.statusCode == 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    console.log(e.statusCode);
    console.log(e.body);
});

```
- via default `requests` client (Python)
```
url = "http://localhost:8000/general/v0/general"

headers = {
    'accept': 'application/json',
    'unstructured-api-key': "YOUR-API-KEY"
}

data = {
    "strategy": "hi_res",
    "extract_image_block_types": ["Image", "Table"],

}

filename = "sample-docs/embedded-images-tables.pdf"
file_data = {'files': open(filename, 'rb')}

response = requests.post(url, headers=headers, data=data, files=file_data)

file_data['files'].close()

elements = response.json()
print([el.get("metadata").get("image_mime_type") for el in elements if el.get("metadata").get("image_mime_type")])
```

- via `curl` command
```
curl -X 'POST' \
  'http://localhost:8000/general/v0/general' \
  -H 'accept: application/json' \
  -H 'Content-Type: multipart/form-data' \
  -F 'files=@sample-docs/embedded-images-tables.pdf' \
  -F 'strategy=hi_res' \
  -F 'extract_image_block_types=["image", "table"]' \
  | jq -C . | less -R
```
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -359,16 +359,6 @@ def pipeline_api(
 
     ocr_languages_str = "+".join(ocr_languages) if ocr_languages and len(ocr_languages) else None
 
-    if extract_image_block_types:
-        try:
-            # Handle the case when the user passes the table of strings as a json inside the
-            # first element of the array
-            loaded_array = json.loads(extract_image_block_types[0])
-            if isinstance(loaded_array, list):
-                extract_image_block_types = loaded_array
-        except (json.JSONDecodeError, IndexError):
-            pass  # noqa
-
     extract_image_block_to_payload = bool(extract_image_block_types)
 
     try:
diff --git a/prepline_general/api/models/form_params.py b/prepline_general/api/models/form_params.py
@@ -52,6 +52,7 @@ def as_form(
                 description="The languages present in the document, for use in partitioning and/or OCR",
                 example="[eng]",
             ),
+            BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
         ] = [],  # noqa
         ocr_languages: Annotated[
             List[str],
@@ -149,6 +150,7 @@ def as_form(
                 description="The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields",
                 example="""["image", "table"]""",
             ),
+            BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
         ] = [],  # noqa
         # -- chunking options --
         chunking_strategy: Annotated[
diff --git a/prepline_general/api/utils.py b/prepline_general/api/utils.py
@@ -1,4 +1,5 @@
-from typing import TypeVar, Union, List, Optional, Generic, get_origin, get_args, Type, Any
+import json
+from typing import TypeVar, Union, List, Optional, Generic, get_origin, get_args, Type, Any, Tuple
 
 T = TypeVar("T")
 E = TypeVar("E")
@@ -37,6 +38,19 @@ def _return_cast_first_element(values: list[E], origin_class: type) -> E | None:
     return value
 
 
+def is_convertible_to_list(s: str) -> Tuple[bool, Union[List, str]]:
+    """Determines if a given string is convertible to a list by attempting to parse it as JSON."""
+
+    try:
+        result = json.loads(s)
+        if isinstance(result, list):
+            return True, result  # Return the list if conversion is successful
+        else:
+            return False, "Input is valid JSON but not a list."  # Valid JSON but not a list
+    except json.JSONDecodeError:
+        return False, "Input is not valid JSON."  # Invalid JSON
+
+
 class SmartValueParser(Generic[T]):
     """Class handle api parameters that are passed in form of a specific value or as a list of strings from which
     the first element is used, cast to a proper type
@@ -58,6 +72,10 @@ def value_or_first_element(self, value: Union[T, list[T]]) -> list[T] | T | None
             extracted_value: T | None = _return_cast_first_element(value, origin_class)
             return extracted_value
         elif isinstance(value, list) and origin_class == list and container_elems_class:
+            if len(value) == 1:
+                is_list, result = is_convertible_to_list(str(value[0]))
+                new_value = result if is_list else value
+                return [_cast_to_type(elem, container_elems_class) for elem in new_value]
             return [_cast_to_type(elem, container_elems_class) for elem in value]
         return _cast_to_type(value, origin_class)  # noqa