Skip to content

Commit 85bef70

Browse files
authored
Merge pull request #13 from Unstructured-IO/jj/cleaning
Cleaning
2 parents fd2a559 + e0761fa commit 85bef70

File tree

16 files changed

+531
-334
lines changed

16 files changed

+531
-334
lines changed

README.md

Lines changed: 22 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -34,30 +34,37 @@ from unstructured_client.models.errors import SDKError
3434

3535
s = UnstructuredClient(api_key_auth="YOUR_API_KEY")
3636

37-
filename = "sample-docs/layout-parser-paper.pdf"
38-
file = open(filename, "rb")
37+
filename = "sample-docs/layout-parser-paper-fast.pdf"
3938

40-
req = shared.PartitionParameters(
39+
with open(filename, "rb") as f:
4140
# Note that this currently only supports a single file
42-
files=shared.Files(
43-
content=file.read(),
44-
file_name=filename,
45-
),
46-
# Other partition params
47-
strategy="fast",
41+
files=shared.Files(
42+
content=f.read(),
43+
file_name=filename,
44+
)
45+
46+
req = shared.PartitionParameters(
47+
files=files,
48+
strategy='ocr_only',
49+
languages=["eng"],
4850
)
4951

5052
try:
51-
res = s.general.partition(req)
52-
print(res.elements[0])
53+
resp = s.general.partition(req)
54+
print(resp.elements[0])
5355
except SDKError as e:
5456
print(e)
5557

5658
# {
57-
# 'type': 'Title',
58-
# 'element_id': '015301d4f56aa4b20ec10ac889d2343f',
59-
# 'metadata': {'filename': 'layout-parser-paper.pdf', 'filetype': 'application/pdf', 'page_number': 1},
60-
# 'text': 'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
59+
# 'type': 'UncategorizedText',
60+
# 'element_id': 'fc550084fda1e008e07a0356894f5816',
61+
# 'metadata': {
62+
# 'filename': 'layout-parser-paper-fast.pdf',
63+
# 'filetype': 'application/pdf',
64+
# 'languages': ['eng'],
65+
# 'page_number': 1
66+
# },
67+
# 'text': '2103.15348v2 [cs.CV] 21 Jun 2021'
6168
# }
6269
```
6370

168 KB
Binary file not shown.

setup.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -32,9 +32,7 @@
3232
"typing_extensions>=4.7.1",
3333
"urllib3>=2.0.4",
3434
],
35-
extras_require={
36-
"dev":["pylint==2.16.2"]
37-
},
38-
package_dir={'': 'src'},
39-
python_requires='>=3.8'
35+
extras_require={"dev": ["pylint==2.16.2"]},
36+
package_dir={"": "src"},
37+
python_requires=">=3.8",
4038
)

src/unstructured_client/general.py

Lines changed: 59 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -5,65 +5,93 @@
55
from unstructured_client import utils
66
from unstructured_client.models import errors, operations, shared
77

8+
89
class General:
910
sdk_configuration: SDKConfiguration
1011

1112
def __init__(self, sdk_config: SDKConfiguration) -> None:
1213
self.sdk_configuration = sdk_config
13-
14-
15-
16-
def partition(self, request: shared.PartitionParameters, retries: Optional[utils.RetryConfig] = None) -> operations.PartitionResponse:
14+
15+
def partition(
16+
self,
17+
request: shared.PartitionParameters,
18+
retries: Optional[utils.RetryConfig] = None,
19+
) -> operations.PartitionResponse:
1720
r"""Pipeline 1"""
1821
base_url = utils.template_url(*self.sdk_configuration.get_server_details())
19-
20-
url = base_url + '/general/v0/general'
22+
23+
url = base_url + "/general/v0/general"
2124
headers = {}
22-
req_content_type, data, form = utils.serialize_request_body(request, "request", False, True, 'multipart')
23-
if req_content_type not in ('multipart/form-data', 'multipart/mixed'):
24-
headers['content-type'] = req_content_type
25-
headers['Accept'] = 'application/json'
26-
headers['user-agent'] = self.sdk_configuration.user_agent
27-
25+
req_content_type, data, form = utils.serialize_request_body(
26+
request, "request", False, True, "multipart"
27+
)
28+
if req_content_type not in ("multipart/form-data", "multipart/mixed"):
29+
headers["content-type"] = req_content_type
30+
headers["Accept"] = "application/json"
31+
headers["user-agent"] = self.sdk_configuration.user_agent
32+
2833
if callable(self.sdk_configuration.security):
29-
client = utils.configure_security_client(self.sdk_configuration.client, self.sdk_configuration.security())
34+
client = utils.configure_security_client(
35+
self.sdk_configuration.client, self.sdk_configuration.security()
36+
)
3037
else:
31-
client = utils.configure_security_client(self.sdk_configuration.client, self.sdk_configuration.security)
32-
38+
client = utils.configure_security_client(
39+
self.sdk_configuration.client, self.sdk_configuration.security
40+
)
41+
3342
global_retry_config = self.sdk_configuration.retry_config
3443
retry_config = retries
3544
if retry_config is None:
3645
if global_retry_config:
3746
retry_config = global_retry_config
3847
else:
39-
retry_config = utils.RetryConfig('backoff', utils.BackoffStrategy(500, 60000, 1.5, 3600000), True)
48+
retry_config = utils.RetryConfig(
49+
"backoff", utils.BackoffStrategy(500, 60000, 1.5, 3600000), True
50+
)
4051

4152
def do_request():
42-
return client.request('POST', url, data=data, files=form, headers=headers)
53+
return client.request("POST", url, data=data, files=form, headers=headers)
54+
55+
http_res = utils.retry(do_request, utils.Retries(retry_config, ["5xx"]))
56+
content_type = http_res.headers.get("Content-Type")
4357

44-
http_res = utils.retry(do_request, utils.Retries(retry_config, [
45-
'5xx'
46-
]))
47-
content_type = http_res.headers.get('Content-Type')
58+
res = operations.PartitionResponse(
59+
status_code=http_res.status_code,
60+
content_type=content_type,
61+
raw_response=http_res,
62+
)
4863

49-
res = operations.PartitionResponse(status_code=http_res.status_code, content_type=content_type, raw_response=http_res)
50-
5164
if http_res.status_code == 200:
52-
if utils.match_content_type(content_type, 'application/json'):
65+
if utils.match_content_type(content_type, "application/json"):
5366
out = utils.unmarshal_json(http_res.text, Optional[List[Any]])
5467
res.elements = out
5568
else:
56-
raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res)
69+
raise errors.SDKError(
70+
f"unknown content-type received: {content_type}",
71+
http_res.status_code,
72+
http_res.text,
73+
http_res,
74+
)
5775
elif http_res.status_code == 422:
58-
if utils.match_content_type(content_type, 'application/json'):
76+
if utils.match_content_type(content_type, "application/json"):
5977
out = utils.unmarshal_json(http_res.text, errors.HTTPValidationError)
6078
out.raw_response = http_res
6179
raise out
6280
else:
63-
raise errors.SDKError(f'unknown content-type received: {content_type}', http_res.status_code, http_res.text, http_res)
64-
elif http_res.status_code >= 400 and http_res.status_code < 500 or http_res.status_code >= 500 and http_res.status_code < 600:
65-
raise errors.SDKError('API error occurred', http_res.status_code, http_res.text, http_res)
81+
raise errors.SDKError(
82+
f"unknown content-type received: {content_type}",
83+
http_res.status_code,
84+
http_res.text,
85+
http_res,
86+
)
87+
elif (
88+
http_res.status_code >= 400
89+
and http_res.status_code < 500
90+
or http_res.status_code >= 500
91+
and http_res.status_code < 600
92+
):
93+
raise errors.SDKError(
94+
"API error occurred", http_res.status_code, http_res.text, http_res
95+
)
6696

6797
return res
68-
69-

src/unstructured_client/models/errors/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,4 @@
44
from .sdkerror import *
55
from .validationerror import *
66

7-
__all__ = ["HTTPValidationError","SDKError","ValidationError"]
7+
__all__ = ["HTTPValidationError", "SDKError", "ValidationError"]

src/unstructured_client/models/errors/httpvalidationerror.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,17 @@
99

1010

1111
@dataclass_json(undefined=Undefined.EXCLUDE)
12-
1312
@dataclasses.dataclass
1413
class HTTPValidationError(Exception):
15-
detail: Optional[List[ValidationError]] = dataclasses.field(default=None, metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('detail'), 'exclude': lambda f: f is None }})
16-
14+
detail: Optional[List[ValidationError]] = dataclasses.field(
15+
default=None,
16+
metadata={
17+
"dataclasses_json": {
18+
"letter_case": utils.get_field_name("detail"),
19+
"exclude": lambda f: f is None,
20+
}
21+
},
22+
)
1723

1824
def __str__(self) -> str:
1925
return utils.marshal_json(self)

src/unstructured_client/models/errors/sdkerror.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,20 +5,27 @@
55

66
class SDKError(Exception):
77
"""Represents an error returned by the API."""
8+
89
message: str
910
status_code: int
1011
body: str
1112
raw_response: requests_http.Response
1213

13-
def __init__(self, message: str, status_code: int, body: str, raw_response: requests_http.Response):
14+
def __init__(
15+
self,
16+
message: str,
17+
status_code: int,
18+
body: str,
19+
raw_response: requests_http.Response,
20+
):
1421
self.message = message
1522
self.status_code = status_code
1623
self.body = body
1724
self.raw_response = raw_response
1825

1926
def __str__(self):
20-
body = ''
27+
body = ""
2128
if len(self.body) > 0:
22-
body = f'\n{self.body}'
29+
body = f"\n{self.body}"
2330

24-
return f'{self.message}: Status {self.status_code}{body}'
31+
return f"{self.message}: Status {self.status_code}{body}"

src/unstructured_client/models/errors/validationerror.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,12 @@
1010
@dataclass_json(undefined=Undefined.EXCLUDE)
1111
@dataclasses.dataclass
1212
class ValidationError:
13-
loc: List[Union[str, int]] = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('loc') }})
14-
msg: str = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('msg') }})
15-
type: str = dataclasses.field(metadata={'dataclasses_json': { 'letter_case': utils.get_field_name('type') }})
16-
17-
13+
loc: List[Union[str, int]] = dataclasses.field(
14+
metadata={"dataclasses_json": {"letter_case": utils.get_field_name("loc")}}
15+
)
16+
msg: str = dataclasses.field(
17+
metadata={"dataclasses_json": {"letter_case": utils.get_field_name("msg")}}
18+
)
19+
type: str = dataclasses.field(
20+
metadata={"dataclasses_json": {"letter_case": utils.get_field_name("type")}}
21+
)

src/unstructured_client/models/operations/partition.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,5 +16,3 @@ class PartitionResponse:
1616
r"""Successful Response"""
1717
raw_response: Optional[requests_http.Response] = dataclasses.field(default=None)
1818
r"""Raw HTTP response; suitable for custom response parsing"""
19-
20-

src/unstructured_client/models/shared/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,4 +3,4 @@
33
from .partition_parameters import *
44
from .security import *
55

6-
__all__ = ["Files","PartitionParameters","Security"]
6+
__all__ = ["Files", "PartitionParameters", "Security"]

0 commit comments

Comments
 (0)