Skip to content

Commit f2582a0

Browse files
committed
Merge branch 'main' into pawel/fix-split-pdf-memory-usage
2 parents c461d8c + 99c6385 commit f2582a0

File tree

11 files changed

+57
-53
lines changed

11 files changed

+57
-53
lines changed

.speakeasy/gen.lock

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ id: 8b5fa338-9106-4734-abf0-e30d67044a90
33
management:
44
docChecksum: 21f469b38bb72725739ee9d9d0fc8780
55
docVersion: 1.0.51
6-
speakeasyVersion: 1.418.1
7-
generationVersion: 2.438.3
8-
releaseVersion: 0.26.1
9-
configChecksum: 55ded3ef4f1b052725cdab6587da0ea4
6+
speakeasyVersion: 1.422.1
7+
generationVersion: 2.438.15
8+
releaseVersion: 0.26.2
9+
configChecksum: c46fa7f108a08d4565530aa29da677b5
1010
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
1111
repoSubDirectory: .
1212
installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git

.speakeasy/workflow.lock

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
speakeasyVersion: 1.418.1
1+
speakeasyVersion: 1.422.1
22
sources:
33
my-source:
44
sourceNamespace: my-source
5-
sourceRevisionDigest: sha256:a820d523af3e56f0dd1fc84f4f29e15330edb26cc253e93981bddb4a5176ac3c
5+
sourceRevisionDigest: sha256:31c94056ebc941cdfcf3fd4ba5e04880e978740963f7ce79169ba66cd033d74d
66
sourceBlobDigest: sha256:27e4879df402e924f9f65d336ea6d2fc8b16a00b87b4a802866238f7e9f639d3
77
tags:
88
- latest
@@ -11,7 +11,7 @@ targets:
1111
unstructured-python:
1212
source: my-source
1313
sourceNamespace: my-source
14-
sourceRevisionDigest: sha256:a820d523af3e56f0dd1fc84f4f29e15330edb26cc253e93981bddb4a5176ac3c
14+
sourceRevisionDigest: sha256:31c94056ebc941cdfcf3fd4ba5e04880e978740963f7ce79169ba66cd033d74d
1515
sourceBlobDigest: sha256:27e4879df402e924f9f65d336ea6d2fc8b16a00b87b4a802866238f7e9f639d3
1616
workflow:
1717
workflowVersion: 1.0.0

RELEASES.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -704,4 +704,14 @@ Based on:
704704
### Generated
705705
- [python v0.26.1] .
706706
### Releases
707-
- [PyPI v0.26.1] https://pypi.org/project/unstructured-client/0.26.1 - .
707+
- [PyPI v0.26.1] https://pypi.org/project/unstructured-client/0.26.1 - .
708+
709+
## 2024-10-28 00:09:56
710+
### Changes
711+
Based on:
712+
- OpenAPI Doc
713+
- Speakeasy CLI 1.422.1 (2.438.15) https://github.com/speakeasy-api/speakeasy
714+
### Generated
715+
- [python v0.26.2] .
716+
### Releases
717+
- [PyPI v0.26.2] https://pypi.org/project/unstructured-client/0.26.2 - .

gen.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ generation:
1010
auth:
1111
oAuth2ClientCredentialsEnabled: false
1212
python:
13-
version: 0.26.1
13+
version: 0.26.2
1414
additionalDependencies:
1515
dev:
1616
deepdiff: '>=6.0'

src/unstructured_client/_hooks/custom/logger_hook.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,10 @@ def after_error(
7777
if response and response.status_code == 200:
7878
# NOTE: Even though this is an after_error method, due to split_pdf_hook logic we may get
7979
# a success here when one of the split requests was partitioned successfully
80-
logger.info("Successfully partitioned the document.")
81-
82-
else:
83-
logger.error("Failed to partition the document.")
84-
if response:
85-
logger.error("Server responded with %d - %s", response.status_code, response.text)
86-
if error is not None:
87-
logger.error("Following error occurred - %s", error)
88-
80+
return response, error
81+
logger.error("Failed to partition the document.")
82+
if response:
83+
logger.error("Server responded with %d - %s", response.status_code, response.text)
84+
if error is not None:
85+
logger.error("Following error occurred - %s", error)
8986
return response, error

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ def sdk_init(
146146
Returns:
147147
Tuple[str, HttpClient]: The initialized SDK options.
148148
"""
149-
150149
class DummyTransport(httpx.BaseTransport):
151150
def __init__(self, base_transport: httpx.BaseTransport):
152151
self.base_transport = base_transport
@@ -238,39 +237,32 @@ def before_request(
238237
if split_pdf_page is None or split_pdf_page == "false":
239238
return request
240239

241-
logger.info("Preparing to split document for partition.")
242240
file = form_data.get(PARTITION_FORM_FILES_KEY)
243241
if (
244242
file is None
245243
or not isinstance(file, shared.Files)
246244
or not pdf_utils.is_pdf(file)
247245
):
248-
logger.info("Partitioning without split.")
249246
return request
250247

251248
starting_page_number = form_utils.get_starting_page_number(
252249
form_data,
253250
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
254251
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
255252
)
256-
if starting_page_number > 1:
257-
logger.info("Starting page number set to %d", starting_page_number)
258-
logger.info("Starting page number set to %d", starting_page_number)
259253

260254
self.allow_failed = form_utils.get_split_pdf_allow_failed_param(
261255
form_data,
262256
key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
263257
fallback_value=DEFAULT_ALLOW_FAILED,
264258
)
265-
logger.info("Allow failed set to %d", self.allow_failed)
266259

267260
concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
268261
form_data,
269262
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
270263
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
271264
max_allowed=MAX_CONCURRENCY_LEVEL,
272265
)
273-
logger.info("Concurrency level set to %d", concurrency_level)
274266
limiter = asyncio.Semaphore(concurrency_level)
275267

276268
content = cast(bytes, file.content)
@@ -283,25 +275,14 @@ def before_request(
283275
)
284276

285277
page_count = page_range_end - page_range_start + 1
286-
logger.info(
287-
"Splitting pages %d to %d (%d total)",
288-
page_range_start,
289-
page_range_end,
290-
page_count,
291-
)
292278

293279
split_size = get_optimal_split_size(
294280
num_pages=page_count, concurrency_level=concurrency_level
295281
)
296-
logger.info("Determined optimal split size of %d pages.", split_size)
297282

298283
# If the doc is small enough, and we aren't slicing it with a page range:
299284
# do not split, just continue with the original request
300285
if split_size >= page_count and page_count == len(pdf.pages):
301-
logger.info(
302-
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
303-
page_count,
304-
)
305286
return request
306287

307288
pages = self._get_pdf_pages(
@@ -329,7 +310,7 @@ def before_request(
329310
# Use a variable to adjust the httpx client timeout, or default to 60 minutes
330311
# When we're able to reuse the SDK to make these calls, we can remove this var
331312
# The SDK timeout will be controlled by parameter
332-
client_timeout_minutes = 30
313+
client_timeout_minutes = 60
333314
if timeout_var := os.getenv("UNSTRUCTURED_CLIENT_TIMEOUT_MINUTES"):
334315
client_timeout_minutes = int(timeout_var)
335316

@@ -365,14 +346,8 @@ async def call_api_partial(pdf_chunk: Tuple[BinaryIO, int]):
365346

366347
self.coroutines_to_execute[operation_id] = []
367348
set_index = 1
368-
for pdf_chunk_file, page_index, all_pages_number in pages:
349+
for pdf_chunk_file, page_index in pages:
369350
page_number = page_index + starting_page_number
370-
logger.info(
371-
"Partitioning set #%d (pages %d-%d).",
372-
set_index,
373-
page_number,
374-
min(page_number + split_size - 1, all_pages_number),
375-
)
376351

377352
coroutine = call_api_partial((pdf_chunk_file, page_number))
378353
self.coroutines_to_execute[operation_id].append(coroutine)

src/unstructured_client/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import importlib.metadata
44

55
__title__: str = "unstructured-client"
6-
__version__: str = "0.26.1"
6+
__version__: str = "0.26.2"
77

88
try:
99
if __package__ is not None:

src/unstructured_client/general.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,18 +99,20 @@ def partition(
9999
data = utils.unmarshal_json(http_res.text, errors.HTTPValidationErrorData)
100100
raise errors.HTTPValidationError(data=data)
101101
if utils.match_response(http_res, "4XX", "*"):
102+
http_res_text = utils.stream_to_text(http_res)
102103
raise errors.SDKError(
103-
"API error occurred", http_res.status_code, http_res.text, http_res
104+
"API error occurred", http_res.status_code, http_res_text, http_res
104105
)
105106
if utils.match_response(http_res, "5XX", "application/json"):
106107
data = utils.unmarshal_json(http_res.text, errors.ServerErrorData)
107108
raise errors.ServerError(data=data)
108109

109110
content_type = http_res.headers.get("Content-Type")
111+
http_res_text = utils.stream_to_text(http_res)
110112
raise errors.SDKError(
111113
f"Unexpected response received (code: {http_res.status_code}, type: {content_type})",
112114
http_res.status_code,
113-
http_res.text,
115+
http_res_text,
114116
http_res,
115117
)
116118

@@ -204,17 +206,19 @@ async def partition_async(
204206
data = utils.unmarshal_json(http_res.text, errors.HTTPValidationErrorData)
205207
raise errors.HTTPValidationError(data=data)
206208
if utils.match_response(http_res, "4XX", "*"):
209+
http_res_text = await utils.stream_to_text_async(http_res)
207210
raise errors.SDKError(
208-
"API error occurred", http_res.status_code, http_res.text, http_res
211+
"API error occurred", http_res.status_code, http_res_text, http_res
209212
)
210213
if utils.match_response(http_res, "5XX", "application/json"):
211214
data = utils.unmarshal_json(http_res.text, errors.ServerErrorData)
212215
raise errors.ServerError(data=data)
213216

214217
content_type = http_res.headers.get("Content-Type")
218+
http_res_text = await utils.stream_to_text_async(http_res)
215219
raise errors.SDKError(
216220
f"Unexpected response received (code: {http_res.status_code}, type: {content_type})",
217221
http_res.status_code,
218-
http_res.text,
222+
http_res_text,
219223
http_res,
220224
)

src/unstructured_client/sdkconfiguration.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ class SDKConfiguration:
3434
server: Optional[str] = ""
3535
language: str = "python"
3636
openapi_doc_version: str = "1.0.51"
37-
sdk_version: str = "0.26.1"
38-
gen_version: str = "2.438.3"
39-
user_agent: str = "speakeasy-sdk/python 0.26.1 2.438.3 1.0.51 unstructured-client"
37+
sdk_version: str = "0.26.2"
38+
gen_version: str = "2.438.15"
39+
user_agent: str = "speakeasy-sdk/python 0.26.2 2.438.15 1.0.51 unstructured-client"
4040
retry_config: OptionalNullable[RetryConfig] = Field(default_factory=lambda: UNSET)
4141
timeout_ms: Optional[int] = None
4242

src/unstructured_client/utils/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
serialize_float,
2828
serialize_int,
2929
stream_to_text,
30+
stream_to_text_async,
31+
stream_to_bytes,
32+
stream_to_bytes_async,
3033
validate_const,
3134
validate_decimal,
3235
validate_float,
@@ -80,6 +83,9 @@
8083
"serialize_request_body",
8184
"SerializedRequestBody",
8285
"stream_to_text",
86+
"stream_to_text_async",
87+
"stream_to_bytes",
88+
"stream_to_bytes_async",
8389
"template_url",
8490
"unmarshal",
8591
"unmarshal_json",

0 commit comments

Comments
 (0)