Skip to content

Commit 09fb86f

Browse files
committed
feat: saving working efficient memory saving
1 parent e6481b8 commit 09fb86f

File tree

3 files changed

+269
-89
lines changed

3 files changed

+269
-89
lines changed

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import io
44
import logging
5-
from typing import cast
5+
from typing import cast, Optional
66

77
from pypdf import PdfReader
88
from pypdf.errors import PdfReadError
@@ -17,7 +17,21 @@
1717
pdf_logger = logging.getLogger("pypdf")
1818
pdf_logger.setLevel(logging.ERROR)
1919

20+
def read_pdf(file: shared.Files) -> Optional[PdfReader]:
    """Reads the given PDF file.

    Args:
        file: The file to be read. Its ``content`` may be raw bytes or an
            already-open binary stream.

    Returns:
        The PdfReader object if the file is a PDF, None otherwise.
    """
    content = file.content

    try:
        # typing.cast converts nothing at runtime, so only bytes-like content
        # can be wrapped in BytesIO; a stream is handed to PdfReader as-is.
        if isinstance(content, (bytes, bytearray)):
            return PdfReader(io.BytesIO(content), strict=False)
        return PdfReader(content, strict=False)
    except (PdfReadError, UnicodeDecodeError):
        return None
2135

2236
def is_pdf(file: shared.Files) -> bool:
2337
"""Checks if the given file is a PDF.
@@ -31,10 +45,5 @@ def is_pdf(file: shared.Files) -> bool:
3145
True if the file is a PDF, False otherwise.
3246
"""
3347

34-
try:
35-
content = cast(bytes, file.content)
36-
PdfReader(io.BytesIO(content), strict=True)
37-
except (PdfReadError, UnicodeDecodeError):
38-
return False
48+
return read_pdf(file) is not None
3949

40-
return True

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 142 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Tuple, Any, BinaryIO
88

99
import httpx
10+
from httpx._multipart import MultipartStream
1011
from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore
1112

1213
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
@@ -18,7 +19,8 @@
1819
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
1920
FormData,
2021
)
21-
from unstructured_client.utils import BackoffStrategy, Retries, RetryConfig, retry_async
22+
from unstructured_client.models import shared
23+
from unstructured_client.utils import BackoffStrategy, Retries, RetryConfig, retry_async, serialize_request_body
2224

2325
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
2426

@@ -53,32 +55,153 @@ def create_pdf_request_body(
5355
"application/pdf",
5456
)))
5557

56-
payload_fields.append((PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, str(page_number)))
58+
return MultipartEncoder(fields=payload_fields)
5759

58-
body = MultipartEncoder(
59-
fields=payload_fields
60-
)
61-
return body
6260

61+
def create_pdf_request_data(
    form_data: FormData,
    pdf_chunk: BinaryIO,
    filename: str,
    page_number: int
) -> dict[str, Any]:
    """Builds the form fields that accompany a single PDF chunk.

    The fields are whatever `prepare_request_payload(form_data)` returns,
    with the starting page number entry set to `page_number` as a string.

    Args:
        form_data: The form data of the original request.
        pdf_chunk: The pdf chunk; accepted but not used by this function.
        filename: The filename; accepted but not used by this function.
        page_number: The page number stored under the starting page number key.

    Returns:
        The prepared payload including the starting page number.
    """
    chunk_fields = prepare_request_payload(form_data)
    chunk_fields.update({PARTITION_FORM_STARTING_PAGE_NUMBER_KEY: str(page_number)})
    return chunk_fields
87+
88+
def prepare_pdf_chunk_request_payload(form_data: FormData) -> FormData:
    """Derives the per-chunk payload from the original form data.

    Copies every entry of `form_data` except the splitting-related keys
    (split flag, allow-failed flag, files, page range, starting page number),
    then sets the split flag to the string "false".

    Args:
        form_data: The original form data; it is not modified.

    Returns:
        A new dict holding the filtered entries plus the disabled split flag.
    """
    excluded_keys = (
        PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
        PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
        PARTITION_FORM_FILES_KEY,
        PARTITION_FORM_PAGE_RANGE_KEY,
        PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
    )

    filtered = {}
    for field_name in form_data:
        if field_name in excluded_keys:
            continue
        filtered[field_name] = form_data[field_name]

    filtered[PARTITION_FORM_SPLIT_PDF_PAGE_KEY] = "false"
    return filtered
107+
108+
def create_pdf_chunk_request_data(
    form_data: FormData,
    page_number: int
) -> dict[str, Any]:
    """Creates the form fields for a single PDF chunk request.

    Args:
        form_data: The form data of the original request; it is not modified.
        page_number: The page number stored (as a string) under the starting
            page number key.

    Returns:
        A new dict with the splitting-related fields removed, the split flag
        set to "false", and the starting page number set.
    """
    # Reuse the shared filtering helper instead of repeating its key list here,
    # so the two functions cannot drift apart.
    chunk_payload = prepare_pdf_chunk_request_payload(form_data)
    chunk_payload[PARTITION_FORM_STARTING_PAGE_NUMBER_KEY] = str(page_number)
    return chunk_payload
132+
133+
def create_pdf_chunk_request(
    form_data: FormData,
    pdf_chunk: Tuple[BinaryIO, int],
    original_request: httpx.Request,
    filename: str,
) -> httpx.Request:
    """Creates a new request object with the updated payload for the partition API.

    Args:
        form_data: The form data of the original request.
        pdf_chunk: A `(file, page_number)` pair; the file can be an io.BytesIO
            or a file object (created with open()).
        original_request: The original request; its URL and headers are reused.
        filename: The filename reported for the chunk.

    Returns:
        A POST request for the chunk, carrying the serialized multipart body.

    Raises:
        ValueError: If the request body could not be serialized.
    """
    pdf_chunk_file, page_number = pdf_chunk
    data = create_pdf_chunk_request_data(form_data, page_number)
    original_headers = prepare_request_headers(original_request.headers)

    pdf_chunk_partition_params = shared.PartitionParameters(
        files=shared.Files(
            content=pdf_chunk_file,
            file_name=filename,
            content_type="application/pdf",
        ),
        **data,
    )
    serialized_body = serialize_request_body(
        pdf_chunk_partition_params,
        False,
        False,
        "multipart",
        shared.PartitionParameters,
    )
    # NOTE(review): serialize_request_body is defined elsewhere; guard in case
    # it yields no body, so the failure is explicit rather than an
    # AttributeError on the attribute accesses below. TODO confirm it can
    # return None.
    if serialized_body is None:
        raise ValueError("Failed to serialize the request body.")

    # No explicit Content-Type is set here: the body is passed via
    # content/data/files rather than as a pre-encoded string.
    return httpx.Request(
        method="POST",
        url=original_request.url or "",
        headers={**original_headers},
        content=serialized_body.content,
        data=serialized_body.data,
        files=serialized_body.files,
    )
186+
187+
188+
189+
async def call_api_async(
190+
client: httpx.AsyncClient,
191+
pdf_chunk_request: httpx.Request,
192+
pdf_chunk_file: BinaryIO,
193+
limiter: asyncio.Semaphore,
194+
) -> httpx.Response:
195+
# pdf_chunk_file, page_number = pdf_chunk
196+
# body = create_pdf_request_body(form_data, pdf_chunk_file, filename, page_number)
197+
# original_headers = prepare_request_headers(original_request.headers)
198+
199+
# new_request = httpx.Request(
200+
# method="POST",
201+
# url=original_request.url or "",
202+
# content=body.to_string(),
203+
# headers={**original_headers, "Content-Type": body.content_type},
204+
# )
82205

83206
one_second = 1000
84207
one_minute = 1000 * 60
@@ -101,7 +224,7 @@ async def call_api_async(
101224
]
102225

103226
async def do_request():
104-
return await client.send(new_request)
227+
return await client.send(pdf_chunk_request)
105228

106229
async with limiter:
107230
try:
@@ -110,6 +233,9 @@ async def do_request():
110233
Retries(retry_config, retryable_codes)
111234
)
112235
return response
236+
except Exception as e:
237+
print(e)
238+
raise e
113239
finally:
114240
if not pdf_chunk_file.closed:
115241
pdf_chunk_file.close()

0 commit comments

Comments
 (0)