Skip to content

Commit b06af78

Browse files
committed
chore: cleaning
1 parent 09fb86f commit b06af78

File tree

3 files changed

+109
-190
lines changed

3 files changed

+109
-190
lines changed

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import io
44
import logging
5-
from typing import cast, Optional
5+
from typing import cast, Optional, BinaryIO
66

77
from pypdf import PdfReader
88
from pypdf.errors import PdfReadError
@@ -17,33 +17,20 @@
1717
pdf_logger = logging.getLogger("pypdf")
1818
pdf_logger.setLevel(logging.ERROR)
1919

def read_pdf(pdf_file: BinaryIO) -> Optional[PdfReader]:
    """Reads the given PDF file.

    Args:
        pdf_file: The PDF to be read. Normally a binary file-like object;
            raw ``bytes`` / ``bytearray`` content is also accepted and is
            wrapped in an in-memory stream before being handed to pypdf.

    Returns:
        The PdfReader object, or None if reading the input raises
        PdfReadError or UnicodeDecodeError.
    """
    # Wrap raw content in a BytesIO so PdfReader always receives a stream.
    # A separate local is used instead of rebinding the parameter, which
    # keeps the BinaryIO annotation truthful and makes a cast unnecessary.
    stream: BinaryIO
    if isinstance(pdf_file, (bytes, bytearray)):
        stream = io.BytesIO(pdf_file)
    else:
        stream = pdf_file

    try:
        return PdfReader(stream, strict=False)
    except (PdfReadError, UnicodeDecodeError):
        return None
35-
36-
def is_pdf(file: shared.Files) -> bool:
37-
"""Checks if the given file is a PDF.
38-
39-
Tries to read that file. If there is no error then we assume it is a proper PDF.
40-
41-
Args:
42-
file: The file to be checked.
43-
44-
Returns:
45-
True if the file is a PDF, False otherwise.
46-
"""
47-
48-
return read_pdf(file) is not None
49-

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 38 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import copy
4+
import io
55
import json
66
import logging
77
from typing import Tuple, Any, BinaryIO
88

99
import httpx
10-
from httpx._multipart import MultipartStream
10+
from httpx._multipart import DataField, FileField
1111
from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore
1212

1313
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
@@ -24,99 +24,51 @@
2424

2525
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
2626

27-
28-
def create_pdf_request_body(
29-
form_data: FormData,
30-
pdf_chunk: BinaryIO,
31-
filename: str,
32-
page_number: int
33-
) -> MultipartEncoder:
34-
"""Creates the request body for the partition API."
35-
36-
Args:
37-
form_data: The form data.
38-
pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
39-
filename: The filename.
40-
page_number: The page number.
41-
42-
"""
43-
payload = prepare_request_payload(form_data)
44-
45-
payload_fields: list[tuple[str, Any]] = []
46-
for key, value in payload.items():
47-
if isinstance(value, list):
48-
payload_fields.extend([(key, list_value) for list_value in value])
49-
else:
50-
payload_fields.append((key, value))
51-
52-
payload_fields.append((PARTITION_FORM_FILES_KEY, (
53-
filename,
54-
pdf_chunk,
55-
"application/pdf",
56-
)))
57-
58-
return MultipartEncoder(fields=payload_fields)
59-
60-
61-
def create_pdf_request_data(
62-
form_data: FormData,
63-
pdf_chunk: BinaryIO,
64-
filename: str,
65-
page_number: int
66-
) -> dict[str, Any]:
67-
"""Creates the request body for the partition API."
68-
69-
Args:
70-
form_data: The form data.
71-
pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
72-
filename: The filename.
73-
page_number: The page number.
74-
75-
"""
76-
payload = prepare_request_payload(form_data)
77-
payload[PARTITION_FORM_STARTING_PAGE_NUMBER_KEY] = str(page_number)
78-
79-
# payload[PARTITION_FORM_FILES_KEY] = (
80-
# filename,
81-
# pdf_chunk,
82-
# "application/pdf",
83-
# )
84-
#
85-
# return MultipartEncoder(fields=payload_fields)
86-
return payload
87-
def get_multipart_stream_fields(request: httpx.Request) -> dict[str, Any]:
    """Extracts the multipart fields from the request.

    Args:
        request: The request object.

    Returns:
        A mapping of field name to value. Data fields whose name carries the
        ``[]`` marker are collected into a list stored under the name with
        the marker removed; every other data field maps to its single value;
        file fields map to a dict with ``filename``, ``content_type`` and
        ``file`` keys. An empty dict is returned when the request is not
        multipart or its stream exposes no ``fields`` attribute.
    """
    content_type = request.headers.get("Content-Type", "")
    if "multipart" not in content_type:
        return {}
    if request.stream is None or not hasattr(request.stream, "fields"):
        return {}

    mapped_fields: dict[str, Any] = {}
    for field in request.stream.fields:
        if isinstance(field, DataField):
            if "[]" in field.name:
                # Repeated "name[]" entries are gathered into one list
                # stored under "name".
                name = field.name.replace("[]", "")
                mapped_fields.setdefault(name, []).append(field.value)
            else:
                # Only plain fields are stored under their own name. Doing
                # this for array entries as well would leave a stray
                # "name[]" key holding just the last value seen.
                mapped_fields[field.name] = field.value
        elif isinstance(field, FileField):
            mapped_fields[field.name] = {
                "filename": field.filename,
                "content_type": field.headers.get("Content-Type", ""),
                "file": field.file,
            }
    return mapped_fields
59+
60+
def create_pdf_chunk_request_params(
10961
form_data: FormData,
11062
page_number: int
11163
) -> dict[str, Any]:
11264
"""Creates the request body for the partition API.
11365
11466
Args:
11567
form_data: The form data.
116-
pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
117-
filename: The filename.
11868
page_number: The page number.
11969
70+
Returns:
71+
The updated request payload for the chunk.
12072
"""
12173
fields_to_drop = [
12274
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
@@ -140,15 +92,16 @@ def create_pdf_chunk_request(
14092
14193
Args:
14294
form_data: The form data.
143-
pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
95+
pdf_chunk: Tuple of pdf chunk contents (can be both io.BytesIO or
96+
a file object created with e.g. open()) and the page number.
14497
original_request: The original request.
14598
filename: The filename.
14699
147100
Returns:
148101
The updated request object.
149102
"""
150103
pdf_chunk_file, page_number = pdf_chunk
151-
data = create_pdf_chunk_request_data(form_data, page_number)
104+
data = create_pdf_chunk_request_params(form_data, page_number)
152105
original_headers = prepare_request_headers(original_request.headers)
153106

154107
pdf_chunk_partition_params = shared.PartitionParameters(
@@ -166,23 +119,14 @@ def create_pdf_chunk_request(
166119
"multipart",
167120
shared.PartitionParameters,
168121
)
169-
# chunk_request = client.build_request(
170-
# method="POST",
171-
# url=original_request.url or "",
172-
# headers={**original_headers, "Content-Type": "application/pdf"},
173-
# files={PARTITION_FORM_FILES_KEY: (filename, pdf_chunk_file, "application/pdf")},
174-
# data=data,
175-
# )
176-
pdf_chunk_request = httpx.Request(
122+
return httpx.Request(
177123
method="POST",
178124
url=original_request.url or "",
179-
# headers={**original_headers, "Content-Type": serialized_body.media_type},
180125
headers={**original_headers},
181126
content=serialized_body.content,
182127
data=serialized_body.data,
183128
files=serialized_body.files,
184129
)
185-
return pdf_chunk_request
186130

187131

188132

@@ -192,17 +136,6 @@ async def call_api_async(
192136
pdf_chunk_file: BinaryIO,
193137
limiter: asyncio.Semaphore,
194138
) -> httpx.Response:
195-
# pdf_chunk_file, page_number = pdf_chunk
196-
# body = create_pdf_request_body(form_data, pdf_chunk_file, filename, page_number)
197-
# original_headers = prepare_request_headers(original_request.headers)
198-
199-
# new_request = httpx.Request(
200-
# method="POST",
201-
# url=original_request.url or "",
202-
# content=body.to_string(),
203-
# headers={**original_headers, "Content-Type": body.content_type},
204-
# )
205-
206139
one_second = 1000
207140
one_minute = 1000 * 60
208141

@@ -237,7 +170,7 @@ async def do_request():
237170
print(e)
238171
raise e
239172
finally:
240-
if not pdf_chunk_file.closed:
173+
if not isinstance(pdf_chunk_file, io.BytesIO) and not pdf_chunk_file.closed:
241174
pdf_chunk_file.close()
242175

243176

@@ -257,29 +190,6 @@ def prepare_request_headers(
257190
new_headers.pop("Content-Length", None)
258191
return new_headers
259192

260-
261-
def prepare_request_payload(form_data: FormData) -> FormData:
262-
"""Prepares the request payload by removing unnecessary keys and updating the file.
263-
264-
Args:
265-
form_data: The original form data.
266-
267-
Returns:
268-
The updated request payload.
269-
"""
270-
payload = copy.deepcopy(form_data)
271-
payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
272-
payload.pop(PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, None)
273-
payload.pop(PARTITION_FORM_FILES_KEY, None)
274-
payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None)
275-
payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
276-
updated_parameters = {
277-
PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",
278-
}
279-
payload.update(updated_parameters)
280-
return payload
281-
282-
283193
def create_response(elements: list) -> httpx.Response:
284194
"""
285195
Creates a modified response object with updated content.

0 commit comments

Comments
 (0)