77from typing import Tuple , Any , BinaryIO
88
99import httpx
10+ from httpx ._multipart import MultipartStream
1011from requests_toolbelt .multipart .encoder import MultipartEncoder # type: ignore
1112
1213from unstructured_client ._hooks .custom .common import UNSTRUCTURED_CLIENT_LOGGER_NAME
1819 PARTITION_FORM_STARTING_PAGE_NUMBER_KEY ,
1920 FormData ,
2021)
21- from unstructured_client .utils import BackoffStrategy , Retries , RetryConfig , retry_async
22+ from unstructured_client .models import shared
23+ from unstructured_client .utils import BackoffStrategy , Retries , RetryConfig , retry_async , serialize_request_body
2224
2325logger = logging .getLogger (UNSTRUCTURED_CLIENT_LOGGER_NAME )
2426
@@ -53,32 +55,153 @@ def create_pdf_request_body(
5355 "application/pdf" ,
5456 )))
5557
56- payload_fields . append (( PARTITION_FORM_STARTING_PAGE_NUMBER_KEY , str ( page_number )) )
58+ return MultipartEncoder ( fields = payload_fields )
5759
58- body = MultipartEncoder (
59- fields = payload_fields
60- )
61- return body
6260
def create_pdf_request_data(
    form_data: FormData,
    pdf_chunk: BinaryIO,
    filename: str,
    page_number: int
) -> dict[str, Any]:
    """Build the form fields sent alongside a single PDF chunk.

    The fields come from ``prepare_request_payload(form_data)``; the starting
    page number entry is then set to ``str(page_number)``.

    Args:
        form_data: The form data of the original request.
        pdf_chunk: The pdf chunk. Currently unused: the file is not attached
            to the returned payload.
        filename: The filename. Currently unused, for the same reason.
        page_number: Page number stored (as a string) under
            ``PARTITION_FORM_STARTING_PAGE_NUMBER_KEY``.

    Returns:
        The prepared payload including the starting page number.
    """
    request_data = prepare_request_payload(form_data)
    request_data[PARTITION_FORM_STARTING_PAGE_NUMBER_KEY] = str(page_number)
    return request_data
87+
def prepare_pdf_chunk_request_payload(form_data: FormData) -> FormData:
    """Derive the per-chunk payload from the original form data.

    Split-related, file, page-range and starting-page entries are removed,
    and PDF splitting is explicitly disabled for the chunk request.

    Args:
        form_data: The original form data. It is not modified.

    Returns:
        A new payload without the dropped keys and with
        ``PARTITION_FORM_SPLIT_PDF_PAGE_KEY`` set to ``"false"``.
    """
    dropped_keys = (
        PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
        PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
        PARTITION_FORM_FILES_KEY,
        PARTITION_FORM_PAGE_RANGE_KEY,
        PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
    )
    # Work on a shallow copy so the caller's form data stays untouched.
    payload = dict(form_data)
    for dropped in dropped_keys:
        payload.pop(dropped, None)
    # The split key was removed above, so this lands last in the payload.
    payload[PARTITION_FORM_SPLIT_PDF_PAGE_KEY] = "false"
    return payload
107+
def create_pdf_chunk_request_data(
    form_data: FormData,
    page_number: int
) -> dict[str, Any]:
    """Create the form fields for a single PDF chunk request.

    Delegates the key filtering to ``prepare_pdf_chunk_request_payload`` (which
    drops the split, file, page-range and starting-page entries and disables
    splitting) and then records where this chunk starts.

    Args:
        form_data: The form data of the original request. It is not modified.
        page_number: The starting page number of the chunk; stored as a string
            under ``PARTITION_FORM_STARTING_PAGE_NUMBER_KEY``.

    Returns:
        The chunk payload, including the starting page number.
    """
    # Reuse the shared helper instead of repeating its drop list here, so the
    # two functions cannot drift apart.
    chunk_payload = prepare_pdf_chunk_request_payload(form_data)
    chunk_payload[PARTITION_FORM_STARTING_PAGE_NUMBER_KEY] = str(page_number)
    return chunk_payload
132+
def create_pdf_chunk_request(
    form_data: FormData,
    pdf_chunk: Tuple[BinaryIO, int],
    original_request: httpx.Request,
    filename: str,
) -> httpx.Request:
    """Build the partition API request for one PDF chunk.

    Args:
        form_data: The form data of the original request.
        pdf_chunk: A ``(file, page_number)`` pair: the chunk's binary file
            object and the page number the chunk starts at.
        original_request: The original request; its URL and (prepared)
            headers are reused for the chunk request.
        filename: The filename reported for the uploaded chunk.

    Returns:
        A new POST request carrying the chunk as a multipart upload.
    """
    chunk_file, starting_page = pdf_chunk
    form_fields = create_pdf_chunk_request_data(form_data, starting_page)
    headers = prepare_request_headers(original_request.headers)

    chunk_upload = shared.Files(
        content=chunk_file,
        file_name=filename,
        content_type="application/pdf",
    )
    partition_params = shared.PartitionParameters(files=chunk_upload, **form_fields)

    # NOTE(review): the two False flags are presumably "nullable" and
    # "optional" - confirm against utils.serialize_request_body.
    body = serialize_request_body(
        partition_params,
        False,
        False,
        "multipart",
        shared.PartitionParameters,
    )

    # Content-Type is deliberately not set here; presumably httpx derives the
    # multipart header (with boundary) from data/files - TODO confirm that
    # prepare_request_headers strips the original Content-Type.
    return httpx.Request(
        method="POST",
        url=original_request.url or "",
        headers={**headers},
        content=body.content,
        data=body.data,
        files=body.files,
    )
186+
187+
188+
189+ async def call_api_async (
190+ client : httpx .AsyncClient ,
191+ pdf_chunk_request : httpx .Request ,
192+ pdf_chunk_file : BinaryIO ,
193+ limiter : asyncio .Semaphore ,
194+ ) -> httpx .Response :
195+ # pdf_chunk_file, page_number = pdf_chunk
196+ # body = create_pdf_request_body(form_data, pdf_chunk_file, filename, page_number)
197+ # original_headers = prepare_request_headers(original_request.headers)
198+
199+ # new_request = httpx.Request(
200+ # method="POST",
201+ # url=original_request.url or "",
202+ # content=body.to_string(),
203+ # headers={**original_headers, "Content-Type": body.content_type},
204+ # )
82205
83206 one_second = 1000
84207 one_minute = 1000 * 60
@@ -101,7 +224,7 @@ async def call_api_async(
101224 ]
102225
103226 async def do_request ():
104- return await client .send (new_request )
227+ return await client .send (pdf_chunk_request )
105228
106229 async with limiter :
107230 try :
@@ -110,6 +233,9 @@ async def do_request():
110233 Retries (retry_config , retryable_codes )
111234 )
112235 return response
236+ except Exception as e :
237+ print (e )
238+ raise e
113239 finally :
114240 if not pdf_chunk_file .closed :
115241 pdf_chunk_file .close ()
0 commit comments