11from __future__ import annotations
22
33import asyncio
4- import copy
4+ import io
55import json
66import logging
77from typing import Tuple , Any , BinaryIO
88
99import httpx
10- from httpx ._multipart import MultipartStream
10+ from httpx ._multipart import DataField , FileField
1111from requests_toolbelt .multipart .encoder import MultipartEncoder # type: ignore
1212
1313from unstructured_client ._hooks .custom .common import UNSTRUCTURED_CLIENT_LOGGER_NAME
2424
2525logger = logging .getLogger (UNSTRUCTURED_CLIENT_LOGGER_NAME )
2626
27-
28- def create_pdf_request_body (
29- form_data : FormData ,
30- pdf_chunk : BinaryIO ,
31- filename : str ,
32- page_number : int
33- ) -> MultipartEncoder :
34- """Creates the request body for the partition API."
35-
36- Args:
37- form_data: The form data.
38- pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
39- filename: The filename.
40- page_number: The page number.
41-
42- """
43- payload = prepare_request_payload (form_data )
44-
45- payload_fields : list [tuple [str , Any ]] = []
46- for key , value in payload .items ():
47- if isinstance (value , list ):
48- payload_fields .extend ([(key , list_value ) for list_value in value ])
49- else :
50- payload_fields .append ((key , value ))
51-
52- payload_fields .append ((PARTITION_FORM_FILES_KEY , (
53- filename ,
54- pdf_chunk ,
55- "application/pdf" ,
56- )))
57-
58- return MultipartEncoder (fields = payload_fields )
59-
60-
61- def create_pdf_request_data (
62- form_data : FormData ,
63- pdf_chunk : BinaryIO ,
64- filename : str ,
65- page_number : int
66- ) -> dict [str , Any ]:
67- """Creates the request body for the partition API."
68-
69- Args:
70- form_data: The form data.
71- pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
72- filename: The filename.
73- page_number: The page number.
74-
75- """
76- payload = prepare_request_payload (form_data )
77- payload [PARTITION_FORM_STARTING_PAGE_NUMBER_KEY ] = str (page_number )
78-
79- # payload[PARTITION_FORM_FILES_KEY] = (
80- # filename,
81- # pdf_chunk,
82- # "application/pdf",
83- # )
84- #
85- # return MultipartEncoder(fields=payload_fields)
86- return payload
87-
88- def prepare_pdf_chunk_request_payload (form_data : FormData ) -> FormData :
89- """Prepares the request payload by removing unnecessary keys and updating the file.
def get_multipart_stream_fields(request: httpx.Request) -> dict[str, Any]:
    """Extracts the multipart fields from the request.

    Data fields whose name contains the ``[]`` marker are treated as
    repeated (list) parameters: their values are accumulated, in order,
    into a list stored under the name with the marker removed. All other
    data fields are stored as a single value under their own name. File
    fields are stored as a dict describing the upload.

    Args:
        request: The request object.

    Returns:
        The multipart fields, keyed by field name. An empty dict is
        returned when the request's Content-Type is not multipart, or when
        the request stream is missing or does not expose a ``fields``
        attribute.
    """
    content_type = request.headers.get("Content-Type", "")
    if "multipart" not in content_type:
        return {}
    if request.stream is None or not hasattr(request.stream, "fields"):
        return {}
    fields = request.stream.fields

    mapped_fields: dict[str, Any] = {}
    for field in fields:
        if isinstance(field, DataField):
            if "[]" in field.name:
                # Repeated parameter: collect every occurrence under the
                # bare name instead of keeping only the last value.
                name = field.name.replace("[]", "")
                mapped_fields.setdefault(name, []).append(field.value)
            else:
                # Scalar parameter. Kept in an ``else`` so list-style
                # fields are not also written back under the raw
                # ``name[]`` key as a stray scalar.
                mapped_fields[field.name] = field.value
        elif isinstance(field, FileField):
            mapped_fields[field.name] = {
                "filename": field.filename,
                "content_type": field.headers.get("Content-Type", ""),
                "file": field.file,
            }
    return mapped_fields
59+
60+ def create_pdf_chunk_request_params (
10961 form_data : FormData ,
11062 page_number : int
11163) -> dict [str , Any ]:
11264 """Creates the request body for the partition API."
11365
11466 Args:
11567 form_data: The form data.
116- pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
117- filename: The filename.
11868 page_number: The page number.
11969
70+ Returns:
71+ The updated request payload for the chunk.
12072 """
12173 fields_to_drop = [
12274 PARTITION_FORM_SPLIT_PDF_PAGE_KEY ,
@@ -140,15 +92,16 @@ def create_pdf_chunk_request(
14092
14193 Args:
14294 form_data: The form data.
143- pdf_chunk: The pdf chunk - can be both io.BytesIO or a file object (created with open())
95+ pdf_chunk: Tuple of pdf chunk contents (can be both io.BytesIO or
96+ a file object created with e.g. open()) and the page number.
14497 original_request: The original request.
14598 filename: The filename.
14699
147100 Returns:
148101 The updated request object.
149102 """
150103 pdf_chunk_file , page_number = pdf_chunk
151- data = create_pdf_chunk_request_data (form_data , page_number )
104+ data = create_pdf_chunk_request_params (form_data , page_number )
152105 original_headers = prepare_request_headers (original_request .headers )
153106
154107 pdf_chunk_partition_params = shared .PartitionParameters (
@@ -166,23 +119,14 @@ def create_pdf_chunk_request(
166119 "multipart" ,
167120 shared .PartitionParameters ,
168121 )
169- # chunk_request = client.build_request(
170- # method="POST",
171- # url=original_request.url or "",
172- # headers={**original_headers, "Content-Type": "application/pdf"},
173- # files={PARTITION_FORM_FILES_KEY: (filename, pdf_chunk_file, "application/pdf")},
174- # data=data,
175- # )
176- pdf_chunk_request = httpx .Request (
122+ return httpx .Request (
177123 method = "POST" ,
178124 url = original_request .url or "" ,
179- # headers={**original_headers, "Content-Type": serialized_body.media_type},
180125 headers = {** original_headers },
181126 content = serialized_body .content ,
182127 data = serialized_body .data ,
183128 files = serialized_body .files ,
184129 )
185- return pdf_chunk_request
186130
187131
188132
@@ -192,17 +136,6 @@ async def call_api_async(
192136 pdf_chunk_file : BinaryIO ,
193137 limiter : asyncio .Semaphore ,
194138) -> httpx .Response :
195- # pdf_chunk_file, page_number = pdf_chunk
196- # body = create_pdf_request_body(form_data, pdf_chunk_file, filename, page_number)
197- # original_headers = prepare_request_headers(original_request.headers)
198-
199- # new_request = httpx.Request(
200- # method="POST",
201- # url=original_request.url or "",
202- # content=body.to_string(),
203- # headers={**original_headers, "Content-Type": body.content_type},
204- # )
205-
206139 one_second = 1000
207140 one_minute = 1000 * 60
208141
@@ -237,7 +170,7 @@ async def do_request():
237170 print (e )
238171 raise e
239172 finally :
240- if not pdf_chunk_file .closed :
173+ if not isinstance ( pdf_chunk_file , io . BytesIO ) and not pdf_chunk_file .closed :
241174 pdf_chunk_file .close ()
242175
243176
@@ -257,29 +190,6 @@ def prepare_request_headers(
257190 new_headers .pop ("Content-Length" , None )
258191 return new_headers
259192
260-
261- def prepare_request_payload (form_data : FormData ) -> FormData :
262- """Prepares the request payload by removing unnecessary keys and updating the file.
263-
264- Args:
265- form_data: The original form data.
266-
267- Returns:
268- The updated request payload.
269- """
270- payload = copy .deepcopy (form_data )
271- payload .pop (PARTITION_FORM_SPLIT_PDF_PAGE_KEY , None )
272- payload .pop (PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY , None )
273- payload .pop (PARTITION_FORM_FILES_KEY , None )
274- payload .pop (PARTITION_FORM_PAGE_RANGE_KEY , None )
275- payload .pop (PARTITION_FORM_STARTING_PAGE_NUMBER_KEY , None )
276- updated_parameters = {
277- PARTITION_FORM_SPLIT_PDF_PAGE_KEY : "false" ,
278- }
279- payload .update (updated_parameters )
280- return payload
281-
282-
283193def create_response (elements : list ) -> httpx .Response :
284194 """
285195 Creates a modified response object with updated content.
0 commit comments