@@ -146,7 +146,6 @@ def sdk_init(
146146 Returns:
147147 Tuple[str, HttpClient]: The initialized SDK options.
148148 """
149-
150149 class DummyTransport (httpx .BaseTransport ):
151150 def __init__ (self , base_transport : httpx .BaseTransport ):
152151 self .base_transport = base_transport
@@ -238,39 +237,32 @@ def before_request(
238237 if split_pdf_page is None or split_pdf_page == "false" :
239238 return request
240239
241- logger .info ("Preparing to split document for partition." )
242240 file = form_data .get (PARTITION_FORM_FILES_KEY )
243241 if (
244242 file is None
245243 or not isinstance (file , shared .Files )
246244 or not pdf_utils .is_pdf (file )
247245 ):
248- logger .info ("Partitioning without split." )
249246 return request
250247
251248 starting_page_number = form_utils .get_starting_page_number (
252249 form_data ,
253250 key = PARTITION_FORM_STARTING_PAGE_NUMBER_KEY ,
254251 fallback_value = DEFAULT_STARTING_PAGE_NUMBER ,
255252 )
256- if starting_page_number > 1 :
257- logger .info ("Starting page number set to %d" , starting_page_number )
258- logger .info ("Starting page number set to %d" , starting_page_number )
259253
260254 self .allow_failed = form_utils .get_split_pdf_allow_failed_param (
261255 form_data ,
262256 key = PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY ,
263257 fallback_value = DEFAULT_ALLOW_FAILED ,
264258 )
265- logger .info ("Allow failed set to %d" , self .allow_failed )
266259
267260 concurrency_level = form_utils .get_split_pdf_concurrency_level_param (
268261 form_data ,
269262 key = PARTITION_FORM_CONCURRENCY_LEVEL_KEY ,
270263 fallback_value = DEFAULT_CONCURRENCY_LEVEL ,
271264 max_allowed = MAX_CONCURRENCY_LEVEL ,
272265 )
273- logger .info ("Concurrency level set to %d" , concurrency_level )
274266 limiter = asyncio .Semaphore (concurrency_level )
275267
276268 content = cast (bytes , file .content )
@@ -283,25 +275,14 @@ def before_request(
283275 )
284276
285277 page_count = page_range_end - page_range_start + 1
286- logger .info (
287- "Splitting pages %d to %d (%d total)" ,
288- page_range_start ,
289- page_range_end ,
290- page_count ,
291- )
292278
293279 split_size = get_optimal_split_size (
294280 num_pages = page_count , concurrency_level = concurrency_level
295281 )
296- logger .info ("Determined optimal split size of %d pages." , split_size )
297282
298283 # If the doc is small enough, and we aren't slicing it with a page range:
299284 # do not split, just continue with the original request
300285 if split_size >= page_count and page_count == len (pdf .pages ):
301- logger .info (
302- "Document has too few pages (%d) to be split efficiently. Partitioning without split." ,
303- page_count ,
304- )
305286 return request
306287
307288 pages = self ._get_pdf_pages (
@@ -329,7 +310,7 @@ def before_request(
329310 # Use a variable to adjust the httpx client timeout, or default to 60 minutes
330311 # When we're able to reuse the SDK to make these calls, we can remove this var
331312 # The SDK timeout will be controlled by parameter
332- client_timeout_minutes = 30
313+ client_timeout_minutes = 60
333314 if timeout_var := os .getenv ("UNSTRUCTURED_CLIENT_TIMEOUT_MINUTES" ):
334315 client_timeout_minutes = int (timeout_var )
335316
@@ -365,14 +346,8 @@ async def call_api_partial(pdf_chunk: Tuple[BinaryIO, int]):
365346
366347 self .coroutines_to_execute [operation_id ] = []
367348 set_index = 1
368- for pdf_chunk_file , page_index , all_pages_number in pages :
349+ for pdf_chunk_file , page_index in pages :
369350 page_number = page_index + starting_page_number
370- logger .info (
371- "Partitioning set #%d (pages %d-%d)." ,
372- set_index ,
373- page_number ,
374- min (page_number + split_size - 1 , all_pages_number ),
375- )
376351
377352 coroutine = call_api_partial ((pdf_chunk_file , page_number ))
378353 self .coroutines_to_execute [operation_id ].append (coroutine )
0 commit comments