@@ -222,39 +222,32 @@ def before_request(
222222 if split_pdf_page is None or split_pdf_page == "false" :
223223 return request
224224
225- logger .info ("Preparing to split document for partition." )
226225 file = form_data .get (PARTITION_FORM_FILES_KEY )
227226 if (
228227 file is None
229228 or not isinstance (file , shared .Files )
230229 or not pdf_utils .is_pdf (file )
231230 ):
232- logger .info ("Partitioning without split." )
233231 return request
234232
235233 starting_page_number = form_utils .get_starting_page_number (
236234 form_data ,
237235 key = PARTITION_FORM_STARTING_PAGE_NUMBER_KEY ,
238236 fallback_value = DEFAULT_STARTING_PAGE_NUMBER ,
239237 )
240- if starting_page_number > 1 :
241- logger .info ("Starting page number set to %d" , starting_page_number )
242- logger .info ("Starting page number set to %d" , starting_page_number )
243238
244239 self .allow_failed = form_utils .get_split_pdf_allow_failed_param (
245240 form_data ,
246241 key = PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY ,
247242 fallback_value = DEFAULT_ALLOW_FAILED ,
248243 )
249- logger .info ("Allow failed set to %d" , self .allow_failed )
250244
251245 concurrency_level = form_utils .get_split_pdf_concurrency_level_param (
252246 form_data ,
253247 key = PARTITION_FORM_CONCURRENCY_LEVEL_KEY ,
254248 fallback_value = DEFAULT_CONCURRENCY_LEVEL ,
255249 max_allowed = MAX_CONCURRENCY_LEVEL ,
256250 )
257- logger .info ("Concurrency level set to %d" , concurrency_level )
258251 limiter = asyncio .Semaphore (concurrency_level )
259252
260253 content = cast (bytes , file .content )
@@ -267,40 +260,17 @@ def before_request(
267260 )
268261
269262 page_count = page_range_end - page_range_start + 1
270- logger .info (
271- "Splitting pages %d to %d (%d total)" ,
272- page_range_start ,
273- page_range_end ,
274- page_count ,
275- )
276263
277264 split_size = get_optimal_split_size (
278265 num_pages = page_count , concurrency_level = concurrency_level
279266 )
280- logger .info ("Determined optimal split size of %d pages." , split_size )
281267
282268 # If the doc is small enough, and we aren't slicing it with a page range:
283269 # do not split, just continue with the original request
284270 if split_size >= page_count and page_count == len (pdf .pages ):
285- logger .info (
286- "Document has too few pages (%d) to be split efficiently. Partitioning without split." ,
287- page_count ,
288- )
289271 return request
290272
291273 pages = pdf_utils .get_pdf_pages (pdf , split_size = split_size , page_start = page_range_start , page_end = page_range_end )
292- logger .info (
293- "Partitioning %d files with %d page(s) each." ,
294- math .floor (page_count / split_size ),
295- split_size ,
296- )
297-
298- # Log the remainder pages if there are any
299- if page_count % split_size > 0 :
300- logger .info (
301- "Partitioning 1 file with %d page(s)." ,
302- page_count % split_size ,
303- )
304274
305275 # Use a variable to adjust the httpx client timeout, or default to 30 minutes
306276 # When we're able to reuse the SDK to make these calls, we can remove this var
@@ -326,14 +296,8 @@ async def call_api_partial(page):
326296
327297 self .coroutines_to_execute [operation_id ] = []
328298 set_index = 1
329- for page_content , page_index , all_pages_number in pages :
299+ for page_content , page_index in pages :
330300 page_number = page_index + starting_page_number
331- logger .info (
332- "Partitioning set #%d (pages %d-%d)." ,
333- set_index ,
334- page_number ,
335- min (page_number + split_size - 1 , all_pages_number ),
336- )
337301
338302 coroutine = call_api_partial ((page_content , page_number ))
339303 self .coroutines_to_execute [operation_id ].append (coroutine )
0 commit comments