@@ -294,18 +294,6 @@ def before_request(
294294 )
295295 # force free PDF object memory
296296 del pdf
297- logger .info (
298- "Partitioning %d files with %d page(s) each." ,
299- math .floor (page_count / split_size ),
300- split_size ,
301- )
302-
303- # Log the remainder pages if there are any
304- if page_count % split_size > 0 :
305- logger .info (
306- "Partitioning 1 file with %d page(s)." ,
307- page_count % split_size ,
308- )
309297
310298 # Use a variable to adjust the httpx client timeout, or default to 30 minutes
311299 # When we're able to reuse the SDK to make these calls, we can remove this var
@@ -374,9 +362,10 @@ def _get_pdf_pages(
374362 split_size : int = 1 ,
375363 page_start : int = 1 ,
376364 page_end : Optional [int ] = None
377- ) -> Generator [Tuple [io .BytesIO , int ], None , None ]:
378- """Reads given bytes of a pdf file and split it into n file-like objects, each
379- with `split_size` pages.
365+ ) -> Generator [Tuple [BinaryIO , int ], None , None ]:
366+ """Reads given bytes of a pdf file and splits it into n pdf-chunks, each
367+ with `split_size` pages. The chunks are written into temporary files in
368+ a temporary directory corresponding to the operation_id.
380369
381370 Args:
382371 file_content: Content of the PDF file.
@@ -387,7 +376,7 @@ def _get_pdf_pages(
387376 page_end: If provided, split up to and including this page number
388377
389378 Yields:
390- The file contents with their page number and overall pages number of the original document .
379+ The file object of each chunk together with its page offset.
391380 """
392381
393382 offset = page_start - 1
@@ -423,6 +412,7 @@ def _get_pdf_pages(
423412 except Exception : # pylint: disable=broad-except
424413 if pdf_chunk_file and not pdf_chunk_file .closed :
425414 pdf_chunk_file .close ()
415+ raise
426416 yield pdf_chunk_file , offset
427417
428418 def _await_elements (
0 commit comments