@@ -317,6 +317,8 @@ def before_request(
317317 fallback_value = DEFAULT_CACHE_TMP_DATA_DIR ,
318318 )
319319
320+ pdf = self ._clean_large_pages (pdf )
321+
320322 page_range_start , page_range_end = form_utils .get_page_range (
321323 form_data ,
322324 key = PARTITION_FORM_PAGE_RANGE_KEY .replace ("[]" , "" ),
@@ -423,6 +425,52 @@ async def call_api_partial(
423425
424426 return response
425427
def _clean_large_pages(self, pdf: PdfReader) -> PdfReader:
    """Split pages that are taller than the maximum page length.

    Every page whose media box is taller than ``max_page_length`` is
    replaced by several copies of itself, each with its media box narrowed
    to one consecutive vertical slice (topmost slice first). Pages within
    the limit are copied unchanged, and page order is preserved.

    Args:
        pdf: Reader for the document to check.

    Returns:
        ``pdf`` itself when no page exceeds the limit; otherwise a new
        ``PdfReader`` over an in-memory copy that contains the sliced pages.
    """
    max_page_length = 4000

    # Early exit if all pages are within the max page length. The comparison
    # is strict so it agrees with the split loop below: a page that is
    # exactly ``max_page_length`` tall is never split, so it must not force
    # a rewrite of the whole document either.
    if not any(page.mediabox.height > max_page_length for page in pdf.pages):
        return pdf

    writer = PdfWriter()
    # Index of a page in ``writer`` -> (top, bottom) of the slice it shows.
    slice_bounds: dict[int, tuple[float, float]] = {}
    page_count = 0

    for page in pdf.pages:
        page_height = page.mediabox.height
        if page_height <= max_page_length:
            writer.add_page(page)
            page_count += 1
            continue

        # Slice in the page's own coordinate system: the media box does not
        # have to start at y == 0, so walk down from its real top edge and
        # never go below its real bottom edge.
        page_bottom = float(page.mediabox.bottom)
        slice_top = float(page.mediabox.top)
        for _ in range(math.ceil(page_height / max_page_length)):
            # Clamp so the last slice ends at the page's bottom edge instead
            # of extending the media box below the original page.
            slice_bottom = max(slice_top - max_page_length, page_bottom)
            slice_bounds[page_count] = (slice_top, slice_bottom)
            writer.add_page(page)
            page_count += 1
            slice_top = slice_bottom

    # Narrow the media boxes only after every page has been added, working
    # on the writer's own page objects rather than on the reader's pages.
    # NOTE(review): only the media box is adjusted; pages carrying an
    # explicit crop box may need the same treatment - confirm.
    for index, page in enumerate(writer.pages):
        bounds = slice_bounds.get(index)
        if bounds is None:
            continue
        page.mediabox.top = bounds[0]
        page.mediabox.bottom = bounds[1]

    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return PdfReader(buffer)
473+
426474 def _get_pdf_chunks_in_memory (
427475 self ,
428476 pdf : PdfReader ,
0 commit comments