@@ -607,3 +607,157 @@ def test_4179():
607607 finally :
608608 pymupdf .TOOLS .set_aa_level (aa )
609609 pymupdf .TOOLS .unset_quad_corrections (uqc )
610+
611+
def test_extendable_textpage():
    """Check that text from two pages can be stitched into one stext page.

    The test builds a two-page PDF whose text lines are laid out so that,
    when page 1 is appended directly below the text area of page 0, the last
    line of page 0 and the first line of page 1 are vertically adjacent.
    Both pages are then run through a single MuPDF stext device (shifting
    the transformation matrix between pages), and the resulting blocks are
    checked to confirm that the lines either side of the page break end up
    in the same block.

    Three PDF files are written next to this test file as a side effect;
    the third one has the extracted span boxes drawn on it for inspection.
    """
    # Historical notes (2025-01-28), kept for context:
    #
    # We can create a pdf with two pages whose text is adjacent when stitched
    # together vertically.
    #
    # We can append page to stext_page ok.
    #
    # Extracted spans are adjacent vertically as hoped.
    #
    # But... We always get a separate block for each page, even though the y
    # coordinates are adjacent and so we would expect stext_page to return a
    # single block. This is all with `sort=True`.
    #
    # Maybe sort=True doesn't ever join adjacent blocks??
    #
    print()

    # Start from an empty two-page document saved to disk.
    path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
    with pymupdf.open(filetype='pdf') as document:
        document.new_page()
        document.new_page()
        document.save(path)

    # Create document with two pages and text where a paragraph spans the two
    # pages.
    #
    line_height = 9.6
    with pymupdf.open(path) as document:
        page0 = document[0]
        page1 = document[1]
        y = 100
        for i in range(4):
            page0.insert_text((100, y + line_height), 'abcd'[i] * 16)
            page1.insert_text((100, y + line_height), 'efgh'[i] * 16)
            y += line_height
            if i % 2 == 0:
                # Extra one-line gap after lines 0 and 2, i.e. a paragraph
                # break.
                y += line_height * 1
        # Bounding rectangle of the text area; its height is the vertical
        # offset used later when appending the second page.
        rect = (100, 100, 200, y)
        rect2 = pymupdf.mupdf.FzRect(*rect)
        page0.draw_rect(rect, (1, 0, 0))
        page1.draw_rect(rect, (1, 0, 0))
        path2 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage2.pdf')
        document.save(path2)

    # Vertical distance by which each successive page is shifted.
    page_offset = rect2.y1 - rect2.y0

    # Create a stext page for both pages of our document, using direct calls to
    # MuPDF for now.

    with pymupdf.Document(path2) as document:

        # Notes:
        #
        # We need to reuse the stext device for second page. Otherwise if we
        # create a new device, the first text in second page will always be in
        # a new block, because pen position for new device is (0, 0) and this
        # will usually be treated as a paragraph gap to the first text.
        #
        # At the moment we use infinite mediabox when using
        # fz_new_stext_page()'s to create the stext device. I don't know what a
        # non-infinite mediabox would be useful for.
        #
        # FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
        # to modify it to be in stext page coordinates (i.e. adding ctm.f
        # to y0 and y1) when we append the second page. But it's internal
        # data and there's no api to modify it. So for now we don't specify
        # FZ_STEXT_CLIP_RECT when creating the stext device, so we always
        # include each page's entire contents.
        #

        ctm = pymupdf.mupdf.FzMatrix()
        cookie = pymupdf.mupdf.FzCookie()

        stext_page = pymupdf.mupdf.FzStextPage(
            pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE),  # mediabox
        )
        stext_options = pymupdf.mupdf.FzStextOptions()
        #stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
        #stext_options.clip = rect2.internal()
        device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)

        # Append first page to stext_page and prepare ctm for any later page.
        page = document[0]
        pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
        ctm.f += page_offset

        # Append second page to stext_page and prepare for any later page.
        page = document[1]
        pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
        ctm.f += page_offset

        # We've finished adding text to stext_page.
        pymupdf.mupdf.fz_close_device(device)

        # Read text from stext_page.
        text_page = pymupdf.TextPage(stext_page)

        # Read text from stext_page using text_page.extractDICT().
        print('Using text_page.extractDICT().')
        print(f'{text_page.this.m_internal.mediabox=}')
        d = text_page.extractDICT(sort=True)
        y0_prev = None
        pno = 0
        ydelta = 0
        for block in d['blocks']:
            print('block')
            for line in block['lines']:
                print('    line')
                for span in line['spans']:
                    print('        span')
                    x0, y0, x1, y1 = span['bbox']
                    # Compare against None explicitly so that a previous y0
                    # of 0.0 is not mistaken for "no previous span".
                    dy = y0 - y0_prev if y0_prev is not None else 0
                    y0_prev = y0
                    print(f'            {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
                    if 'eee' in span['text']:
                        # First span that came from the second page: from
                        # here on, map stitched coordinates back to page 1.
                        pno = 1
                        ydelta = page_offset
                    y0 -= ydelta
                    y1 -= ydelta
                    document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))

        print('\n\n\n\n')

        print('Using text_page.extractText()')
        plain_text = text_page.extractText(True)
        print(f'{plain_text}')

        print('\n\n\n\n')
        print('Using extractBLOCKS')
        block_texts = []
        # The names `no` and `line` appear in the debug output via `{no=}` and
        # `{line=}`, so they are part of what gets printed.
        for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
            print('block:')
            print(f'    bbox={x0, y0, x1, y1} {no=}')
            print(f'    {line=}')
            block_texts.append(line)

        print("\n\n\n")
        print('extractBLOCKS joined by newlines:')
        print('\n'.join(block_texts))

        # This checks that lines before/after pages break are treated as a
        # single paragraph.
        assert block_texts == [
            'aaaaaaaaaaaaaaaa\n',
            'bbbbbbbbbbbbbbbb\ncccccccccccccccc\n',
            'dddddddddddddddd\neeeeeeeeeeeeeeee\n',
            'ffffffffffffffff\ngggggggggggggggg\n',
            'hhhhhhhhhhhhhhhh\n',
        ]

        # Save a copy with the extracted span boxes drawn on it.
        path3 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage3.pdf')
        document.save(path3)
0 commit comments