@@ -469,6 +469,11 @@ def test_4139():
469469
470470def test_4245 ():
471471 path = os .path .normpath (f'{ __file__ } /../../tests/resources/test_4245.pdf' )
472+ with pymupdf .open (path ) as document :
473+ page = document [0 ]
474+ regions = page .search_for ('Bart Simpson' )
475+ print (f'{ regions = } ' )
476+ page .add_highlight_annot (regions )
472477 with pymupdf .open (path ) as document :
473478 page = document [0 ]
474479 regions = page .search_for ('Bart Simpson' )
@@ -649,50 +654,70 @@ def test_extendable_textpage():
649654 # 2025-01-28:
650655 #
651656 # We can create a pdf with two pages whose text is adjacent when stitched
652- # together vertically.
657+ # together vertically:
653658 #
654- # We can append page to stext_page ok.
659+ # Page 1:
660+ #
661+ # aaaa
662+ #
663+ # bbbb
664+ # cccc
665+ #
666+ # dddd
667+ #
668+ # Page 2:
669+ #
670+ # eeee
671+ #
672+ # ffff
673+ # gggg
674+ #
675+ # hhhh
655676 #
656- # Extracted spans are adjacent vertically as hoped.
657677 #
658- # But... We always get a separate block for each page, even though the y
659- # coordinates are adjacent and so we would expect stext_page to return a
660- # single block. This is all with `sort=True`.
678+ # Create a textpage for both of these pages. Then, when extracting text,
679+ # we need to get the following (specifically, the `dddd` and `eeee`
680+ # sequences must be treated as the same block):
661681 #
662- # Maybe sort=true doesn't ever join adjacent blocks??
682+ # aaaa
683+ #
684+ # bbbb
685+ # cccc
686+ #
687+ # dddd
688+ # eeee
689+ #
690+ # ffff
691+ # gggg
692+ #
693+ # hhhh
663694 #
664695 print ()
665696
666697 path = os .path .normpath (f'{ __file__ } /../../tests/test_extendable_textpage.pdf' )
667698 with pymupdf .open (filetype = 'pdf' ) as document :
668699 document .new_page ()
669700 document .new_page ()
670- document .save (path )
671-
672- # Create document with two pages and text where a paragraph spans the two
673- # pages.
674- #
675- with pymupdf .open (path ) as document :
676701 page0 = document [0 ]
677702 page1 = document [1 ]
678703 y = 100
704+ line_height = 9.6
679705 for i in range (4 ):
680- page0 .insert_text ((100 , y + 9.6 ), 'abcd' [i ] * 16 )
681- page1 .insert_text ((100 , y + 9.6 ), 'efgh' [i ] * 16 )
682- y += 9.6
706+ page0 .insert_text ((100 , y + line_height ), 'abcd' [i ] * 16 )
707+ page1 .insert_text ((100 , y + line_height ), 'efgh' [i ] * 16 )
708+ y += line_height
683709 if i % 2 == 0 :
684- y += 9.6 * 1
685- rect = (100 , 100 , 200 , y )
686- rect2 = pymupdf .mupdf .FzRect (* rect )
687- document [0 ].draw_rect ((100 , 100 , 200 , y ), (1 , 0 , 0 ))
688- document [1 ].draw_rect ((100 , 100 , 200 , y ), (1 , 0 , 0 ))
689- path2 = os .path .normpath (f'{ __file__ } /../../tests/test_extendable_textpage2.pdf' )
690- document .save (path2 )
710+ y += line_height
711+ rect = pymupdf .mupdf .FzRect (100 , 100 , 200 , y )
712+ document [0 ].draw_rect (rect , (1 , 0 , 0 ))
713+ document [1 ].draw_rect (rect , (1 , 0 , 0 ))
714+ document .save (path )
691715
692- # Create a stext page for both pages of our document, using direct calls to
693- # MuPDF for now.
716+ # Create a stext page for the text regions in both pages of our document,
717+ # using direct calls to MuPDF.
718+ #
694719
695- with pymupdf .Document (path2 ) as document :
720+ with pymupdf .Document (path ) as document :
696721
697722 # Notes:
698723 #
@@ -701,9 +726,9 @@ def test_extendable_textpage():
701726 # a new block, because pen position for new device is (0, 0) and this
702727 # will usually be treated as a paragraph gap to the first text.
703728 #
704- # At the moment we use infinite mediabox when using
705- # fz_new_stext_page()'s to create the stext device . I don't know what a
706- # non-infinite mediabox would be useful for.
729+ # At the moment we use infinite mediabox when creating the
730+ # fz_stext_page . I don't know what a non-infinite mediabox would be
731+ # useful for.
707732 #
708733 # FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
708733 # to modify it to be in stext page coordinates (i.e. adding ctm.f
@@ -713,31 +738,40 @@ def test_extendable_textpage():
713738 # include each page's entire contents.
714739 #
715740
716- ctm = pymupdf .mupdf .FzMatrix ()
741+ # We use our knowledge of the text rect in each page to manipulate ctm
742+ # so that the stext contains text starting at (0, 0) and extending
743+ # downwards.
744+ #
745+ y = 0
717746 cookie = pymupdf .mupdf .FzCookie ()
718747
719748 stext_page = pymupdf .mupdf .FzStextPage (
720749 pymupdf .mupdf .FzRect (pymupdf .mupdf .FzRect .Fixed_INFINITE ), # mediabox
721750 )
722751 stext_options = pymupdf .mupdf .FzStextOptions ()
723752 #stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
724- #stext_options.clip = rect2 .internal()
753+ #stext_options.clip = rect .internal()
725754 device = pymupdf .mupdf .fz_new_stext_device (stext_page , stext_options )
726755
727- # Append second page to stext_page and prepare ctm for any later page.
756+ # Add first page to stext_page at (0, y), and update <y> for the next
757+ # page.
728758 page = document [0 ]
759+ ctm = pymupdf .mupdf .FzMatrix (1 , 0 , 0 , 1 , - rect .x0 , - rect .y0 + y )
729760 pymupdf .mupdf .fz_run_page (page .this , device , ctm , cookie )
730- ctm . f += rect2 .y1 - rect2 .y0
761+ y += rect .y1 - rect .y0
731762
732- # Append second page to stext_page and prepare for any later page.
763+ # Add second page to stext_page at (0, y), and update <y> for the next
764+ # page.
733765 page = document [1 ]
766+ ctm = pymupdf .mupdf .FzMatrix (1 , 0 , 0 , 1 , - rect .x0 , - rect .y0 + y )
734767 pymupdf .mupdf .fz_run_page (page .this , device , ctm , cookie )
735- ctm . f += rect2 .y1 - rect2 .y0
768+ y += rect .y1 - rect .y0
736769
737770 # We've finished adding text to stext_page.
738771 pymupdf .mupdf .fz_close_device (device )
739772
740- # Read text from stext_page.
773+ # Create a pymupdf.TextPage() for <stext_page> so we can use
774+ # text_page.extractDICT() etc.
741775 text_page = pymupdf .TextPage (stext_page )
742776
743777 # Read text from stext_page using text_page.extractDICT().
@@ -748,30 +782,32 @@ def test_extendable_textpage():
748782 pno = 0
749783 ydelta = 0
750784 for block in d ['blocks' ]:
751- print (f'block' )
785+ print (f'block { block [ "bbox" ] = } ' )
752786 for line in block ['lines' ]:
753- print (f' line' )
787+ print (f' line { line [ "bbox" ] = } ' )
754788 for span in line ['spans' ]:
755- print (f' span' )
789+ print (f' span { span [ "bbox" ] = } ' )
756790 bbox = span ['bbox' ]
757791 x0 , y0 , x1 , y1 = bbox
758792 dy = y0 - y0_prev if y0_prev else 0
759793 y0_prev = y0
760794 print (f' { dy = : 5.2f} height={ y1 - y0 :.02f} { x0 :.02f} { y0 :.02f} { x1 :.02f} { y1 :.02f} { span ["text" ]= } ' )
761795 if 'eee' in span ['text' ]:
762796 pno = 1
763- ydelta = rect2 .y1 - rect2 .y0
797+ ydelta = rect .y1 - rect .y0
764798 y0 -= ydelta
765799 y1 -= ydelta
800+ # Debugging - add green lines on original document
801+ # translating final blocks info into original coordinates.
766802 document [pno ].draw_rect ((x0 , y0 , x1 , y1 ), (0 , 1 , 0 ))
767803
768- print ('\n \n \n \n ' )
804+ print ('\n \n ' )
769805
770806 print (f'Using text_page.extractText()' )
771807 text = text_page .extractText (True )
772808 print (f'{ text } ' )
773809
774- print ('\n \n \n \n ' )
810+ print ('\n \n ' )
775811 print (f'Using extractBLOCKS' )
776812 text = list ()
777813 for x0 , y0 , x1 , y1 , line , no , type_ in text_page .extractBLOCKS ():
@@ -780,7 +816,7 @@ def test_extendable_textpage():
780816 print (f' { line = } ' )
781817 text .append (line )
782818
783- print ("\n \n \n " )
819+ print ("\n \n " )
784820 print (f'extractBLOCKS joined by newlines:' )
785821 print ('\n ' .join (text ))
786822
0 commit comments