tests/test_textextract.py:test_extendable_textpage(): improved code and comments.

julian-smith-artifex-com · julian-smith-artifex-com · commit 20bc5a21fb95 · 2025-08-18T19:08:06.000+01:00
diff --git a/tests/test_textextract.py b/tests/test_textextract.py
@@ -469,6 +469,11 @@ def test_4139():
 
 def test_4245():
     path = os.path.normpath(f'{__file__}/../../tests/resources/test_4245.pdf')
+    with pymupdf.open(path) as document:
+        page = document[0]
+        regions = page.search_for('Bart Simpson')
+        print(f'{regions=}')
+        page.add_highlight_annot(regions)
     with pymupdf.open(path) as document:
         page = document[0]
         regions = page.search_for('Bart Simpson')
@@ -649,50 +654,70 @@ def test_extendable_textpage():
     # 2025-01-28:
     #
     # We can create a pdf with two pages whose text is adjacent when stitched
-    # together vertically.
+    # together vertically:
     #
-    # We can append page to stext_page ok.
+    # Page 1:
+    # 
+    #     aaaa
+    #    
+    #     bbbb
+    #     cccc
+    #     
+    #     dddd
+    #     
+    # Page 2:
+    #     
+    #     eeee
+    #     
+    #     ffff
+    #     gggg
+    #     
+    #     hhhh
     #
-    # Extracted spans are adjacent vertically as hoped.
     #
-    # But... We always get a separate block for each page, even though the y
-    # coordinates are adjacent and so we would expect stext_page to return a
-    # single block. This is all with `sort=True`.
+    # Create a textpage for both of these pages. Then when extracting text,
+    # we need to get (specifically the `dddd` and `eeee` sequences need to be
+    # treated as the same block):
     #
-    # Maybe sort=true doesn't ever join adjacent blocks??
+    #     aaaa
+    #    
+    #     bbbb
+    #     cccc
+    #     
+    #     dddd
+    #     eeee
+    #     
+    #     ffff
+    #     gggg
+    #     
+    #     hhhh
     #
     print()
     
     path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
     with pymupdf.open(filetype='pdf') as document:
         document.new_page()
         document.new_page()
-        document.save(path)
-
-    # Create document with two pages and text where a paragraph spans the two
-    # pages.
-    #
-    with pymupdf.open(path) as document:
         page0 = document[0]
         page1 = document[1]
         y = 100
+        line_height = 9.6
         for i in range(4):
-            page0.insert_text((100, y+9.6), 'abcd'[i] * 16)
-            page1.insert_text((100, y+9.6), 'efgh'[i] * 16)
-            y += 9.6
+            page0.insert_text((100, y+line_height), 'abcd'[i] * 16)
+            page1.insert_text((100, y+line_height), 'efgh'[i] * 16)
+            y += line_height
             if i%2 == 0:
-                y += 9.6*1
-        rect = (100, 100, 200, y)
-        rect2 = pymupdf.mupdf.FzRect(*rect)
-        document[0].draw_rect((100, 100, 200, y), (1, 0, 0))
-        document[1].draw_rect((100, 100, 200, y), (1, 0, 0))
-        path2 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage2.pdf')
-        document.save(path2)
+                y += line_height
+        rect = pymupdf.mupdf.FzRect(100, 100, 200, y)
+        document[0].draw_rect(rect, (1, 0, 0))
+        document[1].draw_rect(rect, (1, 0, 0))
+        document.save(path)
     
-    # Create a stext page for both pages of our document, using direct calls to
-    # MuPDF for now.
+    # Create a stext page for the text regions in both pages of our document,
+    # using direct calls to MuPDF.
+    #
     
-    with pymupdf.Document(path2) as document:
+    with pymupdf.Document(path) as document:
     
         # Notes:
         #
@@ -701,9 +726,9 @@ def test_extendable_textpage():
         # a new block, because pen position for new device is (0, 0) and this
         # will usually be treated as a paragraph gap to the first text.
         #
-        # At the moment we use infinite mediabox when using
-        # fz_new_stext_page()'s to create the stext device. I don't know what a
-        # non-infinite mediabox would be useful for.
+        # At the moment we use infinite mediabox when creating the
+        # fz_stext_page. I don't know what a non-infinite mediabox would be
+        # useful for.
         #
         # FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
         # to modify it to be in stext pagae coordinates (i.e. adding ctm.f
@@ -713,31 +738,40 @@ def test_extendable_textpage():
         # include each page's entire contents.
         #
         
-        ctm = pymupdf.mupdf.FzMatrix()
+        # We use our knowledge of the text rect in each page to manipulate ctm
+        # so that the stext contains text starting at (0, 0) and extending
+        # downwards.
+        #
+        y = 0
         cookie = pymupdf.mupdf.FzCookie()
         
         stext_page = pymupdf.mupdf.FzStextPage(
                 pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE),  # mediabox
                 )
         stext_options = pymupdf.mupdf.FzStextOptions()
         #stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
-        #stext_options.clip = rect2.internal()
+        #stext_options.clip = rect.internal()
         device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)
         
-        # Append second page to stext_page and prepare ctm for any later page.
+        # Add first page to stext_page at (0, y), and update <y> for the next
+        # page.
         page = document[0]
+        ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
         pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
-        ctm.f += rect2.y1 - rect2.y0
+        y += rect.y1 - rect.y0
         
-        # Append second page to stext_page and prepare for any later page.
+        # Add second page to stext_page at (0, y), and update <y> for the next
+        # page.
         page = document[1]
+        ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
         pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
-        ctm.f += rect2.y1 - rect2.y0
+        y += rect.y1 - rect.y0
         
         # We've finished adding text to stext_page.
         pymupdf.mupdf.fz_close_device(device)
         
-        # Read text from stext_page.
+        # Create a pymupdf.TextPage() for <stext_page> so we can use
+        # text_page.extractDICT() etc.
         text_page = pymupdf.TextPage(stext_page)
         
         # Read text from stext_page using text_page.extractDICT().
@@ -748,30 +782,32 @@ def test_extendable_textpage():
         pno = 0
         ydelta = 0
         for block in d['blocks']:
-            print(f'block')
+            print(f'block {block["bbox"]=}')
             for line in block['lines']:
-                print(f'    line')
+                print(f'    line {line["bbox"]=}')
                 for span in line['spans']:
-                    print(f'        span')
+                    print(f'        span {span["bbox"]=}')
                     bbox = span['bbox']
                     x0, y0, x1, y1 = bbox
                     dy = y0 - y0_prev if y0_prev else 0
                     y0_prev = y0
                     print(f'                {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
                     if 'eee' in span['text']:
                         pno = 1
-                        ydelta = rect2.y1 - rect2.y0
+                        ydelta = rect.y1 - rect.y0
                     y0 -= ydelta
                     y1 -= ydelta
+                    # Debugging - add green lines on original document
+                    # translating final blocks info into original coors.
                     document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))
         
-        print('\n\n\n\n')
+        print('\n\n')
         
         print(f'Using text_page.extractText()')
         text = text_page.extractText(True)
         print(f'{text}')
         
-        print('\n\n\n\n')
+        print('\n\n')
         print(f'Using extractBLOCKS')
         text = list()
         for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
@@ -780,7 +816,7 @@ def test_extendable_textpage():
             print(f'    {line=}')
             text.append(line)
         
-        print("\n\n\n")
+        print("\n\n")
         print(f'extractBLOCKS joined by newlines:')
         print('\n'.join(text))