Skip to content

Commit 20bc5a2

Browse files
tests/test_textextract.py:test_extendable_textpage(): improved code and comments.
1 parent 795b9f5 commit 20bc5a2

File tree

1 file changed

+79
-43
lines changed

1 file changed

+79
-43
lines changed

tests/test_textextract.py

Lines changed: 79 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,11 @@ def test_4139():
469469

470470
def test_4245():
471471
path = os.path.normpath(f'{__file__}/../../tests/resources/test_4245.pdf')
472+
with pymupdf.open(path) as document:
473+
page = document[0]
474+
regions = page.search_for('Bart Simpson')
475+
print(f'{regions=}')
476+
page.add_highlight_annot(regions)
472477
with pymupdf.open(path) as document:
473478
page = document[0]
474479
regions = page.search_for('Bart Simpson')
@@ -649,50 +654,70 @@ def test_extendable_textpage():
649654
# 2025-01-28:
650655
#
651656
# We can create a pdf with two pages whose text is adjacent when stitched
652-
# together vertically.
657+
# together vertically:
653658
#
654-
# We can append page to stext_page ok.
659+
# Page 1:
660+
#
661+
# aaaa
662+
#
663+
# bbbb
664+
# cccc
665+
#
666+
# dddd
667+
#
668+
# Page 2:
669+
#
670+
# eeee
671+
#
672+
# ffff
673+
# gggg
674+
#
675+
# hhhh
655676
#
656-
# Extracted spans are adjacent vertically as hoped.
657677
#
658-
# But... We always get a separate block for each page, even though the y
659-
# coordinates are adjacent and so we would expect stext_page to return a
660-
# single block. This is all with `sort=True`.
678+
# Create a textpage for both of these pages. Then when extracting text,
679+
# we need to get (specifically the `dddd` and `eeee` sequences need to be
680+
# treated as the same block):
661681
#
662-
# Maybe sort=true doesn't ever join adjacent blocks??
682+
# aaaa
683+
#
684+
# bbbb
685+
# cccc
686+
#
687+
# dddd
688+
# eeee
689+
#
690+
# ffff
691+
# gggg
692+
#
693+
# hhhh
663694
#
664695
print()
665696

666697
path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
667698
with pymupdf.open(filetype='pdf') as document:
668699
document.new_page()
669700
document.new_page()
670-
document.save(path)
671-
672-
# Create document with two pages and text where a paragraph spans the two
673-
# pages.
674-
#
675-
with pymupdf.open(path) as document:
676701
page0 = document[0]
677702
page1 = document[1]
678703
y = 100
704+
line_height = 9.6
679705
for i in range(4):
680-
page0.insert_text((100, y+9.6), 'abcd'[i] * 16)
681-
page1.insert_text((100, y+9.6), 'efgh'[i] * 16)
682-
y += 9.6
706+
page0.insert_text((100, y+line_height), 'abcd'[i] * 16)
707+
page1.insert_text((100, y+line_height), 'efgh'[i] * 16)
708+
y += line_height
683709
if i%2 == 0:
684-
y += 9.6*1
685-
rect = (100, 100, 200, y)
686-
rect2 = pymupdf.mupdf.FzRect(*rect)
687-
document[0].draw_rect((100, 100, 200, y), (1, 0, 0))
688-
document[1].draw_rect((100, 100, 200, y), (1, 0, 0))
689-
path2 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage2.pdf')
690-
document.save(path2)
710+
y += line_height
711+
rect = pymupdf.mupdf.FzRect(100, 100, 200, y)
712+
document[0].draw_rect(rect, (1, 0, 0))
713+
document[1].draw_rect(rect, (1, 0, 0))
714+
document.save(path)
691715

692-
# Create a stext page for both pages of our document, using direct calls to
693-
# MuPDF for now.
716+
# Create a stext page for the text regions in both pages of our document,
717+
# using direct calls to MuPDF.
718+
#
694719

695-
with pymupdf.Document(path2) as document:
720+
with pymupdf.Document(path) as document:
696721

697722
# Notes:
698723
#
@@ -701,9 +726,9 @@ def test_extendable_textpage():
701726
# a new block, because pen position for new device is (0, 0) and this
702727
# will usually be treated as a paragraph gap to the first text.
703728
#
704-
# At the moment we use infinite mediabox when using
705-
# fz_new_stext_page()'s to create the stext device. I don't know what a
706-
# non-infinite mediabox would be useful for.
729+
# At the moment we use infinite mediabox when creating the
730+
# fz_stext_page. I don't know what a non-infinite mediabox would be
731+
# useful for.
707732
#
708733
# FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
709734
# to modify it to be in stext page coordinates (i.e. adding ctm.f
@@ -713,31 +738,40 @@ def test_extendable_textpage():
713738
# include each page's entire contents.
714739
#
715740

716-
ctm = pymupdf.mupdf.FzMatrix()
741+
# We use our knowledge of the text rect in each page to manipulate ctm
742+
# so that the stext contains text starting at (0, 0) and extending
743+
# downwards.
744+
#
745+
y = 0
717746
cookie = pymupdf.mupdf.FzCookie()
718747

719748
stext_page = pymupdf.mupdf.FzStextPage(
720749
pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE), # mediabox
721750
)
722751
stext_options = pymupdf.mupdf.FzStextOptions()
723752
#stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
724-
#stext_options.clip = rect2.internal()
753+
#stext_options.clip = rect.internal()
725754
device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)
726755

727-
# Append second page to stext_page and prepare ctm for any later page.
756+
# Add first page to stext_page at (0, y), and update <y> for the next
757+
# page.
728758
page = document[0]
759+
ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
729760
pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
730-
ctm.f += rect2.y1 - rect2.y0
761+
y += rect.y1 - rect.y0
731762

732-
# Append second page to stext_page and prepare for any later page.
763+
# Add second page to stext_page at (0, y), and update <y> for the next
764+
# page.
733765
page = document[1]
766+
ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
734767
pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
735-
ctm.f += rect2.y1 - rect2.y0
768+
y += rect.y1 - rect.y0
736769

737770
# We've finished adding text to stext_page.
738771
pymupdf.mupdf.fz_close_device(device)
739772

740-
# Read text from stext_page.
773+
# Create a pymupdf.TextPage() for <stext_page> so we can use
774+
# text_page.extractDICT() etc.
741775
text_page = pymupdf.TextPage(stext_page)
742776

743777
# Read text from stext_page using text_page.extractDICT().
@@ -748,30 +782,32 @@ def test_extendable_textpage():
748782
pno = 0
749783
ydelta = 0
750784
for block in d['blocks']:
751-
print(f'block')
785+
print(f'block {block["bbox"]=}')
752786
for line in block['lines']:
753-
print(f' line')
787+
print(f' line {line["bbox"]=}')
754788
for span in line['spans']:
755-
print(f' span')
789+
print(f' span {span["bbox"]=}')
756790
bbox = span['bbox']
757791
x0, y0, x1, y1 = bbox
758792
dy = y0 - y0_prev if y0_prev else 0
759793
y0_prev = y0
760794
print(f' {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
761795
if 'eee' in span['text']:
762796
pno = 1
763-
ydelta = rect2.y1 - rect2.y0
797+
ydelta = rect.y1 - rect.y0
764798
y0 -= ydelta
765799
y1 -= ydelta
800+
# Debugging - add green lines on original document
801+
# translating final blocks info into original coordinates.
766802
document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))
767803

768-
print('\n\n\n\n')
804+
print('\n\n')
769805

770806
print(f'Using text_page.extractText()')
771807
text = text_page.extractText(True)
772808
print(f'{text}')
773809

774-
print('\n\n\n\n')
810+
print('\n\n')
775811
print(f'Using extractBLOCKS')
776812
text = list()
777813
for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
@@ -780,7 +816,7 @@ def test_extendable_textpage():
780816
print(f' {line=}')
781817
text.append(line)
782818

783-
print("\n\n\n")
819+
print("\n\n")
784820
print(f'extractBLOCKS joined by newlines:')
785821
print('\n'.join(text))
786822

0 commit comments

Comments
 (0)