Skip to content

Commit b3331ef

Browse files
tests/test_textextract.py: add test_extendable_textpage.
Checks we can extend a textpage across two pages.
1 parent e76dfc2 commit b3331ef

File tree

1 file changed

+154
-0
lines changed

1 file changed

+154
-0
lines changed

tests/test_textextract.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,3 +607,157 @@ def test_4179():
607607
finally:
608608
pymupdf.TOOLS.set_aa_level(aa)
609609
pymupdf.TOOLS.unset_quad_corrections(uqc)
610+
611+
612+
def test_extendable_textpage():
    """Check that one stext page can be extended across two PDF pages.

    The test builds a two-page PDF in which the last text line of page 0 and
    the first text line of page 1 become vertically adjacent once page 1 is
    shifted down by the height of the text rectangle.  Both pages are then run
    through a single MuPDF stext device, and the blocks returned by
    `TextPage.extractBLOCKS()` are compared against the expected paragraphs,
    one of which spans the page break.

    Side effects: writes `test_extendable_textpage.pdf`,
    `test_extendable_textpage2.pdf` and `test_extendable_textpage3.pdf` at
    the locations computed from `__file__`, and prints diagnostics to stdout.

    Raises:
        AssertionError: if the extracted blocks differ from the expected ones.
    """
    # 2025-01-28:
    #
    # We can create a pdf with two pages whose text is adjacent when stitched
    # together vertically.
    #
    # We can append page to stext_page ok.
    #
    # Extracted spans are adjacent vertically as hoped.
    #
    # But... We always get a separate block for each page, even though the y
    # coordinates are adjacent and so we would expect stext_page to return a
    # single block. This is all with `sort=True`.
    #
    # Maybe sort=True doesn't ever join adjacent blocks??
    #
    print()

    # Start from an empty two-page document saved to disk.
    path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
    with pymupdf.open(filetype='pdf') as document:
        document.new_page()
        document.new_page()
        document.save(path)

    # Create document with two pages and text where a paragraph spans the two
    # pages.
    #
    line_height = 9.6
    with pymupdf.open(path) as document:
        page0 = document[0]
        page1 = document[1]
        y = 100
        for i in range(4):
            page0.insert_text((100, y + line_height), 'abcd'[i] * 16)
            page1.insert_text((100, y + line_height), 'efgh'[i] * 16)
            y += line_height
            if i % 2 == 0:
                # Extra blank line after the 1st and 3rd text lines, so each
                # page has the layout: line, gap, line, line, gap, line.
                y += line_height
        # Rectangle enclosing the text; identical on both pages and outlined
        # in red for visual inspection of the saved file.
        rect = (100, 100, 200, y)
        rect2 = pymupdf.mupdf.FzRect(*rect)
        document[0].draw_rect(rect, (1, 0, 0))
        document[1].draw_rect(rect, (1, 0, 0))
        path2 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage2.pdf')
        document.save(path2)

    # Vertical offset applied to each successive page when stitching pages
    # into one stext page.
    stitch_height = rect2.y1 - rect2.y0

    # Create a stext page for both pages of our document, using direct calls to
    # MuPDF for now.

    with pymupdf.Document(path2) as document:

        # Notes:
        #
        # We need to reuse the stext device for second page. Otherwise if we
        # create a new device, the first text in second page will always be in
        # a new block, because pen position for new device is (0, 0) and this
        # will usually be treated as a paragraph gap to the first text.
        #
        # At the moment we use infinite mediabox when using
        # fz_new_stext_page()'s to create the stext device. I don't know what a
        # non-infinite mediabox would be useful for.
        #
        # FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
        # to modify it to be in stext page coordinates (i.e. adding ctm.f
        # to y0 and y1) when we append the second page. But it's internal
        # data and there's no api to modify it. So for now we don't specify
        # FZ_STEXT_CLIP_RECT when creating the stext device, so we always
        # include each page's entire contents.
        #

        ctm = pymupdf.mupdf.FzMatrix()
        cookie = pymupdf.mupdf.FzCookie()

        stext_page = pymupdf.mupdf.FzStextPage(
            pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE),  # mediabox
        )
        stext_options = pymupdf.mupdf.FzStextOptions()
        #stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
        #stext_options.clip = rect2.internal()
        device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)

        # Append first page, then second page, to stext_page. After each page
        # the ctm's y translation is advanced so that any later page lands
        # directly below the one just appended.
        for page_number in (0, 1):
            page = document[page_number]
            pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
            ctm.f += stitch_height

        # We've finished adding text to stext_page.
        pymupdf.mupdf.fz_close_device(device)

        # Read text from stext_page.
        text_page = pymupdf.TextPage(stext_page)

        # Read text from stext_page using text_page.extractDICT().
        print('Using text_page.extractDICT().')
        print(f'{text_page.this.m_internal.mediabox=}')
        d = text_page.extractDICT(sort=True)
        y0_prev = None
        pno = 0
        ydelta = 0
        for block in d['blocks']:
            print('block')
            for line in block['lines']:
                print(' line')
                for span in line['spans']:
                    print(' span')
                    x0, y0, x1, y1 = span['bbox']
                    # Compare against None explicitly so that a previous y0 of
                    # exactly 0 is not mistaken for "no previous span".
                    dy = y0 - y0_prev if y0_prev is not None else 0
                    y0_prev = y0
                    print(f' {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
                    if 'eee' in span['text']:
                        # First span that came from page 1: from here on, map
                        # stitched coordinates back into page 1's own space.
                        pno = 1
                        ydelta = stitch_height
                    y0 -= ydelta
                    y1 -= ydelta
                    # Outline the span in green on the page it came from.
                    document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))

        print('\n\n\n\n')

        print('Using text_page.extractText()')
        # NOTE(review): the positional True is presumably the `sort` flag, as
        # in the extractDICT(sort=True) call above - confirm.
        full_text = text_page.extractText(True)
        print(full_text)

        print('\n\n\n\n')
        print('Using extractBLOCKS')
        block_texts = []
        # `line` and `no` keep their names because the `{...=}` f-strings
        # below print the variable names as part of the output.
        for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
            print('block:')
            print(f' bbox={x0, y0, x1, y1} {no=}')
            print(f' {line=}')
            block_texts.append(line)

        print("\n\n\n")
        print('extractBLOCKS joined by newlines:')
        print('\n'.join(block_texts))

        # This checks that lines before/after pages break are treated as a
        # single paragraph.
        assert block_texts == [
            'aaaaaaaaaaaaaaaa\n',
            'bbbbbbbbbbbbbbbb\ncccccccccccccccc\n',
            'dddddddddddddddd\neeeeeeeeeeeeeeee\n',
            'ffffffffffffffff\ngggggggggggggggg\n',
            'hhhhhhhhhhhhhhhh\n',
        ]

        # Save the copy annotated with the green span outlines.
        path3 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage3.pdf')
        document.save(path3)

0 commit comments

Comments
 (0)