Skip to content

Commit e76dfc2

Browse files
tests/: added stext bbox test, test_4179.
This shows a problem with the square root symbol's font definition, which requires pymupdf.TEXT_ACCURATE_BBOXES and disabling of our quad_corrections as a workaround. This new test is disabled with non-optimised pymupdf - it looks like the behaviour differs because of Python's use of unicode instead of raw utf8, and it's non-trivial to fix.
1 parent d419817 commit e76dfc2

File tree

3 files changed

+93
-0
lines changed

3 files changed

+93
-0
lines changed

tests/resources/test_4179.pdf

29.2 KB
Binary file not shown.
1.54 KB
Loading

tests/test_textextract.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,3 +514,96 @@ def test_4182():
514514
rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
515515
print(f'{rms=}')
516516
assert rms < 0.01
517+
518+
519+
def test_4179():
    """Check bboxes of the square root character with TEXT_ACCURATE_BBOXES.

    The test searches for the square root character with default flags and
    with TEXT_ACCURATE_BBOXES, and also iterates the page's 'rawdict' chars
    with TEXT_ACCURATE_BBOXES. It asserts that:

    * each search finds exactly one hit at the expected coordinates;
    * the accurate bbox differs from the default one;
    * the accurate search and the accurate iteration agree;
    * the page, with all bboxes drawn on it, renders like the expected image.

    Anti-aliasing and quad corrections are global settings; both are changed
    for the duration of the test and restored afterwards, even on failure.
    """
    if os.environ.get('PYMUPDF_USE_EXTRA') == '0':
        # Looks like Python code doesn't behave same as C++, probably because
        # of the code not being correct for Python's native unicode strings.
        #
        print('test_4179(): Not running with PYMUPDF_USE_EXTRA=0 because known to fail.')
        return

    # We check that using TEXT_ACCURATE_BBOXES gives the correct boxes. But
    # this also requires that we disable PyMuPDF quad corrections.
    #
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4179.pdf')

    # Remember the current global settings so that they can be restored.
    aa = pymupdf.mupdf.fz_aa_level()
    uqc = pymupdf._globals.skip_quad_corrections
    try:
        # The settings are modified inside the `try` so that the `finally`
        # clause restores them even if one of these calls, or the sanity
        # check below, fails.
        #
        # Disable anti-aliasing to avoid our drawing of multiple identical
        # bboxes (from normal/accurate bboxes) giving slightly different
        # results.
        pymupdf.TOOLS.set_aa_level(0)
        pymupdf.TOOLS.unset_quad_corrections(True)
        assert pymupdf._globals.skip_quad_corrections

        with pymupdf.open(path) as document:
            page = document[0]

            # UTF-8 encoding of U+221A (square root).
            char_sqrt = b'\xe2\x88\x9a'.decode()

            # Search with defaults.
            bboxes_search = page.search_for(char_sqrt)
            assert len(bboxes_search) == 1
            print(f'bboxes_search[0]:\n {bboxes_search[0]!r}')
            page.draw_rect(bboxes_search[0], color=(1, 0, 0))
            rms = gentle_compare.rms(bboxes_search[0], (250.0489959716797, 91.93604278564453, 258.34783935546875, 101.34073638916016))
            assert rms < 0.01

            # Search with TEXT_ACCURATE_BBOXES.
            bboxes_search_accurate = page.search_for(
                    char_sqrt,
                    flags = (0
                        | pymupdf.TEXT_DEHYPHENATE
                        | pymupdf.TEXT_PRESERVE_WHITESPACE
                        | pymupdf.TEXT_PRESERVE_LIGATURES
                        | pymupdf.TEXT_MEDIABOX_CLIP
                        | pymupdf.TEXT_ACCURATE_BBOXES
                        ),
                    )
            assert len(bboxes_search_accurate) == 1
            print(f'bboxes_search_accurate[0]\n {bboxes_search_accurate[0]!r}')
            page.draw_rect(bboxes_search_accurate[0], color=(0, 1, 0))
            rms = gentle_compare.rms(bboxes_search_accurate[0], (250.0489959716797, 99.00948333740234, 258.34783935546875, 108.97208404541016))
            assert rms < 0.01

            # Iterate with TEXT_ACCURATE_BBOXES.
            bboxes_iterate_accurate = []
            dict_ = page.get_text(
                    'rawdict',
                    flags = pymupdf.TEXT_ACCURATE_BBOXES,
                    )
            for block in dict_['blocks']:
                if block['type'] != 0:
                    # Only blocks of type 0 are inspected for characters.
                    continue
                for line in block.get('lines', ()):
                    for span in line['spans']:
                        for ch in span['chars']:
                            if ch['c'] == char_sqrt:
                                bbox_iterate_accurate = ch['bbox']
                                bboxes_iterate_accurate.append(bbox_iterate_accurate)
                                print(f'bbox_iterate_accurate:\n {bbox_iterate_accurate!r}')
                                page.draw_rect(bbox_iterate_accurate, color=(0, 0, 1))

            assert bboxes_search_accurate != bboxes_search
            assert bboxes_iterate_accurate == bboxes_search_accurate
            pixmap = page.get_pixmap()

            # Save the rendered page and a diff image to help diagnose
            # failures, then compare against the expected rendering.
            path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4179_out.png')
            pixmap.save(path_out)
            path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4179_expected.png')
            rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
            pixmap_diff = gentle_compare.pixmaps_diff(path_expected, pixmap)
            path_out_diff = os.path.normpath(f'{__file__}/../../tests/resources/test_4179_diff.png')
            pixmap_diff.save(path_out_diff)
            print(f'Have saved to: {path_out_diff=}')
            print(f'{rms=}')
            if pymupdf.mupdf_version_tuple < (1, 26):
                # Prior to fix for mupdf bug 708274, our rects are rendered slightly incorrectly.
                assert 3.5 < rms < 4.5
            else:
                assert rms < 0.01

    finally:
        pymupdf.TOOLS.set_aa_level(aa)
        pymupdf.TOOLS.unset_quad_corrections(uqc)

0 commit comments

Comments
 (0)