Skip to content

Commit 65e6abf

Browse files
tests/: added test_4503: Undetected character styles
1 parent f2adca5 commit 65e6abf

File tree

2 files changed

+52
-0
lines changed

2 files changed

+52
-0
lines changed

tests/resources/test_4503.pdf

49.5 KB
Binary file not shown.

tests/test_textextract.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,3 +850,55 @@ def test_4546():
850850
assert text == expected_mupdf_1_26_1
851851
else:
852852
print(f'No expected output for {pymupdf.mupdf_version_tuple=}')
853+
854+
855+
def test_4503():
    """Check detection of strikeout text (test 4503: undetected character styles).

    Extracts page 0 of tests/resources/test_4503.pdf as a 'rawdict' with
    TEXT_COLLECT_STYLES enabled, prints every span together with its flags,
    and then checks the span containing the phrase
    'the right to request the state to review'.

    Behaviour is improved with mupdf>=1.26.2, but not perfect: the strikeout
    bit is set on the span, but the span still does not include the following
    text. With older mupdf we expect the bug, i.e. the strikeout bit is unset.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4503.pdf')
    span_0 = None
    text_0 = None
    print()
    print(f'{pymupdf.mupdf_version_tuple=}')
    with pymupdf.open(path) as document:
        page = document[0]
        # Specify TEXT_COLLECT_STYLES so we collect char_flags, which contains
        # FZ_STEXT_STRIKEOUT etc.
        #
        rawdict = page.get_text(
                'rawdict',
                flags=pymupdf.TEXTFLAGS_RAWDICT | pymupdf.TEXT_COLLECT_STYLES,
                )
        for i, block in enumerate(rawdict['blocks']):
            print(f'block {i}:')
            for j, line in enumerate(block['lines']):
                print(f' line {j}:')
                for k, span in enumerate(line['spans']):
                    # A rawdict span holds individual characters; join them to
                    # get the span's text.
                    span_text = ''.join(char['c'] for char in span['chars'])
                    print(f' span {k}: {span["flags"]=:#x} {span["char_flags"]=:#x}: {span_text!r}')
                    # No `break` here: every span is printed for diagnostics,
                    # and if several spans match, the last one is kept.
                    if 'the right to request the state to review' in span_text:
                        span_0 = span
                        text_0 = span_text
    assert span_0 is not None, 'No span contains the expected phrase.'
    #print(f'{span_0=}')
    print(f'{span_0["flags"]=:#x}')
    print(f'{span_0["char_flags"]=:#x}')
    print(f'{text_0=}')
    strikeout = span_0['char_flags'] & pymupdf.mupdf.FZ_STEXT_STRIKEOUT
    print(f'{strikeout=}')

    if pymupdf.mupdf_version_tuple >= (1, 26, 2):
        # 2025-06-09: This is still incorrect - the span should include the
        # following text 'and, if appropriate,'. It looks like following spans
        # are:
        #   strikeout=0: 'and, '
        #   strikeout=1: 'if '
        #   strikeout=0: 'appropri'
        #   strikeout=1: 'ate,'
        #
        assert strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be set in {span_0["char_flags"]=:#x}.'
        assert text_0 == 'the right to request the state to review '
    else:
        # Expecting the bug.
        assert not strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be unset in {span_0["char_flags"]=:#x}.'
        assert text_0 == 'notice the right to request the state to review and, if appropriate,'

0 commit comments

Comments
 (0)