Skip to content

Commit 734dbe5

Browse files
tests/: support mupdf-1.25.2's new fz_stext_char flags.
Add new `char_flags` member to span dictionary if mupdf >= 1.25.2, containing extra low-level information. For example, this allows detection of invisible text. Updated docs/textpage.rst. Improved tests of mupdf version in src/extra.i. Added tests/test_textextract.py:test_4147() which checks that we can detect when text is invisible.
1 parent 1b4e3a7 commit 734dbe5

File tree

6 files changed

+138
-24
lines changed

6 files changed

+138
-24
lines changed

docs/textpage.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ ascender ascender of the font *(float)*
288288
descender descender of the font *(float)*
289289
size font size *(float)*
290290
flags font characteristics *(int)*
291+
char_flags char characteristics *(int)*
291292
color text color in sRGB format *(int)*
292293
text (only for :meth:`extractDICT`) text *(str)*
293294
chars (only for :meth:`extractRAWDICT`) *list* of character dictionaries
@@ -335,6 +336,21 @@ Test these characteristics like so:
335336

336337
Bits 1 through 4 are font properties, i.e. encoded in the font program. Please note that this information is not necessarily correct or complete: fonts quite often contain wrong data here.
337338

339+
*"char_flags"* is an integer, which represents extra character properties:
340+
341+
* bit 0: strikeout.
342+
* bit 1: underline.
343+
* bit 2: synthetic.
344+
* bit 4: filled.
345+
* bit 5: stroked.
346+
* bit 6: clipped.
347+
348+
For example if not filled and not stroked (`if not (char_flags & (pymupdf.mupdf.FZ_STEXT_FILLED | pymupdf.mupdf.FZ_STEXT_STROKED)):
349+
...`) then the text will be invisible.
350+
351+
(`char_flags` is new in v1.25.2.)
352+
353+
338354
Character Dictionary for :meth:`extractRAWDICT`
339355
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
340356

src/__init__.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13619,6 +13619,7 @@ class EmptyFileError(FileDataError):
1361913619
dictkey_filename = "filename"
1362013620
dictkey_fill = "fill"
1362113621
dictkey_flags = "flags"
13622+
dictkey_char_flags = "char_flags"
1362213623
dictkey_font = "font"
1362313624
dictkey_glyph = "glyph"
1362413625
dictkey_height = "height"
@@ -14669,7 +14670,9 @@ def JM_char_bbox(line, ch):
1466914670

1467014671

1467114672
def JM_char_font_flags(font, line, ch):
14672-
flags = detect_super_script(line, ch)
14673+
flags = 0
14674+
if line and ch:
14675+
flags += detect_super_script(line, ch)
1467314676
flags += mupdf.fz_font_is_italic(font) * TEXT_FONT_ITALIC
1467414677
flags += mupdf.fz_font_is_serif(font) * TEXT_FONT_SERIFED
1467514678
flags += mupdf.fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED
@@ -16391,19 +16394,27 @@ def __init__(self, rhs=None):
1639116394
if rhs:
1639216395
self.size = rhs.size
1639316396
self.flags = rhs.flags
16397+
if mupdf_version_tuple >= (1, 25, 2):
16398+
self.char_flags = rhs.char_flags
1639416399
self.font = rhs.font
1639516400
self.color = rhs.color
1639616401
self.asc = rhs.asc
1639716402
self.desc = rhs.desc
1639816403
else:
1639916404
self.size = -1
1640016405
self.flags = -1
16406+
if mupdf_version_tuple >= (1, 25, 2):
16407+
self.char_flags = -1
1640116408
self.font = ''
1640216409
self.color = -1
1640316410
self.asc = 0
1640416411
self.desc = 0
1640516412
def __str__(self):
    """Return the style fields as one space-separated string.

    Field order is: size, flags, char_flags (only when
    `mupdf_version_tuple >= (1, 25, 2)`), font, color, asc, desc.
    """
    fields = [self.size, self.flags]
    if mupdf_version_tuple >= (1, 25, 2):
        # char_flags only exists on the style object for mupdf >= 1.25.2.
        fields.append(self.char_flags)
    fields += [self.font, self.color, self.asc, self.desc]
    return ' '.join(f'{field}' for field in fields)
1640716418

1640816419
old_style = char_style()
1640916420
style = char_style()
@@ -16418,10 +16429,19 @@ def __str__(self):
1641816429
):
1641916430
continue
1642016431

16432+
# Info from:
16433+
# detect_super_script()
16434+
# fz_font_is_italic()
16435+
# fz_font_is_serif()
16436+
# fz_font_is_monospaced()
16437+
# fz_font_is_bold()
16438+
1642116439
flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch)
1642216440
origin = mupdf.FzPoint(ch.m_internal.origin)
1642316441
style.size = ch.m_internal.size
1642416442
style.flags = flags
16443+
if mupdf_version_tuple >= (1, 25, 2):
16444+
style.char_flags = ch.m_internal.flags
1642516445
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1642616446
if mupdf_version_tuple >= (1, 25):
1642716447
style.color = ch.m_internal.argb
@@ -16432,6 +16452,10 @@ def __str__(self):
1643216452

1643316453
if (style.size != old_style.size
1643416454
or style.flags != old_style.flags
16455+
or (mupdf_version_tuple >= (1, 25, 2)
16456+
and (style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
16457+
!= (old_style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
16458+
)
1643516459
or style.color != old_style.color
1643616460
or style.font != old_style.font
1643716461
):
@@ -16461,6 +16485,8 @@ def __str__(self):
1646116485

1646216486
span[dictkey_size] = style.size
1646316487
span[dictkey_flags] = style.flags
16488+
if mupdf_version_tuple >= (1, 25, 2):
16489+
span[dictkey_char_flags] = style.char_flags
1646416490
span[dictkey_font] = JM_EscapeStrFromStr(style.font)
1646516491
span[dictkey_color] = style.color
1646616492
span["ascender"] = asc
@@ -18696,7 +18722,7 @@ def jm_trace_text_span(dev, span, type_, ctm, colorspace, color, alpha, seqno):
1869618722
chars = tuple(chars)
1869718723

1869818724
if not space_adv:
18699-
if not mono:
18725+
if not (fflags & TEXT_FONT_MONOSPACED):
1870018726
c, out_font = mupdf.fz_encode_character_with_fallback( span.font(), 32, 0, 0)
1870118727
space_adv = mupdf.fz_advance_glyph(
1870218728
span.font(),

src/extra.i

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ otherwise compilation can fail because free() and malloc() are not declared. */
4343
dictkey_filename = PyUnicode_InternFromString("filename");
4444
dictkey_fill = PyUnicode_InternFromString("fill");
4545
dictkey_flags = PyUnicode_InternFromString("flags");
46+
dictkey_char_flags = PyUnicode_InternFromString("char_flags"); /* Only used with mupdf >= 1.25.2. */
4647
dictkey_font = PyUnicode_InternFromString("font");
4748
dictkey_glyph = PyUnicode_InternFromString("glyph");
4849
dictkey_height = PyUnicode_InternFromString("height");
@@ -103,6 +104,14 @@ catch(...) {
103104
#include <float.h>
104105

105106

107+
/* Packs a (major, minor, patch) version triple into a single integer so that
versions can be compared with ordinary integer comparison. Each argument is
parenthesised so that expression arguments (e.g. `a | b`) are shifted as a
whole. */
#define MAKE_MUPDF_VERSION_INT(major, minor, patch) \
        (((major) << 16) + ((minor) << 8) + ((patch) << 0))

/* Version of the mupdf headers we are being compiled against. */
#define MUPDF_VERSION_INT \
        MAKE_MUPDF_VERSION_INT(FZ_VERSION_MAJOR, FZ_VERSION_MINOR, FZ_VERSION_PATCH)

/* True if the mupdf version is >= major.minor.patch. Usable in `#if`. The
whole expansion is parenthesised so that it composes safely with `!`, `&&`
and `||`. */
#define MUPDF_VERSION_GE(major, minor, patch) \
        (MUPDF_VERSION_INT >= MAKE_MUPDF_VERSION_INT(major, minor, patch))
113+
114+
106115
/* Returns equivalent of `repr(x)`. */
107116
static std::string repr(PyObject* x)
108117
{
@@ -837,6 +846,7 @@ PyObject* dictkey_ext = NULL;
837846
PyObject* dictkey_filename = NULL;
838847
PyObject* dictkey_fill = NULL;
839848
PyObject* dictkey_flags = NULL;
849+
PyObject* dictkey_char_flags = NULL;
840850
PyObject* dictkey_font = NULL;
841851
PyObject* dictkey_glyph = NULL;
842852
PyObject* dictkey_height = NULL;
@@ -1712,6 +1722,29 @@ static const char* JM_font_name(fz_font* font)
17121722
return s + 1;
17131723
}
17141724

1725+
/* Returns 1 if `ch` appears to be superscript within `line`, otherwise 0.

Only lines with wmode 0 and a direction vector of exactly (1, 0) are
examined; every other line yields 0. Within such a line, `ch` counts as
superscript when its origin y is smaller than the origin y of the line's
first character by more than 10% of `ch`'s size. */
static int detect_super_script(fz_stext_line *line, fz_stext_char *ch)
{
    if (line->wmode != 0 || line->dir.x != 1 || line->dir.y != 0)
    {
        /* Not a line this heuristic applies to. */
        return 0;
    }
    return (ch->origin.y < line->first_char->origin.y - ch->size * 0.1f) ? 1 : 0;
}
1733+
1734+
/* Returns the sum of the TEXT_FONT_* flags that apply to a character.

The superscript contribution comes from detect_super_script() and is only
evaluated when both `line` and `ch` are non-null; otherwise it is 0. The
italic, serifed, monospaced and bold contributions come from the
corresponding mupdf::ll_fz_font_is_*() queries on `font`. */
static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch)
{
    int superscript = 0;
    if (line && ch)
    {
        superscript = detect_super_script(line, ch) * TEXT_FONT_SUPERSCRIPT;
    }
    /* Font queries are made in the same order as before. */
    const int italic = mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC;
    const int serifed = mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED;
    const int monospaced = mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED;
    const int bold = mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD;
    return superscript + italic + serifed + monospaced + bold;
}
1747+
17151748
static void jm_trace_text_span(
17161749
jm_tracedraw_device* dev,
17171750
fz_text_span* span,
@@ -1827,7 +1860,7 @@ static void jm_trace_text_span(
18271860
}
18281861
if (!space_adv)
18291862
{
1830-
if (!mono)
1863+
if (!(fflags & TEXT_FONT_MONOSPACED))
18311864
{
18321865
fz_font* out_font = nullptr;
18331866
space_adv = mupdf::ll_fz_advance_glyph(
@@ -2957,25 +2990,6 @@ PyObject* get_cdrawings(mupdf::FzPage& page, PyObject *extended=NULL, PyObject *
29572990
}
29582991

29592992

2960-
static int detect_super_script(fz_stext_line *line, fz_stext_char *ch)
2961-
{
2962-
if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
2963-
{
2964-
return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
2965-
}
2966-
return 0;
2967-
}
2968-
2969-
static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch)
2970-
{
2971-
int flags = detect_super_script(line, ch);
2972-
flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC;
2973-
flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED;
2974-
flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED;
2975-
flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD;
2976-
return flags;
2977-
}
2978-
29792993
//---------------------------------------------------------------------------
29802994
// APPEND non-ascii runes in unicode escape format to fz_buffer
29812995
//---------------------------------------------------------------------------
@@ -3027,6 +3041,20 @@ mupdf::FzRect JM_make_spanlist(
30273041
{
30283042
float size = -1;
30293043
int flags = -1;
3044+
3045+
#if MUPDF_VERSION_GE(1, 25, 2)
3046+
/* From mupdf:include/mupdf/fitz/structured-text.h:fz_stext_char::flags, which
3047+
uses anonymous enum values:
3048+
FZ_STEXT_STRIKEOUT = 1,
3049+
FZ_STEXT_UNDERLINE = 2,
3050+
FZ_STEXT_SYNTHETIC = 4,
3051+
FZ_STEXT_FILLED = 16,
3052+
FZ_STEXT_STROKED = 32,
3053+
FZ_STEXT_CLIPPED = 64
3054+
*/
3055+
int char_flags;
3056+
#endif
3057+
30303058
const char *font = "";
30313059
unsigned int color = -1;
30323060
float asc = 0;
@@ -3042,12 +3070,22 @@ mupdf::FzRect JM_make_spanlist(
30423070
{
30433071
continue;
30443072
}
3073+
/* Info from:
3074+
detect_super_script()
3075+
fz_font_is_italic()
3076+
fz_font_is_serif()
3077+
fz_font_is_monospaced()
3078+
fz_font_is_bold()
3079+
*/
30453080
int flags = JM_char_font_flags( ch.m_internal->font, line.m_internal, ch.m_internal);
30463081
fz_point origin = ch.m_internal->origin;
30473082
style.size = ch.m_internal->size;
30483083
style.flags = flags;
3084+
#if MUPDF_VERSION_GE(1, 25, 2)
3085+
style.char_flags = ch.m_internal->flags;
3086+
#endif
30493087
style.font = JM_font_name(ch.m_internal->font);
3050-
#if (FZ_VERSION_MAJOR > 1 || (FZ_VERSION_MAJOR == 1 && FZ_VERSION_MINOR >= 25))
3088+
#if MUPDF_VERSION_GE(1, 25, 0)
30513089
style.color = ch.m_internal->argb;
30523090
#else
30533091
style.color = ch.m_internal->color;
@@ -3058,6 +3096,9 @@ mupdf::FzRect JM_make_spanlist(
30583096
if (0
30593097
|| style.size != old_style.size
30603098
|| style.flags != old_style.flags
3099+
#if MUPDF_VERSION_GE(1, 25, 2)
3100+
|| (style.char_flags & ~FZ_STEXT_SYNTHETIC) != (old_style.char_flags & ~FZ_STEXT_SYNTHETIC)
3101+
#endif
30613102
|| style.color != old_style.color
30623103
|| strcmp(style.font, old_style.font) != 0
30633104
)
@@ -3095,6 +3136,9 @@ mupdf::FzRect JM_make_spanlist(
30953136

30963137
DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size));
30973138
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags));
3139+
#if MUPDF_VERSION_GE(1, 25, 2)
3140+
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("i", style.char_flags));
3141+
#endif
30983142
DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font));
30993143
DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color));
31003144
DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc));

src/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -929,6 +929,7 @@ def get_text(
929929
"blocks": pymupdf.TEXTFLAGS_BLOCKS,
930930
}
931931
option = option.lower()
932+
assert option in formats
932933
if option not in formats:
933934
option = "text"
934935
if flags is None:

tests/resources/test_4147.pdf

2.77 MB
Binary file not shown.

tests/test_textextract.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,3 +395,30 @@ def test_3725():
395395
text = page.get_text()
396396
if 0:
397397
print(textwrap.indent(text, ' '))
398+
399+
def test_4147():
    """Check that span `char_flags` separates invisible from visible text.

    Two documents are examined: `test_4147.pdf`, whose first-page text is
    expected to be invisible (neither filled nor stroked), and
    `symbol-list.pdf`, whose first-page text is expected to be filled.

    With mupdf < 1.25.2 there is no `char_flags` key, so the test instead
    asserts that the key is absent from every span.
    """
    print()
    # The mupdf version cannot change during the test; evaluate it once.
    has_char_flags = pymupdf.mupdf_version_tuple >= (1, 25, 2)
    for expect_visible, path in (
            (False, os.path.normpath(f'{__file__}/../../tests/resources/test_4147.pdf')),
            (True, os.path.normpath(f'{__file__}/../../tests/resources/symbol-list.pdf')),
            ):
        print(f'{expect_visible} {path=}')
        with pymupdf.open(path) as document:
            page = document[0]
            text = page.get_text('rawdict')
            for block in text['blocks']:
                if block['type'] != 0:
                    # Only type-0 blocks have their lines/spans inspected.
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not has_char_flags:
                            print(f' span: {span["flags"]=:#x}')
                            assert 'char_flags' not in span
                            continue
                        print(f' span: {span["flags"]=:#x} {span["char_flags"]=:#x}')
                        if expect_visible:
                            assert span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED
                        else:
                            # Invisible text is neither filled nor stroked.
                            assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED)
                            assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_STROKED)

0 commit comments

Comments
 (0)