Skip to content

Commit bbae37e

Browse files
Avoid negative color values in text-span dicts and add alpha value.
The color value from MuPDF now contains an alpha value in its top 8 bits, so it appeared negative whenever the alpha set the top bit. Also added the alpha value to the dict. Addresses #4139.
1 parent 0446c42 commit bbae37e

File tree

5 files changed

+53
-17
lines changed

5 files changed

+53
-17
lines changed

docs/textpage.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,15 +292,22 @@ descender descender of the font *(float)*
292292
size font size *(float)*
293293
flags font characteristics *(int)*
294294
char_flags char characteristics *(int)*
295-
color text color in sRGB format *(int)*
295+
color text color in sRGB format 0xRRGGBB *(int)*.
296+
alpha text opacity 0..255 *(int)*.
296297
text (only for :meth:`extractDICT`) text *(str)*
297298
chars (only for :meth:`extractRAWDICT`) *list* of character dictionaries
298299
=============== =====================================================================
299300

301+
|history_begin|
302+
303+
*(New in version 1.25.3.0):* Added *"alpha"* item.
304+
300305
*(New in version 1.16.0):* *"color"* is the text color encoded in sRGB (int) format, e.g. 0xFF0000 for red. There are functions for converting this integer back to formats (r, g, b) (PDF with float values from 0 to 1) :meth:`sRGB_to_pdf`, or (R, G, B), :meth:`sRGB_to_rgb` (with integer values from 0 to 255).
301306

302307
*(New in v1.18.5):* *"ascender"* and *"descender"* are font properties, provided relative to :data:`fontsize` 1. Note that descender is a negative value. The following picture shows the relationship to other values and properties.
303308

309+
|history_end|
310+
304311
.. image:: images/img-asc-desc.*
305312
:scale: 60
306313

src/__init__.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16355,7 +16355,7 @@ def __init__(self, rhs=None):
1635516355
if mupdf_version_tuple >= (1, 25, 2):
1635616356
self.char_flags = rhs.char_flags
1635716357
self.font = rhs.font
16358-
self.color = rhs.color
16358+
self.argb = rhs.argb
1635916359
self.asc = rhs.asc
1636016360
self.desc = rhs.desc
1636116361
else:
@@ -16364,7 +16364,7 @@ def __init__(self, rhs=None):
1636416364
if mupdf_version_tuple >= (1, 25, 2):
1636516365
self.char_flags = -1
1636616366
self.font = ''
16367-
self.color = -1
16367+
self.argb = -1
1636816368
self.asc = 0
1636916369
self.desc = 0
1637016370
def __str__(self):
@@ -16402,9 +16402,9 @@ def __str__(self):
1640216402
style.char_flags = ch.m_internal.flags
1640316403
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1640416404
if mupdf_version_tuple >= (1, 25):
16405-
style.color = ch.m_internal.argb
16405+
style.argb = ch.m_internal.argb
1640616406
else:
16407-
style.color = ch.m_internal.color
16407+
style.argb = ch.m_internal.color
1640816408
style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1640916409
style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1641016410

@@ -16414,7 +16414,7 @@ def __str__(self):
1641416414
and (style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
1641516415
!= (old_style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
1641616416
)
16417-
or style.color != old_style.color
16417+
or style.argb != old_style.argb
1641816418
or style.font != old_style.font
1641916419
):
1642016420
if old_style.size >= 0:
@@ -16446,7 +16446,9 @@ def __str__(self):
1644616446
if mupdf_version_tuple >= (1, 25, 2):
1644716447
span[dictkey_char_flags] = style.char_flags
1644816448
span[dictkey_font] = JM_EscapeStrFromStr(style.font)
16449-
span[dictkey_color] = style.color
16449+
span[dictkey_color] = style.argb & 0xffffff
16450+
if mupdf_version_tuple >= (1, 25, 0):
16451+
span['alpha'] = style.argb >> 24
1645016452
span["ascender"] = asc
1645116453
span["descender"] = desc
1645216454

src/extra.i

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1920,7 +1920,6 @@ static void jm_trace_text_span(
19201920
<< " fsize=" << fsize
19211921
<< " linewidth=" << linewidth
19221922
<< "\n";
1923-
19241923
dict_setitem_drop(span_dict, dictkey_color, Py_BuildValue("fff", rgb[0], rgb[1], rgb[2]));
19251924
dict_setitem_drop(span_dict, dictkey_size, PyFloat_FromDouble(fsize));
19261925
dict_setitemstr_drop(span_dict, "opacity", PyFloat_FromDouble((double) alpha));
@@ -3040,7 +3039,7 @@ mupdf::FzRect JM_make_spanlist(
30403039
struct char_style
30413040
{
30423041
float size = -1;
3043-
int flags = -1;
3042+
unsigned flags = 0;
30443043

30453044
#if MUPDF_VERSION_GE(1, 25, 2)
30463045
/* From mupdf:include/mupdf/fitz/structured-text.h:fz_stext_char::flags, which
@@ -3052,11 +3051,11 @@ mupdf::FzRect JM_make_spanlist(
30523051
FZ_STEXT_STROKED = 32,
30533052
FZ_STEXT_CLIPPED = 64
30543053
*/
3055-
int char_flags;
3054+
unsigned char_flags = 0;
30563055
#endif
30573056

30583057
const char *font = "";
3059-
unsigned int color = -1;
3058+
unsigned argb = 0;
30603059
float asc = 0;
30613060
float desc = 0;
30623061
};
@@ -3086,9 +3085,9 @@ mupdf::FzRect JM_make_spanlist(
30863085
#endif
30873086
style.font = JM_font_name(ch.m_internal->font);
30883087
#if MUPDF_VERSION_GE(1, 25, 0)
3089-
style.color = ch.m_internal->argb;
3088+
style.argb = ch.m_internal->argb;
30903089
#else
3091-
style.color = ch.m_internal->color;
3090+
style.argb = ch.m_internal->color;
30923091
#endif
30933092
style.asc = JM_font_ascender(ch.m_internal->font);
30943093
style.desc = JM_font_descender(ch.m_internal->font);
@@ -3099,7 +3098,7 @@ mupdf::FzRect JM_make_spanlist(
30993098
#if MUPDF_VERSION_GE(1, 25, 2)
31003099
|| (style.char_flags & ~FZ_STEXT_SYNTHETIC) != (old_style.char_flags & ~FZ_STEXT_SYNTHETIC)
31013100
#endif
3102-
|| style.color != old_style.color
3101+
|| style.argb != old_style.argb
31033102
|| strcmp(style.font, old_style.font) != 0
31043103
)
31053104
{
@@ -3135,12 +3134,15 @@ mupdf::FzRect JM_make_spanlist(
31353134
}
31363135

31373136
DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size));
3138-
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags));
3137+
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("I", style.flags));
31393138
#if MUPDF_VERSION_GE(1, 25, 2)
3140-
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("i", style.char_flags));
3139+
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("I", style.char_flags));
31413140
#endif
31423141
DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font));
3143-
DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color));
3142+
DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("I", style.argb & 0xffffff));
3143+
#if MUPDF_VERSION_GE(1, 25, 0)
3144+
DICT_SETITEMSTR_DROP(span, "alpha", Py_BuildValue("I", style.argb >> 24));
3145+
#endif
31443146
DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc));
31453147
DICT_SETITEMSTR_DROP(span, "descender", Py_BuildValue("f", desc));
31463148

tests/resources/test_4139.pdf

36.5 KB
Binary file not shown.

tests/test_textextract.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,3 +422,28 @@ def test_4147():
422422
else:
423423
print(f' span: {span["flags"]=:#x}')
424424
assert 'char_flags' not in span
425+
426+
427+
def test_4139():
428+
path = os.path.normpath(f'{__file__}/../../tests/resources/test_4139.pdf')
429+
flags = (0
430+
| pymupdf.TEXT_PRESERVE_IMAGES
431+
| pymupdf.TEXT_PRESERVE_WHITESPACE
432+
| pymupdf.TEXT_CID_FOR_UNKNOWN_UNICODE
433+
)
434+
with pymupdf.open(path) as document:
435+
page = document[0]
436+
dicts = page.get_text('dict', flags=flags, sort=True)
437+
seen = set()
438+
for b_ctr, b in enumerate(dicts['blocks']):
439+
for l_ctr, l in enumerate(b.get('lines', [])):
440+
for s_ctr, s in enumerate(l['spans']):
441+
color = s.get('color')
442+
if color is not None and color not in seen:
443+
seen.add(color)
444+
print(f"B{b_ctr}.L{l_ctr}.S{s_ctr}: {color=} {hex(color)=} {s=}")
445+
assert color == 0, f'{s=}'
446+
if pymupdf.mupdf_version_tuple >= (1, 25):
447+
assert s['alpha'] == 255
448+
else:
449+
assert not 'alpha' in s

0 commit comments

Comments
 (0)