Skip to content

Commit 050a199

Browse files
src/ tests/ docs/: add 'bidi' to span dict, add 'synthetic' to char dict.
Also removed the synthetic bit from the span dict's 'char_flags', because it is a per-character flag that may change within a span. Added code to tests/test_textextract.py:test_4147() to check span['bidi'] and ch['synthetic'].
1 parent b3331ef commit 050a199

File tree

4 files changed

+34
-12
lines changed

4 files changed

+34
-12
lines changed

docs/textpage.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ Bits 1 thru 4 are font properties, i.e. encoded in the font program. Please note
351351

352352
* bit 0: strikeout.
353353
* bit 1: underline.
354-
* bit 2: synthetic.
354+
* bit 2: synthetic (always 0, see char dictionary).
355355
* bit 3: filled.
356356
* bit 4: stroked.
357357
* bit 5: clipped.
@@ -370,16 +370,18 @@ Character Dictionary for :meth:`extractRAWDICT`
370370
=============== ===========================================================
371371
origin character's left baseline point, :data:`point_like`
372372
bbox character rectangle, :data:`rect_like`
373+
synthetic whether the character's synthetic flag (`FZ_STEXT_SYNTHETIC`) is set, bool
373374
c the character (unicode)
374375
=============== ===========================================================
375376

377+
(`synthetic` is new in v1.25.3.)
378+
376379
This image shows the relationship between a character's bbox and its quad: |textpagechar|
377380

378381
.. |textpagechar| image:: images/img-textpage-char.*
379382
:align: top
380383
:scale: 66
381384

382-
383385
.. rubric:: Footnotes
384386

385387
.. [#f1] Image specifications for a PDF page are done in a page's (sub-) :data:`dictionary`, called `/Resources`. Resource dictionaries can be **inherited** from any of the page's parent objects (usually the :data:`catalog` -- the top-level parent). The PDF creator may e.g. define one `/Resources` on file level, naming all images and / or all fonts ever used by any page. In these cases, :meth:`Page.get_images` and :meth:`Page.get_fonts` will consequently return the same lists for all pages. If desired, this situation can be reverted using :meth:`Page.clean_contents`. After execution, the page's object definition will show fonts and images that are actually used.

src/__init__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13579,6 +13579,7 @@ class EmptyFileError(FileDataError):
1357913579
#
1358013580
dictkey_align = "align"
1358113581
dictkey_asc = "ascender"
13582+
dictkey_bidi = "bidi"
1358213583
dictkey_bbox = "bbox"
1358313584
dictkey_blocks = "blocks"
1358413585
dictkey_bpc = "bpc"
@@ -16367,6 +16368,7 @@ def __init__(self, rhs=None):
1636716368
self.argb = rhs.argb
1636816369
self.asc = rhs.asc
1636916370
self.desc = rhs.desc
16371+
self.bidi = rhs.bidi
1637016372
else:
1637116373
self.size = -1
1637216374
self.flags = -1
@@ -16376,6 +16378,7 @@ def __init__(self, rhs=None):
1637616378
self.argb = -1
1637716379
self.asc = 0
1637816380
self.desc = 0
16381+
self.bidi = 0
1637916382
def __str__(self):
1638016383
ret = f'{self.size} {self.flags}'
1638116384
if mupdf_version_tuple >= (1, 25, 2):
@@ -16408,23 +16411,25 @@ def __str__(self):
1640816411
style.size = ch.m_internal.size
1640916412
style.flags = flags
1641016413
if mupdf_version_tuple >= (1, 25, 2):
16411-
style.char_flags = ch.m_internal.flags
16414+
# FZ_STEXT_SYNTHETIC is per-char, not per-span.
16415+
style.char_flags = ch.m_internal.flags & ~mupdf.FZ_STEXT_SYNTHETIC
1641216416
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1641316417
if mupdf_version_tuple >= (1, 25):
1641416418
style.argb = ch.m_internal.argb
1641516419
else:
1641616420
style.argb = ch.m_internal.color
1641716421
style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1641816422
style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
16423+
style.bidi = ch.m_internal.bidi
1641916424

1642016425
if (style.size != old_style.size
1642116426
or style.flags != old_style.flags
1642216427
or (mupdf_version_tuple >= (1, 25, 2)
16423-
and (style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
16424-
!= (old_style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
16428+
and (style.char_flags != old_style.char_flags)
1642516429
)
1642616430
or style.argb != old_style.argb
1642716431
or style.font != old_style.font
16432+
or style.bidi != old_style.bidi
1642816433
):
1642916434
if old_style.size >= 0:
1643016435
# not first one, output previous
@@ -16452,6 +16457,7 @@ def __str__(self):
1645216457

1645316458
span[dictkey_size] = style.size
1645416459
span[dictkey_flags] = style.flags
16460+
span[dictkey_bidi] = style.bidi
1645516461
if mupdf_version_tuple >= (1, 25, 2):
1645616462
span[dictkey_char_flags] = style.char_flags
1645716463
span[dictkey_font] = JM_EscapeStrFromStr(style.font)
@@ -16474,6 +16480,7 @@ def __str__(self):
1647416480
char_dict[dictkey_origin] = JM_py_from_point( ch.m_internal.origin)
1647516481
char_dict[dictkey_bbox] = JM_py_from_rect(r)
1647616482
char_dict[dictkey_c] = chr(ch.m_internal.c)
16483+
char_dict['synthetic'] = bool(ch.m_internal.flags & mupdf.FZ_STEXT_SYNTHETIC)
1647716484

1647816485
if char_list is None:
1647916486
char_list = []

src/extra.i

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ otherwise compilation can fail because free() and malloc() are not declared. */
2323

2424
dictkey_align = PyUnicode_InternFromString("align");
2525
dictkey_ascender = PyUnicode_InternFromString("ascender");
26+
dictkey_bidi = PyUnicode_InternFromString("bidi");
2627
dictkey_bbox = PyUnicode_InternFromString("bbox");
2728
dictkey_blocks = PyUnicode_InternFromString("blocks");
2829
dictkey_bpc = PyUnicode_InternFromString("bpc");
@@ -821,6 +822,7 @@ static PyObject* JM_outline_xrefs(mupdf::PdfObj obj, PyObject* xrefs)
821822

822823
PyObject* dictkey_align = NULL;
823824
PyObject* dictkey_ascender = NULL;
825+
PyObject* dictkey_bidi = NULL;
824826
PyObject* dictkey_bbox = NULL;
825827
PyObject* dictkey_blocks = NULL;
826828
PyObject* dictkey_bpc = NULL;
@@ -841,6 +843,7 @@ PyObject* dictkey_ext = NULL;
841843
PyObject* dictkey_filename = NULL;
842844
PyObject* dictkey_fill = NULL;
843845
PyObject* dictkey_flags = NULL;
846+
PyObject* dictkey_char_bidi = NULL;
844847
PyObject* dictkey_char_flags = NULL;
845848
PyObject* dictkey_font = NULL;
846849
PyObject* dictkey_glyph = NULL;
@@ -3053,6 +3056,7 @@ mupdf::FzRect JM_make_spanlist(
30533056
unsigned argb = 0;
30543057
float asc = 0;
30553058
float desc = 0;
3059+
uint16_t bidi = 0;
30563060
};
30573061
char_style old_style;
30583062
char_style style;
@@ -3076,7 +3080,8 @@ mupdf::FzRect JM_make_spanlist(
30763080
style.size = ch.m_internal->size;
30773081
style.flags = flags;
30783082
#if MUPDF_VERSION_GE(1, 25, 2)
3079-
style.char_flags = ch.m_internal->flags;
3083+
/* FZ_STEXT_SYNTHETIC is per-char, not per-span. */
3084+
style.char_flags = ch.m_internal->flags & ~FZ_STEXT_SYNTHETIC;
30803085
#endif
30813086
style.font = JM_font_name(ch.m_internal->font);
30823087
#if MUPDF_VERSION_GE(1, 25, 0)
@@ -3091,10 +3096,11 @@ mupdf::FzRect JM_make_spanlist(
30913096
|| style.size != old_style.size
30923097
|| style.flags != old_style.flags
30933098
#if MUPDF_VERSION_GE(1, 25, 2)
3094-
|| (style.char_flags & ~FZ_STEXT_SYNTHETIC) != (old_style.char_flags & ~FZ_STEXT_SYNTHETIC)
3099+
|| style.char_flags != old_style.char_flags
30953100
#endif
30963101
|| style.argb != old_style.argb
30973102
|| strcmp(style.font, old_style.font) != 0
3103+
|| style.bidi != old_style.bidi
30983104
)
30993105
{
31003106
if (old_style.size >= 0)
@@ -3130,6 +3136,7 @@ mupdf::FzRect JM_make_spanlist(
31303136

31313137
DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size));
31323138
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("I", style.flags));
3139+
DICT_SETITEM_DROP(span, dictkey_bidi, Py_BuildValue("I", style.bidi));
31333140
#if MUPDF_VERSION_GE(1, 25, 2)
31343141
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("I", style.char_flags));
31353142
#endif
@@ -3157,7 +3164,7 @@ mupdf::FzRect JM_make_spanlist(
31573164
DICT_SETITEM_DROP(char_dict, dictkey_bbox, JM_py_from_rect(r));
31583165

31593166
DICT_SETITEM_DROP(char_dict, dictkey_c, Py_BuildValue("C", ch.m_internal->c));
3160-
3167+
DICT_SETITEMSTR_DROP(char_dict, "synthetic", Py_BuildValue("O", (ch.m_internal->flags & FZ_STEXT_SYNTHETIC) ? Py_True : Py_False));
31613168
if (!char_list)
31623169
{
31633170
char_list = PyList_New(0);

tests/test_textextract.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -407,25 +407,31 @@ def test_4147():
407407
(False, os.path.normpath(f'{__file__}/../../tests/resources/test_4147.pdf')),
408408
(True, os.path.normpath(f'{__file__}/../../tests/resources/symbol-list.pdf')),
409409
):
410-
print(f'{expect_visible} {path=}')
410+
print(f'{expect_visible=} {path=}')
411411
with pymupdf.open(path) as document:
412412
page = document[0]
413413
text = page.get_text('rawdict')
414414
for block in text['blocks']:
415415
if block['type'] == 0:
416+
#print(f' block')
416417
for line in block['lines']:
418+
#print(f' line')
417419
for span in line['spans']:
418-
#print(f' {span=}')
420+
#print(f' span')
419421
if pymupdf.mupdf_version_tuple >= (1, 25, 2):
420-
print(f' span: {span["flags"]=:#x} {span["char_flags"]=:#x}')
422+
#print(f' span: {span["flags"]=:#x} {span["char_flags"]=:#x}')
421423
if expect_visible:
422424
assert span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED
423425
else:
424426
assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED)
425427
assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_STROKED)
426428
else:
427-
print(f' span: {span["flags"]=:#x}')
429+
#print(f' span: {span["flags"]=:#x}')
428430
assert 'char_flags' not in span
431+
# Check commit `add 'bidi' to span dict, add 'synthetic' to char dict.`
432+
assert span['bidi'] == 0
433+
for ch in span['chars']:
434+
assert isinstance(ch['synthetic'], bool)
429435

430436

431437
def test_4139():

0 commit comments

Comments
 (0)