Skip to content

Commit 4c27c50

Browse files
committed
Support new stext character properties
1 parent 3343765 commit 4c27c50

File tree

2 files changed

+161
-56
lines changed

2 files changed

+161
-56
lines changed

src/__init__.py

Lines changed: 83 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,7 @@ def get_textpage(self, clip=None, flags=0):
10311031
ret.parent = p
10321032
else:
10331033
ret.parent = weakref.proxy(p)
1034+
ret._dev_flags = flags
10341035
return ret
10351036

10361037
@property
@@ -2782,6 +2783,7 @@ def get_textpage(self, flags=3):
27822783
stext_options.flags = flags
27832784
val = mupdf.FzStextPage(self.this, stext_options)
27842785
val.thisown = True
2786+
val._dev_flags = flags
27852787
return val
27862788

27872789
@property
@@ -7970,6 +7972,7 @@ def _get_textpage(self, clip=None, flags=0, matrix=None):
79707972
if g_use_extra:
79717973
ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
79727974
tpage = mupdf.FzStextPage(ll_tpage)
7975+
tpage._dev_flags = flags
79737976
return tpage
79747977
page = self.this
79757978
options = mupdf.FzStextOptions(flags)
@@ -7989,6 +7992,7 @@ def _get_textpage(self, clip=None, flags=0, matrix=None):
79897992
assert 0, f'Unrecognised {type(page)=}'
79907993
mupdf.fz_run_page(page, dev, ctm, mupdf.FzCookie())
79917994
mupdf.fz_close_device(dev)
7995+
tpage._dev_flags = flags
79927996
return tpage
79937997

79947998
def _insert_image(self,
@@ -9142,6 +9146,12 @@ def remove_rotation(self):
91429146
if rot == 0:
91439147
return Identity # nothing to do
91449148

9149+
# save annotation rectangle information before we do anything
9150+
rot_matrix = self.rotation_matrix # original rotation matrix
9151+
annots=[(a.xref,a.rect * rot_matrix) for a in self.annots()]
9152+
get_links = self.get_links()
9153+
widgets = [(w.xref,w.rect * rot_matrix) for w in self.widgets()]
9154+
91459155
# need to derotate the page's content
91469156
mb = self.mediabox # current mediabox
91479157

@@ -9172,20 +9182,22 @@ def remove_rotation(self):
91729182
self.set_rotation(0)
91739183
rot = ~mat # inverse of the derotation matrix
91749184

9175-
for annot in self.annots(): # modify rectangles of annotations
9176-
r = annot.rect * rot
9177-
# TODO: only try to set rectangle for applicable annot types
9178-
annot.set_rect(r)
9179-
for link in self.get_links(): # modify 'from' rectangles of links
9180-
r = link["from"] * rot
9185+
for xref, rect in annots: # modify rectangles of annotations
9186+
annot = self.load_annot(xref)
9187+
# TODO: only do this for applicable annot types
9188+
annot.set_rect(rect)
9189+
9190+
for link in get_links: # modify 'from' rectangles of links
9191+
r = link["from"] * rot_matrix
91819192
self.delete_link(link)
91829193
link["from"] = r
91839194
try: # invalid links remain deleted
91849195
self.insert_link(link)
91859196
except Exception:
91869197
pass
9187-
for widget in self.widgets(): # modify field rectangles
9188-
r = widget.rect * rot
9198+
9199+
for xref, rect in widgets: # modify field rectangles
9200+
widget = page.load_widget(xref)
91899201
widget.rect = r
91909202
widget.update()
91919203
return rot # the inverse of the generated derotation matrix
@@ -9432,6 +9444,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
94329444
self.set_rotation(old_rotation)
94339445
textpage = TextPage(textpage)
94349446
textpage.parent = weakref.proxy(self)
9447+
textpage._dev_flags = flags
94359448
return textpage
94369449

94379450
def get_texttrace(self):
@@ -16440,9 +16453,13 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
1644016453
mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(annot), mupdf.PDF_ENUM_NAME_DA, buf)
1644116454

1644216455

16443-
def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
16456+
def JM_make_spanlist(line_dict, line, raw, buff, tp_rect, dev_flags):
1644416457
if g_use_extra:
16445-
return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
16458+
return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect, dev_flags)
16459+
# relevant MuPDF versions
16460+
MUPDF1250 = (1, 25, 0)
16461+
MUPDF1251 = (1, 25, 1)
16462+
THIS_MUPDF = mupdf_version_tuple
1644616463
char_list = None
1644716464
span_list = []
1644816465
mupdf.fz_clear_buffer(buff)
@@ -16453,18 +16470,24 @@ class char_style:
1645316470
def __init__(self, rhs=None):
1645416471
if rhs:
1645516472
self.size = rhs.size
16473+
self.font_flags = rhs.font_flags
1645616474
self.flags = rhs.flags
1645716475
self.font = rhs.font
1645816476
self.color = rhs.color
1645916477
self.asc = rhs.asc
1646016478
self.desc = rhs.desc
16479+
self.bidi = rhs.bidi
16480+
self.opacity = rhs.opacity
1646116481
else:
1646216482
self.size = -1
16463-
self.flags = -1
16464-
self.font = ''
16465-
self.color = -1
16483+
self.font_flags = 0
16484+
self.flags = 0
16485+
self.font = ""
16486+
self.color = 0
1646616487
self.asc = 0
1646716488
self.desc = 0
16489+
self.bidi = 0
16490+
self.opacity = 1
1646816491
def __str__(self):
1646916492
return f'{self.size} {self.flags} {self.font} {self.color} {self.asc} {self.desc}'
1647016493

@@ -16481,38 +16504,44 @@ def __str__(self):
1648116504
):
1648216505
continue
1648316506

16484-
flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch)
16507+
font_flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch)
1648516508
origin = mupdf.FzPoint(ch.m_internal.origin)
1648616509
style.size = ch.m_internal.size
16487-
style.flags = flags
16510+
style.flags = ch.m_internal.flags
1648816511
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
16489-
if mupdf_version_tuple >= (1, 25):
16490-
style.color = ch.m_internal.argb
16512+
if THIS_MUPDF >= MUPDF1250:
16513+
style.opacity = (ch.m_internal.argb >> 24) / 255
16514+
style.color = ch.m_internal.argb & ~0xff000000
1649116515
else:
1649216516
style.color = ch.m_internal.color
1649316517
style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1649416518
style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1649516519

16496-
if (style.size != old_style.size
16497-
or style.flags != old_style.flags
16498-
or style.color != old_style.color
16499-
or style.font != old_style.font
16500-
):
16501-
if old_style.size >= 0:
16520+
if (0
16521+
or style.size != old_style.size
16522+
or style.bidi != old_style.bidi
16523+
or style.font_flags != old_style.font_flags
16524+
# compare flags while ignoring the synthetic bit (mask 4)
16525+
or (style.flags & ~4) != (old_style.flags & ~4)
16526+
or style.color != old_style.color
16527+
or style.opacity != old_style.opacity
16528+
or style.font != old_style.font
16529+
):
16530+
if old_style.size > 0:
1650216531
# not first one, output previous
1650316532
if raw:
1650416533
# put character list in the span
1650516534
span[dictkey_chars] = char_list
1650616535
char_list = None
1650716536
else:
1650816537
# put text string in the span
16509-
span[dictkey_text] = JM_EscapeStrFromBuffer( buff)
16538+
span[dictkey_text] = JM_EscapeStrFromBuffer(buff)
1651016539
mupdf.fz_clear_buffer(buff)
1651116540

1651216541
span[dictkey_origin] = JM_py_from_point(span_origin)
1651316542
span[dictkey_bbox] = JM_py_from_rect(span_rect)
1651416543
line_rect = mupdf.fz_union_rect(line_rect, span_rect)
16515-
span_list.append( span)
16544+
span_list.append(span)
1651616545
span = None
1651716546

1651816547
span = dict()
@@ -16522,12 +16551,33 @@ def __str__(self):
1652216551
asc = 0.9
1652316552
desc = -0.1
1652416553

16554+
span["bidi"] = style.bidi
1652516555
span[dictkey_size] = style.size
16526-
span[dictkey_flags] = style.flags
16556+
span[dictkey_flags] = style.font_flags
1652716557
span[dictkey_font] = JM_EscapeStrFromStr(style.font)
1652816558
span[dictkey_color] = style.color
1652916559
span["ascender"] = asc
1653016560
span["descender"] = desc
16561+
span["opacity"] = style.opacity
16562+
# add more keys depending on MuPDF version
16563+
if THIS_MUPDF >= MUPDF1250: #separate if because not flags-dependent
16564+
span["opacity"] = style.opacity
16565+
# rest of keys only make sense for FZ_STEXT_COLLECT_FLAGS
16566+
if dev_flags & mupdf.FZ_STEXT_COLLECT_FLAGS:
16567+
span["underline"] = bool(style.flags & mupdf.FZ_STEXT_UNDERLINE)
16568+
span["strikeout"] = bool(style.flags & mupdf.FZ_STEXT_STRIKEOUT)
16569+
else:
16570+
span["underline"] = None
16571+
span["strikeout"] = None
16572+
16573+
if THIS_MUPDF > MUPDF1251:
16574+
if dev_flags & mupdf.FZ_STEXT_COLLECT_FLAGS:
16575+
span["bold"] = bool(style.flags & mupdf.FZ_STEXT_BOLD)
16576+
else:
16577+
span["bold"] = None
16578+
span["filled"] = bool(style.flags & mupdf.FZ_STEXT_FILLED)
16579+
span["stroked"] = bool(style.flags & mupdf.FZ_STEXT_STROKED)
16580+
span["clipped"] = bool(style.flags & mupdf.FZ_STEXT_CLIPPED)
1653116581

1653216582
# Need to be careful here - doing 'old_style=style' makes no copy at all;
1653316583
# it only aliases the same object, but we need to keep old_style as a distinct instance.
@@ -16541,6 +16591,8 @@ def __str__(self):
1654116591
char_dict = dict()
1654216592
char_dict[dictkey_origin] = JM_py_from_point( ch.m_internal.origin)
1654316593
char_dict[dictkey_bbox] = JM_py_from_rect(r)
16594+
if THIS_MUPDF >= MUPDF1250:
16595+
char_dict["synthetic"] = bool(ch.m_internal.flags & mupdf.FZ_STEXT_SYNTHETIC)
1654416596
char_dict[dictkey_c] = chr(ch.m_internal.c)
1654516597

1654616598
if char_list is None:
@@ -16604,9 +16656,9 @@ def JM_make_image_block(block, block_dict):
1660416656
block_dict[ dictkey_image] = bytes_
1660516657

1660616658

16607-
def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
16659+
def JM_make_text_block(block, block_dict, raw, buff, tp_rect, dev_flags):
1660816660
if g_use_extra:
16609-
return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
16661+
return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal, dev_flags)
1661016662
line_list = []
1661116663
block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
1661216664
#log(f'{block=}')
@@ -16617,7 +16669,7 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
1661716669
):
1661816670
continue
1661916671
line_dict = dict()
16620-
line_rect = JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
16672+
line_rect = JM_make_spanlist(line_dict, line, raw, buff, tp_rect, dev_flags)
1662116673
block_rect = mupdf.fz_union_rect(block_rect, line_rect)
1662216674
line_dict[dictkey_wmode] = line.m_internal.wmode
1662316675
line_dict[dictkey_dir] = JM_py_from_point(line.m_internal.dir)
@@ -16629,7 +16681,7 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
1662916681

1663016682
def JM_make_textpage_dict(tp, page_dict, raw):
1663116683
if g_use_extra:
16632-
return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
16684+
return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags)
1663316685
text_buffer = mupdf.fz_new_buffer(128)
1663416686
block_list = []
1663516687
tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -16654,7 +16706,7 @@ def JM_make_textpage_dict(tp, page_dict, raw):
1665416706
block_dict[dictkey_bbox] = JM_py_from_rect(block.m_internal.bbox)
1665516707
JM_make_image_block(block, block_dict)
1665616708
else:
16657-
JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect)
16709+
JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect, tp._dev_flags)
1665816710

1665916711
block_list.append(block_dict)
1666016712
page_dict[dictkey_blocks] = block_list
@@ -21164,14 +21216,12 @@ def get_text(
2116421216
pages=None,
2116521217
method='single',
2116621218
concurrency=None,
21167-
2116821219
option='text',
2116921220
clip=None,
2117021221
flags=None,
2117121222
textpage=None,
2117221223
sort=False,
2117321224
delimiters=None,
21174-
2117521225
_stats=False,
2117621226
):
2117721227
'''

0 commit comments

Comments
 (0)