@@ -1031,6 +1031,7 @@ def get_textpage(self, clip=None, flags=0):
10311031 ret.parent = p
10321032 else:
10331033 ret.parent = weakref.proxy(p)
1034+ ret._dev_flags = flags
10341035 return ret
10351036
10361037 @property
@@ -2782,6 +2783,7 @@ def get_textpage(self, flags=3):
27822783 stext_options.flags = flags
27832784 val = mupdf.FzStextPage(self.this, stext_options)
27842785 val.thisown = True
2786+ val._dev_flags = flags
27852787 return val
27862788
27872789 @property
@@ -7970,6 +7972,7 @@ def _get_textpage(self, clip=None, flags=0, matrix=None):
79707972 if g_use_extra:
79717973 ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
79727974 tpage = mupdf.FzStextPage(ll_tpage)
7975+ tpage._dev_flags = flags
79737976 return tpage
79747977 page = self.this
79757978 options = mupdf.FzStextOptions(flags)
@@ -7989,6 +7992,7 @@ def _get_textpage(self, clip=None, flags=0, matrix=None):
79897992 assert 0, f'Unrecognised {type(page)=}'
79907993 mupdf.fz_run_page(page, dev, ctm, mupdf.FzCookie())
79917994 mupdf.fz_close_device(dev)
7995+ tpage._dev_flags = flags
79927996 return tpage
79937997
79947998 def _insert_image(self,
@@ -9142,6 +9146,12 @@ def remove_rotation(self):
91429146 if rot == 0:
91439147 return Identity # nothing to do
91449148
9149+ # save annotation rectangle information before we do anything
9150+ rot_matrix = self.rotation_matrix # original rotation matrix
9151+ annots=[(a.xref,a.rect * rot_matrix) for a in self.annots()]
9152+ get_links = self.get_links()
9153+ widgets = [(w.xref,w.rect * rot_matrix) for w in self.widgets()]
9154+
91459155 # need to derotate the page's content
91469156 mb = self.mediabox # current mediabox
91479157
@@ -9172,20 +9182,22 @@ def remove_rotation(self):
91729182 self.set_rotation(0)
91739183 rot = ~mat # inverse of the derotation matrix
91749184
9175- for annot in self.annots(): # modify rectangles of annotations
9176- r = annot.rect * rot
9177- # TODO: only try to set rectangle for applicable annot types
9178- annot.set_rect(r)
9179- for link in self.get_links(): # modify 'from' rectangles of links
9180- r = link["from"] * rot
9185+ for xref, rect in annots: # modify rectangles of annotations
9186+ annot = self.load_annot(xref)
9187+ # TODO: only do this for applicable annot types
9188+ annot.set_rect(rect)
9189+
9190+ for link in get_links: # modify 'from' rectangles of links
9191+ r = link["from"] * rot_matrix
91819192 self.delete_link(link)
91829193 link["from"] = r
91839194 try: # invalid links remain deleted
91849195 self.insert_link(link)
91859196 except Exception:
91869197 pass
9187- for widget in self.widgets(): # modify field rectangles
9188- r = widget.rect * rot
9198+
# Restore the widget (form field) rectangles captured before derotation.
# Each widget is re-loaded by xref from this page (self) and given its
# own saved rectangle; 'r' belongs to the link loop above and must not
# be reused here.
for xref, rect in widgets:  # modify field rectangles
    widget = self.load_widget(xref)
    widget.rect = rect
    widget.update()
91919203 return rot # the inverse of the generated derotation matrix
@@ -9432,6 +9444,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
94329444 self.set_rotation(old_rotation)
94339445 textpage = TextPage(textpage)
94349446 textpage.parent = weakref.proxy(self)
9447+ textpage._dev_flags = flags
94359448 return textpage
94369449
94379450 def get_texttrace(self):
@@ -16440,9 +16453,13 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
1644016453 mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(annot), mupdf.PDF_ENUM_NAME_DA, buf)
1644116454
1644216455
16443- def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
16456+ def JM_make_spanlist(line_dict, line, raw, buff, tp_rect, dev_flags ):
1644416457 if g_use_extra:
16445- return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
16458+ return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect, dev_flags)
16459+ # relevant MuPDF versions
16460+ MUPDF1250 = (1, 25, 0)
16461+ MUPDF1251 = (1, 25, 1)
16462+ THIS_MUPDF = mupdf_version_tuple
1644616463 char_list = None
1644716464 span_list = []
1644816465 mupdf.fz_clear_buffer(buff)
@@ -16453,18 +16470,24 @@ class char_style:
def __init__(self, rhs=None):
    """Build a character-style record.

    With a truthy *rhs*, every style attribute is copied from it, giving
    a distinct instance with the same values. Otherwise the attributes
    get their initial values; size starts at -1.
    """
    if not rhs:
        # initial values for a style that has not seen a character yet
        self.size = -1
        self.font_flags = 0
        self.flags = 0
        self.font = ""
        self.color = 0
        self.asc = 0
        self.desc = 0
        self.bidi = 0
        self.opacity = 1
        return
    # attribute-by-attribute copy, in the same order as the fields above
    for attr in (
        "size",
        "font_flags",
        "flags",
        "font",
        "color",
        "asc",
        "desc",
        "bidi",
        "opacity",
    ):
        setattr(self, attr, getattr(rhs, attr))
def __str__(self):
    """Return size, flags, font, color, asc and desc, space-separated."""
    shown = (self.size, self.flags, self.font, self.color, self.asc, self.desc)
    # format(v) with no spec is what an f-string replacement field applies
    return ' '.join(format(v) for v in shown)
1647016493
@@ -16481,38 +16504,44 @@ def __str__(self):
1648116504 ):
1648216505 continue
1648316506
# Collect the style of the current character.
font_flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch)
origin = mupdf.FzPoint(ch.m_internal.origin)
style.size = ch.m_internal.size
# Font property bits: these are what the span's flags entry reports and
# what the span-break comparison checks. Without this assignment the
# field would keep its initial value of 0 for every character.
style.font_flags = font_flags
# MuPDF's own per-character flags, kept separate from the font flags.
style.flags = ch.m_internal.flags
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
if THIS_MUPDF >= MUPDF1250:
    # argb carries alpha in its top byte and the color in the low 24 bits
    style.opacity = (ch.m_internal.argb >> 24) / 255
    style.color = ch.m_internal.argb & ~0xff000000
else:
    style.color = ch.m_internal.color
style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
1649516519
16496- if (style.size != old_style.size
16497- or style.flags != old_style.flags
16498- or style.color != old_style.color
16499- or style.font != old_style.font
16500- ):
16501- if old_style.size >= 0:
16520+ if (0
16521+ or style.size != old_style.size
16522+ or style.bidi != old_style.bidi
16523+ or style.font_flags != old_style.font_flags
16524+ # compare flags w/o synthetic property
16525+ or (style.flags & ~4) != (old_style.flags & ~4)
16526+ or style.color != old_style.color
16527+ or style.opacity != old_style.opacity
16528+ or style.font != old_style.font
16529+ ):
16530+ if old_style.size > 0:
1650216531 # not first one, output previous
1650316532 if raw:
1650416533 # put character list in the span
1650516534 span[dictkey_chars] = char_list
1650616535 char_list = None
1650716536 else:
1650816537 # put text string in the span
16509- span[dictkey_text] = JM_EscapeStrFromBuffer( buff)
16538+ span[dictkey_text] = JM_EscapeStrFromBuffer(buff)
1651016539 mupdf.fz_clear_buffer(buff)
1651116540
1651216541 span[dictkey_origin] = JM_py_from_point(span_origin)
1651316542 span[dictkey_bbox] = JM_py_from_rect(span_rect)
1651416543 line_rect = mupdf.fz_union_rect(line_rect, span_rect)
16515- span_list.append( span)
16544+ span_list.append(span)
1651616545 span = None
1651716546
1651816547 span = dict()
@@ -16522,12 +16551,33 @@ def __str__(self):
1652216551 asc = 0.9
1652316552 desc = -0.1
1652416553
16554+ span["bidi"] = style.bidi
1652516555 span[dictkey_size] = style.size
16526- span[dictkey_flags] = style.flags
16556+ span[dictkey_flags] = style.font_flags
1652716557 span[dictkey_font] = JM_EscapeStrFromStr(style.font)
1652816558 span[dictkey_color] = style.color
1652916559 span["ascender"] = asc
1653016560 span["descender"] = desc
16561+ span["opacity"] = style.opacity
16562+ # add more keys depending on MuPDF version
16563+ if THIS_MUPDF >= MUPDF1250: #separate if because not flags-dependent
16564+ span["opacity"] = style.opacity
16565+ # rest of keys only make sense for FZ_STEXT_COLLECT_FLAGS
16566+ if dev_flags & mupdf.FZ_STEXT_COLLECT_FLAGS:
16567+ span["underline"] = bool(style.flags & mupdf.FZ_STEXT_UNDERLINE)
16568+ span["strikeout"] = bool(style.flags & mupdf.FZ_STEXT_STRIKEOUT)
16569+ else:
16570+ span["underline"] = None
16571+ span["strikeout"] = None
16572+
16573+ if THIS_MUPDF > MUPDF1251:
16574+ if dev_flags & mupdf.FZ_STEXT_COLLECT_FLAGS:
16575+ span["bold"] = bool(style.flags & mupdf.FZ_STEXT_BOLD)
16576+ else:
16577+ span["bold"] = None
16578+ span["filled"] = bool(style.flags & mupdf.FZ_STEXT_FILLED)
16579+ span["stroked"] = bool(style.flags & mupdf.FZ_STEXT_STROKED)
16580+ span["clipped"] = bool(style.flags & mupdf.FZ_STEXT_CLIPPED)
1653116581
1653216582 # Need to be careful here - doing 'old_style=style' does a shallow
1653316583 # copy, but we need to keep old_style as a distinct instance.
@@ -16541,6 +16591,8 @@ def __str__(self):
1654116591 char_dict = dict()
1654216592 char_dict[dictkey_origin] = JM_py_from_point( ch.m_internal.origin)
1654316593 char_dict[dictkey_bbox] = JM_py_from_rect(r)
16594+ if THIS_MUPDF >= MUPDF1250:
16595+ char_dict["synthetic"] = bool(ch.m_internal.flags & mupdf.FZ_STEXT_SYNTHETIC)
1654416596 char_dict[dictkey_c] = chr(ch.m_internal.c)
1654516597
1654616598 if char_list is None:
@@ -16604,9 +16656,9 @@ def JM_make_image_block(block, block_dict):
1660416656 block_dict[ dictkey_image] = bytes_
1660516657
1660616658
16607- def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
16659+ def JM_make_text_block(block, block_dict, raw, buff, tp_rect, dev_flags ):
1660816660 if g_use_extra:
16609- return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
16661+ return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal, dev_flags )
1661016662 line_list = []
1661116663 block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
1661216664 #log(f'{block=}')
@@ -16617,7 +16669,7 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
1661716669 ):
1661816670 continue
1661916671 line_dict = dict()
16620- line_rect = JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
16672+ line_rect = JM_make_spanlist(line_dict, line, raw, buff, tp_rect, dev_flags )
1662116673 block_rect = mupdf.fz_union_rect(block_rect, line_rect)
1662216674 line_dict[dictkey_wmode] = line.m_internal.wmode
1662316675 line_dict[dictkey_dir] = JM_py_from_point(line.m_internal.dir)
@@ -16629,7 +16681,7 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
1662916681
1663016682def JM_make_textpage_dict(tp, page_dict, raw):
1663116683 if g_use_extra:
16632- return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
16684+ return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags )
1663316685 text_buffer = mupdf.fz_new_buffer(128)
1663416686 block_list = []
1663516687 tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -16654,7 +16706,7 @@ def JM_make_textpage_dict(tp, page_dict, raw):
1665416706 block_dict[dictkey_bbox] = JM_py_from_rect(block.m_internal.bbox)
1665516707 JM_make_image_block(block, block_dict)
1665616708 else:
16657- JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect)
16709+ JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect, tp._dev_flags )
1665816710
1665916711 block_list.append(block_dict)
1666016712 page_dict[dictkey_blocks] = block_list
@@ -21164,14 +21216,12 @@ def get_text(
2116421216 pages=None,
2116521217 method='single',
2116621218 concurrency=None,
21167-
2116821219 option='text',
2116921220 clip=None,
2117021221 flags=None,
2117121222 textpage=None,
2117221223 sort=False,
2117321224 delimiters=None,
21174-
2117521225 _stats=False,
2117621226 ):
2117721227 '''
0 commit comments