2828import zipfile
2929
3030from . import extra
31-
31+ import importlib.util
3232
3333# Set up g_out_log and g_out_message from environment variables.
3434#
@@ -333,6 +333,30 @@ def __init__(self):
333333
334334_globals = _Globals()
335335
# Optional layout-analysis hook. It is None unless other code assigns a
# callable to it; get_layout() calls it with the object it was invoked on.
# NOTE(review): nothing in the visible code sets this -- presumably an add-on
# layout package installs it; confirm against that package.
_get_layout: typing.Optional[typing.Callable] = None

# True while the hint about the layout package may still be printed by
# _warn_layout_once().
_recommend_layout = True


def no_recommend_layout():
    """Turn off the hint that recommends the pymupdf_layout package.

    Clears the module-level flag `_recommend_layout`, which
    `_warn_layout_once()` checks before printing. Returns None.
    """
    global _recommend_layout
    _recommend_layout = False
343+
344+
def _warn_layout_once():
    """Print a hint about the pymupdf_layout package, at most one time.

    Nothing is printed if any of these holds (checked in this order):

    * `_recommend_layout` is false (hint already printed, or
      `no_recommend_layout()` was called),
    * `_get_layout` is callable,
    * environment variable PYMUPDF_SUGGEST_LAYOUT_ANALYZER equals "0",
    * `pymupdf` has an attribute named `layout`,
    * `importlib.util.find_spec()` locates a module `pymupdf.layout`.

    The flag is cleared only after the hint has actually been printed, so a
    call suppressed by one of the later checks repeats those checks on the
    next call.
    """
    global _recommend_layout

    if not _recommend_layout:
        return
    if callable(_get_layout):
        return
    if os.getenv("PYMUPDF_SUGGEST_LAYOUT_ANALYZER") == "0":
        return
    if hasattr(pymupdf, "layout"):
        return
    if importlib.util.find_spec("pymupdf.layout"):
        return

    hint = "Consider using the pymupdf_layout package for a greatly improved page layout analysis."
    print(hint)
    _recommend_layout = False
359+
336360
337361# Optionally use MuPDF via cppyy bindings; experimental and not tested recently
338362# as of 2023-01-20 11:51:40
@@ -383,7 +407,8 @@ def _int_rc(text):
383407from ._build import pymupdf_git_diff # noqa F401
384408from ._build import pymupdf_git_sha # noqa F401
385409from ._build import pymupdf_version # noqa F401
386- from ._build import pymupdf_version_tuple # noqa F401
# The version tuple is derived from the version string here rather than being
# imported from ._build.
def _version_component(text):
    """Convert one dot-separated version component to an int.

    A component that `int()` accepts is converted exactly as before. For any
    other component (for example "0rc1") only the leading ASCII digits are
    used, and 0 is returned if there are none, so that a suffixed version
    string cannot raise ValueError while this module is being imported.
    """
    try:
        return int(text)
    except ValueError:
        pass
    digits = []
    for ch in text:
        if ch not in "0123456789":
            break
        digits.append(ch)
    return int("".join(digits)) if digits else 0


pymupdf_version_tuple = tuple(
    _version_component(part) for part in pymupdf_version.split(".")
)
387412from ._build import swig_version # noqa F401
388413from ._build import swig_version_tuple # noqa F401
389414
@@ -1054,6 +1079,7 @@ def get_textpage(self, clip=None, flags=0):
10541079 annot = self.this
10551080 stextpage = mupdf.FzStextPage(annot, options)
10561081 ret = TextPage(stextpage)
1082+ ret._dev_flags = flags
10571083 p = self.get_parent()
10581084 if isinstance(p, weakref.ProxyType):
10591085 ret.parent = p
@@ -2784,6 +2810,7 @@ def get_textpage(self, flags=3):
27842810 stext_options.flags = flags
27852811 val = mupdf.FzStextPage(self.this, stext_options)
27862812 val.thisown = True
2813+ val._dev_flags = flags
27872814 return val
27882815
27892816 @property
@@ -9952,9 +9979,10 @@ def _get_resource_properties(self):
99529979 return rc
99539980
99549981 def _get_textpage(self, clip=None, flags=0, matrix=None):
9955- if g_use_extra:
9982+ if 1 or g_use_extra:
99569983 ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
99579984 tpage = mupdf.FzStextPage(ll_tpage)
9985+ tpage._dev_flags = flags
99589986 return tpage
99599987 page = self.this
99609988 options = mupdf.FzStextOptions(flags)
@@ -10781,6 +10809,20 @@ def clip_to_rect(self, rect):
1078110809 pclip = JM_rect_from_py(clip)
1078210810 mupdf.pdf_clip_page(pdfpage, pclip)
1078310811
def get_layout(self, vertical_gap=12):
    """Try to access layout information.

    Calls the module-level `_get_layout` hook with this object and stores
    the result in `self.layout_information`. Does nothing when layout
    information is already present (not None) or when no callable hook is
    installed.

    Args:
        vertical_gap: accepted, but not used by this method and not passed
            on to the hook.

    Returns:
        None in every case; the outcome is only visible through
        `self.layout_information`.
    """
    # Same check order as before: existing information first, then the hook.
    if self.layout_information is not None or not callable(_get_layout):
        return

    self.layout_information = _get_layout(self)
10825+
1078410826 @property
1078510827 def artbox(self):
1078610828 """The ArtBox"""
@@ -11432,7 +11474,7 @@ def get_cdrawings(self, extended=None, callback=None, method=None):
1143211474 assert isinstance(page, mupdf.FzPage), f'{self.this=}'
1143311475 clips = True if extended else False
1143411476 prect = mupdf.fz_bound_page(page)
11435- if g_use_extra:
11477+ if 1 or g_use_extra:
1143611478 rc = extra.get_cdrawings(page, extended, callback, method)
1143711479 else:
1143811480 rc = list()
@@ -12146,6 +12188,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
1214612188 if old_rotation != 0:
1214712189 self.set_rotation(old_rotation)
1214812190 textpage = TextPage(textpage)
12191+ textpage._dev_flags = flags
1214912192 textpage.parent = weakref.proxy(self)
1215012193 return textpage
1215112194
@@ -12157,7 +12200,7 @@ def get_texttrace(self):
1215712200 self.set_rotation(0)
1215812201 page = self.this
1215912202 rc = []
12160- if g_use_extra:
12203+ if 1 or g_use_extra:
1216112204 dev = extra.JM_new_texttrace_device(rc)
1216212205 else:
1216312206 dev = JM_new_texttrace_device(rc)
@@ -13206,6 +13249,9 @@ def xref(self):
1320613249
1320713250 rect = property(bound, doc="page rectangle")
1320813251
13252+ # any result of layout analysis is stored here
13253+ layout_information: typing.Optional[typing.List[tuple]] = None
13254+
1320913255
1321013256class Pixmap:
1321113257
@@ -16391,7 +16437,7 @@ def _textpage_dict(self, raw=False):
1639116437
1639216438 def extractBLOCKS(self):
1639316439 """Return a list with text block information."""
16394- if g_use_extra:
16440+ if 1 or g_use_extra:
1639516441 return extra.extractBLOCKS(self.this)
1639616442 block_n = -1
1639716443 this_tpage = self.this
@@ -16587,7 +16633,7 @@ def extractTextbox(self, rect):
1658716633
1658816634 def extractWORDS(self, delimiters=None):
1658916635 """Return a list with text word information."""
16590- if g_use_extra:
16636+ if 1 or g_use_extra:
1659116637 return extra.extractWORDS(self.this, delimiters)
1659216638 buflen = 0
1659316639 last_char_rtl = 0
@@ -18969,7 +19015,7 @@ def JM_color_FromSequence(color):
1896919015
1897019016
1897119017def JM_color_count( pm, clip):
18972- if g_use_extra:
19018+ if 1 or g_use_extra:
1897319019 return extra.ll_JM_color_count(pm.m_internal, clip)
1897419020
1897519021 rc = dict()
@@ -20469,7 +20515,7 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
2046920515
2047020516
2047120517def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
20472- if g_use_extra:
20518+ if 1 or g_use_extra:
2047320519 return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
2047420520 char_list = None
2047520521 span_list = []
@@ -20682,7 +20728,7 @@ def JM_make_image_block(block, block_dict):
2068220728
2068320729
2068420730def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
20685- if g_use_extra:
20731+ if 1 or g_use_extra:
2068620732 return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
2068720733 line_list = []
2068820734 block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
@@ -20705,8 +20751,8 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
2070520751
2070620752
2070720753def JM_make_textpage_dict(tp, page_dict, raw):
20708- if g_use_extra:
20709- return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
20754+ if 1 or g_use_extra:
20755+ return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags )
2071020756 text_buffer = mupdf.fz_new_buffer(128)
2071120757 block_list = []
2071220758 tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -21356,7 +21402,7 @@ def JM_rotate_page_matrix(page):
2135621402
2135721403
2135821404def JM_search_stext_page(page, needle):
21359- if g_use_extra:
21405+ if 1 or g_use_extra:
2136021406 return extra.JM_search_stext_page(page.m_internal, needle)
2136121407
2136221408 rect = mupdf.FzRect(page.m_internal.mediabox)
0 commit comments