2828import zipfile
2929
3030from . import extra
31-
31+ import importlib.util
3232
3333# Set up g_out_log and g_out_message from environment variables.
3434#
@@ -333,6 +333,29 @@ def __init__(self):
333333
334334_globals = _Globals()
335335
# Optional layout-analysis hook. It starts out as None; Page-level code only
# calls it when it is truthy.
# NOTE(review): presumably assigned by the pymupdf_layout package on import --
# confirm against that package.
_get_layout: typing.Optional[typing.Callable] = None

# True while the one-time recommendation may still be printed.
_recommend_layout = True


def no_recommend_layout():
    """Disable the pymupdf_layout recommendation for the rest of the process."""
    global _recommend_layout
    _recommend_layout = False


def _warn_layout_once():
    """Print the pymupdf_layout recommendation, at most one time.

    The message is printed only if all of the following hold, checked in
    this order so that the module lookup happens last:

    * the recommendation has not been disabled or already printed,
    * no callable layout hook is installed in ``_get_layout``,
    * environment variable PYMUPDF_SUGGEST_LAYOUT_ANALYZER is not "0",
    * ``importlib.util.find_spec("pymupdf.layout")`` finds nothing.

    After printing, the recommendation flag is cleared so later calls
    return without output.
    """
    global _recommend_layout

    if not _recommend_layout:
        return
    if callable(_get_layout):
        return
    if os.getenv("PYMUPDF_SUGGEST_LAYOUT_ANALYZER") == "0":
        return
    if importlib.util.find_spec("pymupdf.layout"):
        return

    print("Consider using the pymupdf_layout package for a greatly improved page layout analysis.")
    _recommend_layout = False
358+
336359
337360# Optionally use MuPDF via cppyy bindings; experimental and not tested recently
338361# as of 2023-01-20 11:51:40
@@ -1054,6 +1077,7 @@ def get_textpage(self, clip=None, flags=0):
10541077 annot = self.this
10551078 stextpage = mupdf.FzStextPage(annot, options)
10561079 ret = TextPage(stextpage)
1080+ ret._dev_flags = flags
10571081 p = self.get_parent()
10581082 if isinstance(p, weakref.ProxyType):
10591083 ret.parent = p
@@ -2784,6 +2808,7 @@ def get_textpage(self, flags=3):
27842808 stext_options.flags = flags
27852809 val = mupdf.FzStextPage(self.this, stext_options)
27862810 val.thisown = True
2811+ val._dev_flags = flags
27872812 return val
27882813
27892814 @property
@@ -9952,9 +9977,10 @@ def _get_resource_properties(self):
99529977 return rc
99539978
99549979 def _get_textpage(self, clip=None, flags=0, matrix=None):
9955- if g_use_extra:
9980+ if 1 or g_use_extra:
99569981 ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
99579982 tpage = mupdf.FzStextPage(ll_tpage)
9983+ tpage._dev_flags = flags
99589984 return tpage
99599985 page = self.this
99609986 options = mupdf.FzStextOptions(flags)
@@ -10781,6 +10807,20 @@ def clip_to_rect(self, rect):
1078110807 pclip = JM_rect_from_py(clip)
1078210808 mupdf.pdf_clip_page(pdfpage, pclip)
1078310809
def get_layout(self, vertical_gap=12):
    """Populate ``self.layout_information`` if that is possible.

    Nothing happens when ``self.layout_information`` is already set (not
    None), or when the module-level ``_get_layout`` hook is falsy. Otherwise
    the hook is called with ``self`` as its only argument and its result is
    stored in ``self.layout_information``.

    ``vertical_gap`` is accepted but not used by this method.
    Always returns None.
    """
    # Check the cached result first so the hook is only consulted when needed.
    needs_analysis = self.layout_information is None
    if needs_analysis and _get_layout:
        self.layout_information = _get_layout(self)
10823+
1078410824 @property
1078510825 def artbox(self):
1078610826 """The ArtBox"""
@@ -11432,7 +11472,7 @@ def get_cdrawings(self, extended=None, callback=None, method=None):
1143211472 assert isinstance(page, mupdf.FzPage), f'{self.this=}'
1143311473 clips = True if extended else False
1143411474 prect = mupdf.fz_bound_page(page)
11435- if g_use_extra:
11475+ if 1 or g_use_extra:
1143611476 rc = extra.get_cdrawings(page, extended, callback, method)
1143711477 else:
1143811478 rc = list()
@@ -12146,6 +12186,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
1214612186 if old_rotation != 0:
1214712187 self.set_rotation(old_rotation)
1214812188 textpage = TextPage(textpage)
12189+ textpage._dev_flags = flags
1214912190 textpage.parent = weakref.proxy(self)
1215012191 return textpage
1215112192
@@ -12157,7 +12198,7 @@ def get_texttrace(self):
1215712198 self.set_rotation(0)
1215812199 page = self.this
1215912200 rc = []
12160- if g_use_extra:
12201+ if 1 or g_use_extra:
1216112202 dev = extra.JM_new_texttrace_device(rc)
1216212203 else:
1216312204 dev = JM_new_texttrace_device(rc)
@@ -13206,6 +13247,9 @@ def xref(self):
1320613247
1320713248 rect = property(bound, doc="page rectangle")
1320813249
13250+ # any result of layout analysis is stored here
13251+ layout_information: typing.Optional[typing.List[tuple]] = None
13252+
1320913253
1321013254class Pixmap:
1321113255
@@ -16391,7 +16435,7 @@ def _textpage_dict(self, raw=False):
1639116435
1639216436 def extractBLOCKS(self):
1639316437 """Return a list with text block information."""
16394- if g_use_extra:
16438+ if 1 or g_use_extra:
1639516439 return extra.extractBLOCKS(self.this)
1639616440 block_n = -1
1639716441 this_tpage = self.this
@@ -16587,7 +16631,7 @@ def extractTextbox(self, rect):
1658716631
1658816632 def extractWORDS(self, delimiters=None):
1658916633 """Return a list with text word information."""
16590- if g_use_extra:
16634+ if 1 or g_use_extra:
1659116635 return extra.extractWORDS(self.this, delimiters)
1659216636 buflen = 0
1659316637 last_char_rtl = 0
@@ -18969,7 +19013,7 @@ def JM_color_FromSequence(color):
1896919013
1897019014
1897119015def JM_color_count( pm, clip):
18972- if g_use_extra:
19016+ if 1 or g_use_extra:
1897319017 return extra.ll_JM_color_count(pm.m_internal, clip)
1897419018
1897519019 rc = dict()
@@ -20469,7 +20513,7 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
2046920513
2047020514
2047120515def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
20472- if g_use_extra:
20516+ if 1 or g_use_extra:
2047320517 return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
2047420518 char_list = None
2047520519 span_list = []
@@ -20682,7 +20726,7 @@ def JM_make_image_block(block, block_dict):
2068220726
2068320727
2068420728def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
20685- if g_use_extra:
20729+ if 1 or g_use_extra:
2068620730 return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
2068720731 line_list = []
2068820732 block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
@@ -20705,8 +20749,8 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
2070520749
2070620750
2070720751def JM_make_textpage_dict(tp, page_dict, raw):
20708- if g_use_extra:
20709- return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
20752+ if 1 or g_use_extra:
20753+ return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags )
2071020754 text_buffer = mupdf.fz_new_buffer(128)
2071120755 block_list = []
2071220756 tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -21356,7 +21400,7 @@ def JM_rotate_page_matrix(page):
2135621400
2135721401
2135821402def JM_search_stext_page(page, needle):
21359- if g_use_extra:
21403+ if 1 or g_use_extra:
2136021404 return extra.JM_search_stext_page(page.m_internal, needle)
2136121405
2136221406 rect = mupdf.FzRect(page.m_internal.mediabox)
0 commit comments