Skip to content

Commit 2ec81f1

Browse files
committed
Support pymupdf_layout
1 parent 8f34325 commit 2ec81f1

File tree

3 files changed

+621
-195
lines changed

3 files changed

+621
-195
lines changed

src/__init__.py

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import zipfile
2929

3030
from . import extra
31-
31+
import importlib.util
3232

3333
# Set up g_out_log and g_out_message from environment variables.
3434
#
@@ -333,6 +333,30 @@ def __init__(self):
333333

334334
_globals = _Globals()
335335

336+
# Hook for an optional layout analyzer. It stays None until something assigns
# a callable to it; get_layout() calls it with the page object when callable.
_get_layout: typing.Optional[typing.Callable] = None

# True while the "consider pymupdf_layout" hint may still be printed; cleared
# by no_recommend_layout() and by _warn_layout_once() after it has printed.
_recommend_layout = True
338+
339+
340+
def no_recommend_layout():
    """Turn off the hint that recommends the pymupdf_layout package.

    Clears the module-level ``_recommend_layout`` flag, which is one of the
    conditions ``_warn_layout_once()`` checks before printing its message.
    """
    global _recommend_layout
    _recommend_layout = False
343+
344+
345+
def _warn_layout_once():
    """Print a hint recommending the pymupdf_layout package, at most once.

    The hint is printed only when all of the following hold, checked in
    this order:

    * the module-level ``_recommend_layout`` flag is still set,
    * ``_get_layout`` is not callable,
    * the environment variable ``PYMUPDF_SUGGEST_LAYOUT_ANALYZER`` is not
      the string ``"0"``,
    * ``pymupdf`` has no ``layout`` attribute,
    * ``importlib.util.find_spec("pymupdf.layout")`` finds nothing.

    After printing, ``_recommend_layout`` is cleared so later calls are
    silent. The flag is left untouched when the hint is not printed.
    """
    global _recommend_layout

    msg="""Consider using the pymupdf_layout package for a greatly improved page layout analysis."""

    if (
        _recommend_layout
        and not callable(_get_layout)
        and os.getenv("PYMUPDF_SUGGEST_LAYOUT_ANALYZER") != "0"
        and not hasattr(pymupdf, "layout")
        and not importlib.util.find_spec("pymupdf.layout")
    ):
        print(msg)
        _recommend_layout = False
359+
336360

337361
# Optionally use MuPDF via cppyy bindings; experimental and not tested recently
338362
# as of 2023-01-20 11:51:40
@@ -383,7 +407,8 @@ def _int_rc(text):
383407
from ._build import pymupdf_git_diff # noqa F401
384408
from ._build import pymupdf_git_sha # noqa F401
385409
from ._build import pymupdf_version # noqa F401
386-
from ._build import pymupdf_version_tuple # noqa F401
410+
#from ._build import pymupdf_version_tuple # noqa F401
411+
pymupdf_version_tuple = tuple(map(int, pymupdf_version.split(".")))
387412
from ._build import swig_version # noqa F401
388413
from ._build import swig_version_tuple # noqa F401
389414

@@ -1054,6 +1079,7 @@ def get_textpage(self, clip=None, flags=0):
10541079
annot = self.this
10551080
stextpage = mupdf.FzStextPage(annot, options)
10561081
ret = TextPage(stextpage)
1082+
ret._dev_flags = flags
10571083
p = self.get_parent()
10581084
if isinstance(p, weakref.ProxyType):
10591085
ret.parent = p
@@ -2784,6 +2810,7 @@ def get_textpage(self, flags=3):
27842810
stext_options.flags = flags
27852811
val = mupdf.FzStextPage(self.this, stext_options)
27862812
val.thisown = True
2813+
val._dev_flags = flags
27872814
return val
27882815

27892816
@property
@@ -9952,9 +9979,10 @@ def _get_resource_properties(self):
99529979
return rc
99539980

99549981
def _get_textpage(self, clip=None, flags=0, matrix=None):
9955-
if g_use_extra:
9982+
if 1 or g_use_extra:
99569983
ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
99579984
tpage = mupdf.FzStextPage(ll_tpage)
9985+
tpage._dev_flags = flags
99589986
return tpage
99599987
page = self.this
99609988
options = mupdf.FzStextOptions(flags)
@@ -10781,6 +10809,20 @@ def clip_to_rect(self, rect):
1078110809
pclip = JM_rect_from_py(clip)
1078210810
mupdf.pdf_clip_page(pdfpage, pclip)
1078310811

10812+
def get_layout(self, vertical_gap=12):
    """Try to access layout information.

    Does nothing if ``self.layout_information`` is already set, or if the
    module-level ``_get_layout`` hook is not callable. Otherwise calls
    ``_get_layout(self)`` and stores the result in
    ``self.layout_information``.

    Args:
        vertical_gap: accepted, but not used inside this method.

    Returns:
        None. Any result is stored on ``self`` rather than returned.
    """
    if self.layout_information is not None:
        # layout information already present
        return

    if not callable(_get_layout):
        # no layout information available
        return

    self.layout_information = _get_layout(self)
10825+
1078410826
@property
1078510827
def artbox(self):
1078610828
"""The ArtBox"""
@@ -11432,7 +11474,7 @@ def get_cdrawings(self, extended=None, callback=None, method=None):
1143211474
assert isinstance(page, mupdf.FzPage), f'{self.this=}'
1143311475
clips = True if extended else False
1143411476
prect = mupdf.fz_bound_page(page)
11435-
if g_use_extra:
11477+
if 1 or g_use_extra:
1143611478
rc = extra.get_cdrawings(page, extended, callback, method)
1143711479
else:
1143811480
rc = list()
@@ -12146,6 +12188,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
1214612188
if old_rotation != 0:
1214712189
self.set_rotation(old_rotation)
1214812190
textpage = TextPage(textpage)
12191+
textpage._dev_flags = flags
1214912192
textpage.parent = weakref.proxy(self)
1215012193
return textpage
1215112194

@@ -12157,7 +12200,7 @@ def get_texttrace(self):
1215712200
self.set_rotation(0)
1215812201
page = self.this
1215912202
rc = []
12160-
if g_use_extra:
12203+
if 1 or g_use_extra:
1216112204
dev = extra.JM_new_texttrace_device(rc)
1216212205
else:
1216312206
dev = JM_new_texttrace_device(rc)
@@ -13206,6 +13249,9 @@ def xref(self):
1320613249

1320713250
rect = property(bound, doc="page rectangle")
1320813251

13252+
# any result of layout analysis is stored here
13253+
layout_information: typing.Optional[typing.List[tuple]] = None
13254+
1320913255

1321013256
class Pixmap:
1321113257

@@ -16391,7 +16437,7 @@ def _textpage_dict(self, raw=False):
1639116437

1639216438
def extractBLOCKS(self):
1639316439
"""Return a list with text block information."""
16394-
if g_use_extra:
16440+
if 1 or g_use_extra:
1639516441
return extra.extractBLOCKS(self.this)
1639616442
block_n = -1
1639716443
this_tpage = self.this
@@ -16587,7 +16633,7 @@ def extractTextbox(self, rect):
1658716633

1658816634
def extractWORDS(self, delimiters=None):
1658916635
"""Return a list with text word information."""
16590-
if g_use_extra:
16636+
if 1 or g_use_extra:
1659116637
return extra.extractWORDS(self.this, delimiters)
1659216638
buflen = 0
1659316639
last_char_rtl = 0
@@ -18969,7 +19015,7 @@ def JM_color_FromSequence(color):
1896919015

1897019016

1897119017
def JM_color_count( pm, clip):
18972-
if g_use_extra:
19018+
if 1 or g_use_extra:
1897319019
return extra.ll_JM_color_count(pm.m_internal, clip)
1897419020

1897519021
rc = dict()
@@ -20469,7 +20515,7 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
2046920515

2047020516

2047120517
def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
20472-
if g_use_extra:
20518+
if 1 or g_use_extra:
2047320519
return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
2047420520
char_list = None
2047520521
span_list = []
@@ -20682,7 +20728,7 @@ def JM_make_image_block(block, block_dict):
2068220728

2068320729

2068420730
def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
20685-
if g_use_extra:
20731+
if 1 or g_use_extra:
2068620732
return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
2068720733
line_list = []
2068820734
block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
@@ -20705,8 +20751,8 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
2070520751

2070620752

2070720753
def JM_make_textpage_dict(tp, page_dict, raw):
20708-
if g_use_extra:
20709-
return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
20754+
if 1 or g_use_extra:
20755+
return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags)
2071020756
text_buffer = mupdf.fz_new_buffer(128)
2071120757
block_list = []
2071220758
tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -21356,7 +21402,7 @@ def JM_rotate_page_matrix(page):
2135621402

2135721403

2135821404
def JM_search_stext_page(page, needle):
21359-
if g_use_extra:
21405+
if 1 or g_use_extra:
2136021406
return extra.JM_search_stext_page(page.m_internal, needle)
2136121407

2136221408
rect = mupdf.FzRect(page.m_internal.mediabox)

0 commit comments

Comments
 (0)