Skip to content

Commit cf1b642

Browse files
committed
parse page layout recursively
2 parents c76aa75 + c5da67b commit cf1b642

39 files changed

+46156
-42088
lines changed

README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414
- [x] text in horizontal/vertical direction: from left to right, from bottom to top
1515
- [x] font style, e.g. font name, size, weight, italic and color
1616
- [x] text format, e.g. highlight, underline, strike-through
17-
- [x] text alignment, e.g. left/right/center/justify
17+
- [x] text alignment, e.g. left/right/center/justify
18+
- [x] external hyperlink
1819
- [x] paragraph layout: horizontal alignment and vertical spacing
1920
- [ ] list style
20-
- [ ] href link
21-
21+
2222
- [x] Parse and re-create image
2323
- [x] in-line image
2424
- [x] image in Gray/RGB/CMYK mode
@@ -30,7 +30,8 @@
3030
- [x] shading style, i.e. background color
3131
- [x] merged cells
3232
- [x] vertical direction cell
33-
- [x] table with partly hidden borders
33+
- [x] table with partly hidden borders
34+
- [x] nested tables
3435

3536
- [x] Parsing pages with multi-processing
3637

pdf2docx/common/Block.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -167,15 +167,6 @@ def is_flow_layout(self, *args):
167167
return True
168168

169169

170-
def parse_text_format(self, *args, **kwargs):
171-
"""Parse text format.
172-
173-
Raises:
174-
NotImplementedError
175-
"""
176-
raise NotImplementedError
177-
178-
179170
def make_docx(self, *args, **kwargs):
180171
"""Create associated docx element.
181172

pdf2docx/common/Collection.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010

1111
class BaseCollection:
1212
'''Base collection of specific instances.'''
13-
def __init__(self, instances:list=[]):
13+
def __init__(self, instances:list=None):
1414
'''Init collection from a list of Element instances.'''
15-
self._instances = instances if instances else [] # type: list[Element]
15+
self._instances = instances or [] # type: list[Element]
1616

1717
def __getitem__(self, idx):
1818
try:
@@ -119,7 +119,7 @@ def group_by_connectivity(self, dx:float, dy:float):
119119

120120
class Collection(BaseCollection, IText):
121121
'''Collection of specific instances.'''
122-
def __init__(self, instances:list=[], parent=None):
122+
def __init__(self, instances:list=None, parent=None):
123123
'''Init collection from a list of Element instances.'''
124124
self._parent = parent # type: Element
125125
super().__init__(instances)
@@ -250,19 +250,24 @@ def contained_in_bbox(self, bbox):
250250
return self.__class__(instances)
251251

252252

253-
def split_with_intersection(self, bbox):
253+
def split_with_intersection(self, bbox:fitz.Rect, threshold:float=0.0):
254254
"""Split instances into two groups: one intersects with ``bbox``, the other not.
255255
256256
Args:
257257
bbox (fitz.Rect): target rect box.
258+
threshold (float): An instance counts as intersected when its overlap rate (intersection area / instance area) is at least this threshold. Defaults to 0.
258259
259260
Returns:
260261
tuple: two groups in the original class type.
261262
"""
262-
intersection, no_intersection = [], []
263+
intersections, no_intersections = [], []
263264
for instance in self._instances:
264-
if bbox.intersects(instance.bbox):
265-
intersection.append(instance)
265+
# A contains B => A & B = B
266+
intersection = instance.bbox & bbox
267+
factor = round(intersection.getArea()/instance.bbox.getArea(), 2)
268+
269+
if factor >= threshold:
270+
intersections.append(instance)
266271
else:
267-
no_intersection.append(instance)
268-
return self.__class__(intersection), self.__class__(no_intersection)
272+
no_intersections.append(instance)
273+
return self.__class__(intersections), self.__class__(no_intersections)

pdf2docx/common/share.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def new_page(doc, width:float, height:float, title:str):
198198
return page
199199

200200

201-
def debug_plot(title:str):
201+
def debug_plot(title:str, show=True):
202202
'''Plot the objects returned by the inner function.
203203
204204
Args:
@@ -215,11 +215,11 @@ def inner(*args, **kwargs):
215215
doc = page.settings.get('debug_doc', None)
216216
filename = page.settings.get('debug_filename', None)
217217

218-
if objects and debug and doc is not None:
218+
if show and objects and debug and doc is not None:
219219
# create a new page
220-
page = new_page(doc, page.width, page.height, title)
220+
debug_page = new_page(doc, page.width, page.height, title)
221221
# plot objects, e.g. text blocks, shapes, tables...
222-
objects.plot(page)
222+
objects.plot(debug_page)
223223
doc.save(filename)
224224

225225
return objects

0 commit comments

Comments
 (0)