Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit 2421d9d

Browse files
committed
[REF]: get_table_areas
1 parent ff9a501 commit 2421d9d

File tree

1 file changed

+133
-43
lines changed

1 file changed

+133
-43
lines changed

camelot/core.py

Lines changed: 133 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -258,60 +258,118 @@ def get_relevant(self):
258258
)
259259

260260
def get_table_areas(self, textlines, relevant_textedges):
    """
    Return a dict of interesting table areas on the PDF page.

    The table areas are seeded from the relevant text edges, extended
    with the given text lines, and finally padded.

    Parameters
    ----------
    textlines : list
        Text line objects used to extend the areas and to compute the
        average line height that drives the padding of ``y1``.
    relevant_textedges : list
        Text edge objects used to seed the table areas.
        NOTE: this list is sorted in place.

    Returns
    -------
    dict
        Padded table areas ``(x0, y0, x1, y1)`` as keys, ``None`` as values.
    """
    # Reading order: descending y0 first, then ascending coord.
    relevant_textedges.sort(key=lambda edge: (-edge.y0, edge.coord))

    areas = self._initialize_table_areas(relevant_textedges)
    # Mutates ``areas`` in place.
    self._extend_table_areas_with_textlines(areas, textlines)

    row_height = self._calculate_average_textline_height(textlines)
    return dict.fromkeys(self._pad(area, row_height) for area in areas)
292+
293+
def _initialize_table_areas(self, relevant_textedges):
294+
"""
295+
Initialize table areas based on relevant text edges.
296+
297+
Parameters
298+
----------
299+
relevant_textedges : list
300+
List of relevant text edge objects used to initialize table areas.
301+
302+
Returns
303+
-------
304+
dict
305+
A dictionary of table areas initialized from relevant text edges.
306+
"""
277307
table_areas = {}
278308
for te in relevant_textedges:
279309
if not table_areas:
280310
table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
281311
else:
282-
found = None
283-
for area in table_areas:
284-
# check for overlap
285-
if te.y1 >= area[1] and te.y0 <= area[3]:
286-
found = area
287-
break
288-
if found is None:
289-
table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
290-
else:
291-
table_areas.pop(found)
292-
updated_area = (
293-
found[0],
294-
min(te.y0, found[1]),
295-
max(found[2], te.coord),
296-
max(found[3], te.y1),
297-
)
298-
table_areas[updated_area] = None
299-
300-
# extend table areas based on textlines that overlap
301-
# vertically. it's possible that these textlines were
302-
# eliminated during textedges generation since numbers and
303-
# chars/words/sentences are often aligned differently.
304-
# drawback: table areas that have paragraphs on their sides
305-
# will include the paragraphs too.
306-
sum_textline_height = 0
312+
self._update_table_areas(table_areas, te)
313+
314+
return table_areas
315+
316+
def _update_table_areas(self, table_areas, te):
317+
"""
318+
Update table areas by checking for overlaps with new text edges.
319+
320+
Parameters
321+
----------
322+
table_areas : dict
323+
Current table areas to be updated.
324+
te : object
325+
The new text edge object to check for overlaps.
326+
327+
Returns
328+
-------
329+
None
330+
"""
331+
found = None
332+
for area in table_areas:
333+
# Check for overlap
334+
if te.y1 >= area[1] and te.y0 <= area[3]:
335+
found = area
336+
break
337+
338+
if found is None:
339+
table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
340+
else:
341+
table_areas.pop(found)
342+
updated_area = (
343+
found[0],
344+
min(te.y0, found[1]),
345+
max(found[2], te.coord),
346+
max(found[3], te.y1),
347+
)
348+
table_areas[updated_area] = None
349+
350+
def _extend_table_areas_with_textlines(self, table_areas, textlines):
351+
"""
352+
Extend table areas based on text lines that overlap vertically.
353+
354+
Parameters
355+
----------
356+
table_areas : dict
357+
Current table areas to be extended.
358+
textlines : list
359+
List of text line objects relevant for extending table areas.
360+
361+
Returns
362+
-------
363+
None
364+
"""
307365
for tl in textlines:
308-
sum_textline_height += tl.y1 - tl.y0
309366
found = None
310367
for area in table_areas:
311-
# check for overlap
368+
# Check for overlap
312369
if tl.y0 >= area[1] and tl.y1 <= area[3]:
313370
found = area
314371
break
372+
315373
if found is not None:
316374
table_areas.pop(found)
317375
updated_area = (
@@ -321,14 +379,46 @@ def pad(area, average_row_height):
321379
max(found[3], tl.y1),
322380
)
323381
table_areas[updated_area] = None
324-
average_textline_height = sum_textline_height / float(len(textlines))
325382

326-
# add some padding to table areas
327-
table_areas_padded = {}
328-
for area in table_areas:
329-
table_areas_padded[pad(area, average_textline_height)] = None
383+
def _calculate_average_textline_height(self, textlines):
384+
"""
385+
Calculate the average height of text lines.
330386
331-
return table_areas_padded
387+
Parameters
388+
----------
389+
textlines : list
390+
List of text line objects.
391+
392+
Returns
393+
-------
394+
float
395+
The average height of the text lines.
396+
"""
397+
sum_textline_height = sum(tl.y1 - tl.y0 for tl in textlines)
398+
return sum_textline_height / float(len(textlines)) if textlines else 0
399+
400+
def _pad(self, area, average_row_height):
    """
    Return a padded copy of a table area.

    ``x0`` and ``y0`` are decreased and ``x1`` is increased by
    ``TABLE_AREA_PADDING``. ``y1`` is increased by five times
    ``average_row_height`` instead.

    Parameters
    ----------
    area : tuple
        The area to be padded, defined as (x0, y0, x1, y1).
    average_row_height : float
        The average row height; scales the extra room added to ``y1``.

    Returns
    -------
    tuple
        The padded area as (x0, y0, x1, y1).
    """
    return (
        area[0] - TABLE_AREA_PADDING,
        area[1] - TABLE_AREA_PADDING,
        area[2] + TABLE_AREA_PADDING,
        # Table headers can sit relatively far up, so y1 gets a larger,
        # row-height-based margin rather than the fixed padding.
        area[3] + average_row_height * 5,
    )
332422

333423

334424
class Cell:

0 commit comments

Comments
 (0)