-
Notifications
You must be signed in to change notification settings - Fork 520
Open
Description
It should be possible to vertically stack tables that have been split across page breaks.
I wrote a simple proof-of-concept that takes a TableList and stacks together adjacent tables that have the same number of columns.
E.g., stitch together these separate tables:
| ID | Name | Value |
|---|---|---|
| 1 | Alice | 42 |
| 2 | Bob | 37 |
| 3 | Charlie | 29 |
| 4 | Diana | 51 |
| 5 | Ethan | 28 |
| 6 | Fiona | 33 |
| 7 | George | 45 |
| 8 | Hannah | 30 |
...into the following combined table:
| ID | Name | Value |
|---|---|---|
| 1 | Alice | 42 |
| 2 | Bob | 37 |
| 3 | Charlie | 29 |
| 4 | Diana | 51 |
| 5 | Ethan | 28 |
| 6 | Fiona | 33 |
| 7 | George | 45 |
| 8 | Hannah | 30 |
Interface Example
import camelot
tables = camelot.read_pdf("syllabus.pdf", pages="all", parallel=True)
tables = stack_contiguous_tables(tables)

Implementation Example
from copy import deepcopy
from typing import Callable, List
import pandas as pd
from camelot.core import Table, TableList
def vstack_tables(x: Table, y: Table, *others: Table) -> Table:
    """Vertically concatenate two or more Camelot Table objects.

    Returns a new Table, created as a deep copy of ``x``, with the rows,
    cells, and data of ``y`` (and then of each table in ``others``, in order)
    appended below it. None of the input tables is modified.

    Every appended table is shifted vertically so that its top edge sits on
    the bottom edge of the table stacked so far; ``_bbox`` is read as
    ``(x1, y1, x2, y2)`` with ``y1`` the bottom and ``y2`` the top.

    Args:
        x (Table): The first (top) table.
        y (Table): The second table, appended under ``x``.
        *others (Table): Optional further tables, appended in the given order.

    Returns:
        Table: A new Table containing the rows of ``x`` followed by the rows
        of every other table.

    Raises:
        ValueError: If the tables do not all have the same number of columns.
    """
    lower_tables = (y, *others)

    # Validate everything up front so no copying happens for a doomed call.
    column_count = len(x.cols)
    if any(len(table.cols) != column_count for table in lower_tables):
        raise ValueError("Both tables must have the same number of columns")

    # Work on copies to prevent modifying the originals.
    stacked = deepcopy(x)
    for table in lower_tables:
        _append_table_below(stacked, deepcopy(table))

    # Recompute the derived attributes once, after all rows are in place.
    stacked.df = pd.DataFrame(stacked.data)
    stacked.shape = stacked.df.shape
    return stacked


def _append_table_below(upper: Table, lower: Table) -> None:
    """Shift ``lower`` directly under ``upper`` and merge it into ``upper``.

    Both arguments are mutated, so callers must pass copies they own.
    """
    # Distance that moves the top of ``lower`` onto the bottom of ``upper``.
    offset = upper._bbox[1] - lower._bbox[3]

    lower.rows = [(r0 + offset, r1 + offset) for (r0, r1) in lower.rows]

    for row in lower.cells:
        for cell in row:
            cell.y1 += offset
            cell.y2 += offset
            # Cells may also store their corner points as (x, y) tuples.
            # NOTE(review): attribute names lb/lt/rb/rt are taken from
            # Camelot's Cell class - confirm against the installed version.
            # Only attributes actually stored on the instance are touched.
            stored = getattr(cell, "__dict__", {})
            for corner in ("lb", "lt", "rb", "rt"):
                if corner in stored:
                    corner_x, corner_y = stored[corner]
                    setattr(cell, corner, (corner_x, corner_y + offset))

    upper.rows.extend(lower.rows)
    upper.cells.extend(lower.cells)
    upper.data.extend(lower.data)

    # Grow the bounding box so it covers both tables.
    ux1, uy1, ux2, uy2 = upper._bbox
    lx1, ly1, lx2, ly2 = lower._bbox
    upper._bbox = (
        min(ux1, lx1),
        min(uy1, ly1 + offset),
        max(ux2, lx2),
        max(uy2, ly2 + offset),
    )
def matching_columns(a: Table, b: Table) -> bool:
    """Tell whether two Camelot tables are equally wide.

    The width of each table is read from the second entry of its ``shape``.

    Args:
        a (Table): First table.
        b (Table): Second table.

    Returns:
        bool: True when both tables report the same column count,
        False otherwise.
    """
    width_a = a.shape[1]
    width_b = b.shape[1]
    return width_a == width_b
def group_contiguous_tables(
    tables: TableList,
    matching_function: Callable[[Table, Table], bool] = matching_columns,
) -> List[List[Table]]:
    """Partition Camelot tables into runs of contiguous tables.

    Two adjacent tables belong to the same run when ``matching_function``
    returns a truthy value for them (by default: same number of columns).
    Every input table ends up in exactly one group and the input order is
    preserved, so a table that matches neither neighbour forms a group of
    its own instead of being dropped.

    Args:
        tables (TableList): Camelot Table objects in document order.
        matching_function (Callable[[Table, Table], bool], optional):
            Called as ``matching_function(previous, current)`` for each pair
            of adjacent tables to decide whether they should be grouped.
            Defaults to ``matching_columns``.

    Returns:
        List[List[Table]]: The table groups, in input order. Each group is a
        non-empty list of Table objects considered contiguous. Empty input
        yields an empty list.
    """
    groups: List[List[Table]] = []
    for table in tables:
        # Compare against the last table of the open group, i.e. the table
        # immediately preceding this one in the input.
        if groups and matching_function(groups[-1][-1], table):
            groups[-1].append(table)
        else:
            # First table, or the run is broken: open a new group.
            groups.append([table])
    return groups
def stack_contiguous_tables(tables: TableList) -> TableList:
    """Stack groups of contiguous tables into single tables.

    Groups contiguous tables across page breaks (using
    :func:`group_contiguous_tables`) and vertically concatenates each group
    (using :func:`vstack_tables`). A group may hold any number of tables;
    they are stacked pairwise from top to bottom. A group holding a single
    table yields that table itself (the same object, not a copy).

    Args:
        tables (TableList): A list of Camelot Table objects.

    Returns:
        TableList: A new TableList with one table per group.
    """
    # Local import keeps this function self-contained.
    from functools import reduce

    # ``vstack_tables(*group)`` would fail for groups of three or more tables
    # when ``vstack_tables`` is strictly binary, so fold the group pairwise.
    # ``reduce`` returns the sole element untouched for one-table groups.
    merged_tables: List[Table] = [
        reduce(vstack_tables, group)
        for group in group_contiguous_tables(tables)
    ]
    return TableList(merged_tables)
Metadata
Assignees
Labels
No labels