Skip to content

Feature Request: Vertically stack tables separated by page breaks #628

@Alzter

Description

@Alzter

It should be possible to vertically stack tables that have been split across page breaks.
I wrote a simple proof-of-concept that takes a TableList and stacks adjacent tables that have the same number of columns.

E.g., stitch together these separate tables:

ID Name Value
1 Alice 42
2 Bob 37
3 Charlie 29

4 Diana 51
5 Ethan 28
6 Fiona 33
7 George 45
8 Hannah 30

...into the following combined table:

ID Name Value
1 Alice 42
2 Bob 37
3 Charlie 29
4 Diana 51
5 Ethan 28
6 Fiona 33
7 George 45
8 Hannah 30

Interface Example

import camelot
# Read the tables from the PDF; per the description above, a table interrupted
# by a page break is returned as several separate tables.
tables = camelot.read_pdf("syllabus.pdf", pages="all", parallel=True)
# Merge adjacent tables with the proposed helper (defined under
# "Implementation Example" below; it is not part of camelot itself).
tables = stack_contiguous_tables(tables)

Implementation Example

from copy import deepcopy
from typing import Callable, List

import pandas as pd
from camelot.core import Table, TableList

def vstack_tables(x: Table, y: Table) -> Table:
    """Stack ``y`` beneath ``x`` and return the result as a new Table.

    Neither input is modified: both are deep-copied first. The copy of ``y``
    is translated vertically so that the top edge of its bounding box meets
    the bottom edge of ``x``'s bounding box, and its rows, cells and data are
    then appended to the copy of ``x``.

    Args:
        x (Table): The upper table.
        y (Table): The lower table, appended underneath ``x``.

    Returns:
        Table: A copy of ``x`` extended with the shifted content of ``y``,
        with ``df``, ``shape`` and ``_bbox`` recomputed for the combined table.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same number of columns.
    """
    if len(x.cols) != len(y.cols):
        raise ValueError("Both tables must have the same number of columns")

    # Work on private copies so the caller's tables stay untouched.
    upper, lower = deepcopy(x), deepcopy(y)

    # Offset that moves the top of ``lower`` (bbox index 3) onto the bottom
    # of ``upper`` (bbox index 1).
    offset = upper._bbox[1] - lower._bbox[3]

    # Translate the row boundaries of the lower table.
    shifted_rows = []
    for row_start, row_end in lower.rows:
        shifted_rows.append((row_start + offset, row_end + offset))
    lower.rows = shifted_rows

    # Translate every cell of the lower table by the same amount.
    for cell in (c for cell_row in lower.cells for c in cell_row):
        cell.y1 += offset
        cell.y2 += offset

    # Append the lower table's content; cells are appended before data.
    upper.rows.extend(lower.rows)
    upper.cells.extend(lower.cells)
    upper.data.extend(lower.data)

    # Rebuild the DataFrame view and its shape from the combined data.
    frame = pd.DataFrame(upper.data)
    upper.df = frame
    upper.shape = frame.shape

    # The new bounding box is the union of the upper box and the shifted
    # lower box.
    ux1, uy1, ux2, uy2 = upper._bbox
    lx1, ly1, lx2, ly2 = lower._bbox
    upper._bbox = (
        min(ux1, lx1),
        min(uy1, ly1 + offset),
        max(ux2, lx2),
        max(uy2, ly2 + offset),
    )

    return upper


def matching_columns(a: Table, b: Table) -> bool:
    """Tell whether two Camelot tables have equal column counts.

    The column count is read from the second entry of each table's
    ``shape`` attribute.

    Args:
        a (Table): First table.
        b (Table): Second table.

    Returns:
        bool: True when ``a.shape[1]`` equals ``b.shape[1]``, False otherwise.
    """
    left_width = a.shape[1]
    right_width = b.shape[1]
    return left_width == right_width


def group_contiguous_tables(
    tables: TableList,
    matching_function: Callable[[Table, Table], bool] = matching_columns,
) -> List[List[Table]]:
    """Partition tables into runs of adjacent, matching tables.

    The input is walked in order. A table joins the current group when
    ``matching_function(previous_table, table)`` is true for the table
    immediately before it; otherwise it starts a new group. Every input table
    therefore appears in exactly one group, and a table that matches neither
    neighbour forms a group of its own instead of being dropped.

    Args:
        tables (TableList): Camelot Table objects, in document order.
        matching_function (Callable[[Table, Table], bool], optional):
            Called as ``matching_function(previous, current)`` on each pair
            of adjacent tables to decide whether they belong together.
            Defaults to ``matching_columns``.

    Returns:
        List[List[Table]]: The groups, in input order. Each group holds one
        or more tables; an empty input yields an empty list.
    """
    groups: List[List[Table]] = []

    for table in tables:
        # groups[-1][-1] is the table immediately preceding ``table``.
        if groups and matching_function(groups[-1][-1], table):
            groups[-1].append(table)
        else:
            # First table, or a mismatch with the predecessor: open a new group.
            groups.append([table])

    return groups


def stack_contiguous_tables(tables: TableList) -> TableList:
    """Stack groups of contiguous tables into single tables.

    Groups the input with :func:`group_contiguous_tables` and collapses each
    group into one table by repeatedly applying :func:`vstack_tables`, top to
    bottom. Groups may contain any number of tables; a group holding a single
    table is passed through unchanged (the same object, not a copy).

    Args:
        tables (TableList): A list of Camelot Table objects.

    Returns:
        TableList: A new TableList with one table per group.

    Raises:
        ValueError: Propagated from :func:`vstack_tables` if two tables in the
            same group have different numbers of columns.
    """
    # Local import keeps this function self-contained.
    from functools import reduce

    table_groups = group_contiguous_tables(tables)

    # vstack_tables accepts exactly two tables, so it is folded across the
    # group: ((t0 + t1) + t2) + ... . reduce returns the sole element
    # untouched for a one-table group.
    merged_tables: List[Table] = [
        reduce(vstack_tables, group) for group in table_groups
    ]

    return TableList(merged_tables)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions