Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions docling/backend/msexcel_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ContentLayer,
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
Expand Down Expand Up @@ -275,9 +276,10 @@ def _find_tables_in_sheet(

if self.workbook is not None:
content_layer = self._get_sheet_content_layer(sheet)
tables = self._find_data_tables(sheet)
# Returns list of (ExcelTable, optional_title_text) tuples
tables_with_titles = self._find_data_tables(sheet)

for excel_table in tables:
for excel_table, title_text in tables_with_titles:
origin_col = excel_table.anchor[0]
origin_row = excel_table.anchor[1]
num_rows = excel_table.num_rows
Expand All @@ -303,9 +305,19 @@ def _find_tables_in_sheet(
)
table_data.table_cells.append(cell)

# Create caption if a title was found
caption = None
if title_text:
caption = doc.add_text(
text=title_text,
parent=self.parents[0],
label=DocItemLabel.CAPTION,
)

page_no = self.workbook.index(sheet) + 1
doc.add_table(
data=table_data,
caption=caption,
parent=self.parents[0],
prov=ProvenanceItem(
page_no=page_no,
Expand Down Expand Up @@ -367,14 +379,20 @@ def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:

return DataRegion(min_row, max_row, min_col, max_col)

def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
def _find_data_tables(
self, sheet: Worksheet
) -> list[tuple[ExcelTable, Optional[str]]]:
"""Find all compact rectangular data tables in an Excel worksheet.

Detects 1x1 tables positioned above larger tables (with at least one empty row
between them) as titles, and associates them as captions.

Args:
sheet: The Excel worksheet to be parsed.

Returns:
A list of ExcelTable objects representing the data tables.
A list of tuples (ExcelTable, title_text) where title_text is None
if no title was found, or a string if a 1x1 table above serves as title.
"""
bounds: DataRegion = self._find_true_data_bounds(
sheet
Expand Down Expand Up @@ -405,7 +423,30 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds)

return tables
# detect titles (1x1 tables above larger tables)
tables_with_titles: list[tuple[ExcelTable, str | None]] = []
skip_next = False

for i, table in enumerate(tables):
if skip_next:
skip_next = False
continue

# Check if this is a 1x1 table that could be a title
if table.num_rows == 1 and table.num_cols == 1 and i + 1 < len(tables):
next_table = tables[i + 1]
title_end_row = table.anchor[1] + table.num_rows
next_table_start_row = next_table.anchor[1]

if next_table_start_row > title_end_row:
title_text = table.data[0].text if table.data else ""
tables_with_titles.append((next_table, title_text))
skip_next = True
continue

tables_with_titles.append((table, None))

return tables_with_titles

def _find_table_bounds(
self,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group sheet: Duck Observations
item-2 at level 2: caption: Number of freshwater ducks per year
item-3 at level 2: table with [7x2]
item-3 at level 3: caption: Number of freshwater ducks per year
Loading