Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 57 additions & 8 deletions docling/backend/msexcel_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
return DataRegion(min_row, max_row, min_col, max_col)

def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
"""Find all compact rectangular data tables in an Excel worksheet.
"""Find all rectangular data tables in an Excel worksheet. Two non-empty cells form a data table if they are adjacent.

Args:
sheet: The Excel worksheet to be parsed.
Expand Down Expand Up @@ -437,7 +437,7 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:

# If the cell starts a new table, find its bounds
table_bounds, visited_cells = self._find_table_bounds(
sheet, ri, rj, bounds.max_row, bounds.max_col
sheet, ri, rj, bounds
)

visited.update(visited_cells) # Mark these cells as visited
Expand All @@ -450,26 +450,48 @@ def _find_table_bounds(
sheet: Worksheet,
start_row: int,
start_col: int,
max_row: int,
max_col: int,
bounds: DataRegion,
) -> tuple[ExcelTable, set[tuple[int, int]]]:
"""Determine the bounds of a compact rectangular table.
"""Determine the bounds of a rectangular table. Two non-empty cells form a data table if they are adjacent.

Args:
sheet: The Excel worksheet to be parsed.
start_row: The row number of the starting cell.
start_col: The column number of the starting cell.
max_row: Maximum row boundary from true data bounds.
max_col: Maximum column boundary from true data bounds.
bounds: boundary from true data bounds.

Returns:
A tuple with an Excel table and a set of cell coordinates.
"""
_log.debug("find_table_bounds")

max_row = bounds.max_row
max_col = bounds.max_col

table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)

# Expand table on the left, bounds indexing starts with + 1
for rj in range(start_col, bounds.min_col - 1, -1):
if self._is_column_empty(sheet, start_row + 2, table_max_row + 1, rj):
break
else:
start_col = rj - 1

# Expand table on the right
for rj in range(table_max_col + 1, max_col):
if self._is_column_empty(sheet, start_row + 2, table_max_row + 1, rj + 1):
break
else:
table_max_col = rj

# Expand the table on the bottom
for ri in range(table_max_row + 1, max_row):
if self._is_row_empty(sheet, ri + 1, start_col + 1, table_max_col + 1):
break
else:
table_max_row = ri

# Collect the data within the bounds
data = []
visited_cells: set[tuple[int, int]] = set()
Expand Down Expand Up @@ -556,7 +578,6 @@ def _find_table_bottom(
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
None,
)

if cell.value is None and not merged_range:
break # Stop if the cell is empty and not merged

Expand Down Expand Up @@ -684,3 +705,31 @@ def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
else ContentLayer.INVISIBLE
)

@staticmethod
def _is_column_empty(
    sheet: Worksheet, start_row: int, end_row: int, col: int
) -> bool:
    """Check whether a vertical slice of a single column contains no data.

    A cell counts as empty when its value is ``None`` or the empty string.

    Args:
        sheet: The Excel worksheet to be inspected.
        start_row: First row of the slice, passed to ``iter_rows`` as ``min_row``.
        end_row: Last row of the slice, passed to ``iter_rows`` as ``max_row``.
        col: The column to inspect, used as both ``min_col`` and ``max_col``.

    Returns:
        True if every cell in the slice is empty, False as soon as one
        non-empty value is encountered.
    """
    # Restricting min_col == max_col makes each yielded row a 1-tuple.
    column_values = sheet.iter_rows(
        min_row=start_row,
        max_row=end_row,
        min_col=col,
        max_col=col,
        values_only=True,
    )
    # all() short-circuits on the first non-empty value.
    return all(cell_value in (None, "") for (cell_value,) in column_values)

@staticmethod
def _is_row_empty(sheet: Worksheet, row: int, start_col: int, end_col: int) -> bool:
    """Check whether a horizontal slice of a single row contains no data.

    A cell counts as empty when its value is ``None`` or the empty string.

    Args:
        sheet: The Excel worksheet to be inspected.
        row: The row to inspect, used as both ``min_row`` and ``max_row``.
        start_col: First column of the slice, passed to ``iter_cols`` as ``min_col``.
        end_col: Last column of the slice, passed to ``iter_cols`` as ``max_col``.

    Returns:
        True if every cell in the slice is empty, False as soon as one
        non-empty value is encountered.
    """
    # Restricting min_row == max_row makes each yielded column a 1-tuple.
    row_values = sheet.iter_cols(
        min_row=row,
        max_row=row,
        min_col=start_col,
        max_col=end_col,
        values_only=True,
    )
    # all() short-circuits on the first non-empty value.
    return all(cell_value in (None, "") for (cell_value,) in row_values)
8 changes: 4 additions & 4 deletions tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "sample_sales_data",
"version": "1.8.0",
"name": "xlsx_02_sample_sales_data",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"binary_hash": 14806485565397602516,
"filename": "sample_sales_data.xlsm"
"filename": "xlsx_02_sample_sales_data.xlsm"
},
"furniture": {
"self_ref": "#/furniture",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"version": "1.8.0",
"name": "xlsx_03_chartsheet",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"binary_hash": 548415533138925042,
"binary_hash": 472193488349663234,
"filename": "xlsx_03_chartsheet.xlsx"
},
"furniture": {
Expand Down Expand Up @@ -855,4 +855,4 @@
"page_no": 2
}
}
}
}
8 changes: 4 additions & 4 deletions tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "test-02",
"version": "1.8.0",
"name": "xlsx_04_inflated",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"binary_hash": 13967282146026440806,
"filename": "test-02.xlsx"
"binary_hash": 8997038978642400831,
"filename": "xlsx_04_inflated.xlsx"
},
"furniture": {
"self_ref": "#/furniture",
Expand Down
Loading