Skip to content

Commit 947963a

Browse files
committed
fix: recognize when a table is actually its title in a xlsx document
Signed-off-by: glypt <[email protected]>
1 parent 6a04e27 commit 947963a

File tree

6 files changed

+637
-5
lines changed

6 files changed

+637
-5
lines changed

docling/backend/msexcel_backend.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
ContentLayer,
99
CoordOrigin,
1010
DocItem,
11+
DocItemLabel,
1112
DoclingDocument,
1213
DocumentOrigin,
1314
GroupLabel,
@@ -275,9 +276,10 @@ def _find_tables_in_sheet(
275276

276277
if self.workbook is not None:
277278
content_layer = self._get_sheet_content_layer(sheet)
278-
tables = self._find_data_tables(sheet)
279+
# Returns list of (ExcelTable, optional_title_text) tuples
280+
tables_with_titles = self._find_data_tables(sheet)
279281

280-
for excel_table in tables:
282+
for excel_table, title_text in tables_with_titles:
281283
origin_col = excel_table.anchor[0]
282284
origin_row = excel_table.anchor[1]
283285
num_rows = excel_table.num_rows
@@ -303,9 +305,19 @@ def _find_tables_in_sheet(
303305
)
304306
table_data.table_cells.append(cell)
305307

308+
# Create caption if a title was found
309+
caption = None
310+
if title_text:
311+
caption = doc.add_text(
312+
text=title_text,
313+
parent=self.parents[0],
314+
label=DocItemLabel.CAPTION,
315+
)
316+
306317
page_no = self.workbook.index(sheet) + 1
307318
doc.add_table(
308319
data=table_data,
320+
caption=caption,
309321
parent=self.parents[0],
310322
prov=ProvenanceItem(
311323
page_no=page_no,
@@ -367,14 +379,20 @@ def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
367379

368380
return DataRegion(min_row, max_row, min_col, max_col)
369381

370-
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
382+
def _find_data_tables(
383+
self, sheet: Worksheet
384+
) -> list[tuple[ExcelTable, Optional[str]]]:
371385
"""Find all compact rectangular data tables in an Excel worksheet.
372386
387+
Detects 1x1 tables positioned above the next detected table (with at least one empty row
388+
between them) as titles, and associates them as captions.
389+
373390
Args:
374391
sheet: The Excel worksheet to be parsed.
375392
376393
Returns:
377-
A list of ExcelTable objects representing the data tables.
394+
A list of tuples (ExcelTable, title_text) where title_text is None
395+
if no title was found, or a string if a 1x1 table above serves as title.
378396
"""
379397
bounds: DataRegion = self._find_true_data_bounds(
380398
sheet
@@ -405,7 +423,30 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
405423
visited.update(visited_cells) # Mark these cells as visited
406424
tables.append(table_bounds)
407425

408-
return tables
426+
# Detect titles: a 1x1 table whose next detected table starts at least one empty row below it
427+
tables_with_titles: list[tuple[ExcelTable, str | None]] = []
428+
skip_next = False
429+
430+
for i, table in enumerate(tables):
431+
if skip_next:
432+
skip_next = False
433+
continue
434+
435+
# Check if this is a 1x1 table that could be a title
436+
if table.num_rows == 1 and table.num_cols == 1 and i + 1 < len(tables):
437+
next_table = tables[i + 1]
438+
title_end_row = table.anchor[1] + table.num_rows
439+
next_table_start_row = next_table.anchor[1]
440+
441+
if next_table_start_row > title_end_row:
442+
title_text = table.data[0].text if table.data else ""
443+
tables_with_titles.append((next_table, title_text))
444+
skip_next = True
445+
continue
446+
447+
tables_with_titles.append((table, None))
448+
449+
return tables_with_titles
409450

410451
def _find_table_bounds(
411452
self,
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
item-0 at level 0: unspecified: group _root_
2+
item-1 at level 1: section: group sheet: Duck Observations
3+
item-2 at level 2: caption: Number of freshwater ducks per year
4+
item-3 at level 2: table with [7x2]
5+
item-3 at level 3: caption: Number of freshwater ducks per year

0 commit comments

Comments
 (0)