diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index aeaed4f18..ed6f1f551 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -8,6 +8,7 @@ ContentLayer, CoordOrigin, DocItem, + DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, @@ -275,9 +276,10 @@ def _find_tables_in_sheet( if self.workbook is not None: content_layer = self._get_sheet_content_layer(sheet) - tables = self._find_data_tables(sheet) + # Returns list of (ExcelTable, optional_title_text) tuples + tables_with_titles = self._find_data_tables(sheet) - for excel_table in tables: + for excel_table, title_text in tables_with_titles: origin_col = excel_table.anchor[0] origin_row = excel_table.anchor[1] num_rows = excel_table.num_rows @@ -303,9 +305,19 @@ def _find_tables_in_sheet( ) table_data.table_cells.append(cell) + # Create caption if a title was found + caption = None + if title_text: + caption = doc.add_text( + text=title_text, + parent=self.parents[0], + label=DocItemLabel.CAPTION, + ) + page_no = self.workbook.index(sheet) + 1 doc.add_table( data=table_data, + caption=caption, parent=self.parents[0], prov=ProvenanceItem( page_no=page_no, @@ -367,14 +379,20 @@ def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion: return DataRegion(min_row, max_row, min_col, max_col) - def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]: + def _find_data_tables( + self, sheet: Worksheet + ) -> list[tuple[ExcelTable, Optional[str]]]: """Find all compact rectangular data tables in an Excel worksheet. + Detects 1x1 tables positioned above larger tables (with at least one empty row + between them) as titles, and associates them as captions. + Args: sheet: The Excel worksheet to be parsed. Returns: - A list of ExcelTable objects representing the data tables. + A list of tuples (ExcelTable, title_text) where title_text is None + if no title was found, or a string if a 1x1 table above serves as title. 
""" bounds: DataRegion = self._find_true_data_bounds( sheet @@ -405,7 +423,30 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]: visited.update(visited_cells) # Mark these cells as visited tables.append(table_bounds) - return tables + # detect titles (1x1 tables above larger tables) + tables_with_titles: list[tuple[ExcelTable, str | None]] = [] + skip_next = False + + for i, table in enumerate(tables): + if skip_next: + skip_next = False + continue + + # Check if this is a 1x1 table that could be a title + if table.num_rows == 1 and table.num_cols == 1 and i + 1 < len(tables): + next_table = tables[i + 1] + title_end_row = table.anchor[1] + table.num_rows + next_table_start_row = next_table.anchor[1] + + if next_table_start_row > title_end_row: + title_text = table.data[0].text if table.data else "" + tables_with_titles.append((next_table, title_text)) + skip_next = True + continue + + tables_with_titles.append((table, None)) + + return tables_with_titles def _find_table_bounds( self, diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt new file mode 100644 index 000000000..d645aeb29 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt @@ -0,0 +1,5 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: Duck Observations + item-2 at level 2: caption: Number of freshwater ducks per year + item-3 at level 2: table with [7x2] + item-3 at level 3: caption: Number of freshwater ducks per year \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json new file mode 100644 index 000000000..e4ff91496 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json @@ -0,0 +1,529 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.7.0", 
+ "name": "xlsx_05_table_with_title", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 18126553641942797758, + "filename": "xlsx_05_table_with_title.xlsx", + "uri": null + }, + "furniture": { + "self_ref": "#/furniture", + "parent": null, + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "parent": null, + "children": [ + { + "cref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "cref": "#/body" + }, + "children": [ + { + "cref": "#/texts/0" + }, + { + "cref": "#/tables/0" + } + ], + "content_layer": "body", + "name": "sheet: Duck Observations", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "caption", + "prov": [], + "orig": "Number of freshwater ducks per year", + "text": "Number of freshwater ducks per year", + "formatting": null, + "hyperlink": null + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 1.0, + "t": 3.0, + "r": 3.0, + "b": 10.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [ + { + "cref": "#/texts/0" + } + ], + "references": [], + "footnotes": [], + "image": null, + "data": { + "table_cells": [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 
0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + 
"end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 7, + "num_cols": 2, + "grid": [ + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + 
"start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Freshwater Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2019", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "120", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2020", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "135", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2021", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + 
"col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2022", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "170", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2023", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "160", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2024", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "180", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 2.0, + "height": 7.0 + }, + "image": null, + "page_no": 1 + } + } +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md 
b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md new file mode 100644 index 000000000..7b3e9641a --- /dev/null +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.md @@ -0,0 +1,10 @@ +Number of freshwater ducks per year + +| Year | Freshwater Ducks | +|--------|--------------------| +| 2019 | 120 | +| 2020 | 135 | +| 2021 | 150 | +| 2022 | 170 | +| 2023 | 160 | +| 2024 | 180 | \ No newline at end of file diff --git a/tests/data/xlsx/xlsx_05_table_with_title.xlsx b/tests/data/xlsx/xlsx_05_table_with_title.xlsx new file mode 100644 index 000000000..b7a04de7b Binary files /dev/null and b/tests/data/xlsx/xlsx_05_table_with_title.xlsx differ diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 6084a4b62..618059784 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -227,6 +227,53 @@ def test_inflated_rows_handling(documents) -> None: ) +def test_table_with_title(): + """Test that 1x1 tables above larger tables are detected as titles. + + xlsx_05_table_with_title.xlsx contains a 1x1 cell title. This test verifies that a 1x1 + cell positioned above a larger table (with one empty row between them) is + treated as a caption for that table rather than as a separate table. 
+ """ + path = next( + item for item in get_excel_paths() if item.stem == "xlsx_05_table_with_title" + ) + + converter = get_converter() + conv_result: ConversionResult = converter.convert(path) + doc: DoclingDocument = conv_result.document + + tables = list(doc.tables) + assert len(tables) == 1, ( + f"Should have 1 table (title should not be separate), got {len(tables)}" + ) + + table = tables[0] + + assert table.captions, "Table should have a caption" + assert len(table.captions) == 1, "Table should have exactly one caption" + + # Get the caption text + caption_ref = table.captions[0] + caption_text = None + for text_item in doc.texts: + if text_item.self_ref == caption_ref.cref: + caption_text = text_item.text + break + + assert caption_text is not None, "Should be able to find caption text" + assert caption_text == "Number of freshwater ducks per year", ( + f"Caption should be 'Number of freshwater ducks per year', got '{caption_text}'" + ) + + # table dimensions should be the data table, not including the title + assert table.data.num_rows == 7, ( + f"Table should have 7 rows, got {table.data.num_rows}" + ) + assert table.data.num_cols == 2, ( + f"Table should have 2 columns, got {table.data.num_cols}" + ) + + def test_bytesio_stream(): """Test that Excel files can be loaded from BytesIO streams.