Skip to content

Commit a283ccf

Browse files
authored
feat(msexcel): set ContentLayer.INVISIBLE for invisible sheet (#1876)
* feat(msexcel): ignore invisible sheet

* DCO Remediation Commit for Qiefan Jiang <[email protected]>

  I, Qiefan Jiang <[email protected]>, hereby add my Signed-off-by to this commit: ca391f4

  Signed-off-by: Qiefan Jiang <[email protected]>

* retain invisible sheet with ContentLayer.INVISIBLE

  Signed-off-by: Qiefan Jiang <[email protected]>

* update UT

  Signed-off-by: Qiefan Jiang <[email protected]>

* fix: use Optional for python3.9

  Signed-off-by: Qiefan Jiang <[email protected]>

* DCO Remediation Commit for Qiefan Jiang <[email protected]>

  I, Qiefan Jiang <[email protected]>, hereby add my Signed-off-by to this commit: a34371a

  Signed-off-by: Qiefan Jiang <[email protected]>

---------

Signed-off-by: Qiefan Jiang <[email protected]>
1 parent be26044 commit a283ccf

File tree

4 files changed

+132
-4
lines changed

4 files changed

+132
-4
lines changed

docling/backend/msexcel_backend.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
11
import logging
22
from io import BytesIO
33
from pathlib import Path
4-
from typing import Any, Union, cast
4+
from typing import Any, Optional, Union, cast
55

66
from docling_core.types.doc import (
77
BoundingBox,
8+
ContentLayer,
89
CoordOrigin,
910
DocItem,
1011
DoclingDocument,
@@ -197,6 +198,7 @@ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
197198
parent=None,
198199
label=GroupLabel.SECTION,
199200
name=f"sheet: {sheet_name}",
201+
content_layer=self._get_sheet_content_layer(sheet),
200202
)
201203
doc = self._convert_sheet(doc, sheet)
202204
width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ def _find_tables_in_sheet(
237239
"""
238240

239241
if self.workbook is not None:
242+
content_layer = self._get_sheet_content_layer(sheet)
240243
tables = self._find_data_tables(sheet)
241244

242245
for excel_table in tables:
@@ -282,6 +285,7 @@ def _find_tables_in_sheet(
282285
origin=CoordOrigin.TOPLEFT,
283286
),
284287
),
288+
content_layer=content_layer,
285289
)
286290

287291
return doc
@@ -486,6 +490,7 @@ def _find_images_in_sheet(
486490
The updated DoclingDocument.
487491
"""
488492
if self.workbook is not None:
493+
content_layer = self._get_sheet_content_layer(sheet)
489494
# Iterate over byte images in the sheet
490495
for item in sheet._images: # type: ignore[attr-defined]
491496
try:
@@ -511,6 +516,7 @@ def _find_images_in_sheet(
511516
anchor, origin=CoordOrigin.TOPLEFT
512517
),
513518
),
519+
content_layer=content_layer,
514520
)
515521
except Exception:
516522
_log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ def _find_page_size(
536542
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
537543

538544
return (right - left, bottom - top)
545+
546+
@staticmethod
547+
def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
548+
return (
549+
None
550+
if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
551+
else ContentLayer.INVISIBLE
552+
)

tests/data/groundtruth/docling_v2/test-01.xlsx.json

Lines changed: 114 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
"name": "test-01",
55
"origin": {
66
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
7-
"binary_hash": 13665052226482254103,
7+
"binary_hash": 5648670872883457266,
88
"filename": "test-01.xlsx"
99
},
1010
"furniture": {
@@ -25,6 +25,9 @@
2525
},
2626
{
2727
"$ref": "#/groups/2"
28+
},
29+
{
30+
"$ref": "#/groups/3"
2831
}
2932
],
3033
"content_layer": "body",
@@ -85,6 +88,20 @@
8588
"content_layer": "body",
8689
"name": "sheet: Sheet3",
8790
"label": "section"
91+
},
92+
{
93+
"self_ref": "#/groups/3",
94+
"parent": {
95+
"$ref": "#/body"
96+
},
97+
"children": [
98+
{
99+
"$ref": "#/tables/6"
100+
}
101+
],
102+
"content_layer": "invisible",
103+
"name": "sheet: Sheet4",
104+
"label": "section"
88105
}
89106
],
90107
"texts": [],
@@ -3382,6 +3399,95 @@
33823399
]
33833400
},
33843401
"annotations": []
3402+
},
3403+
{
3404+
"self_ref": "#/tables/6",
3405+
"parent": {
3406+
"$ref": "#/groups/3"
3407+
},
3408+
"children": [],
3409+
"content_layer": "invisible",
3410+
"label": "table",
3411+
"prov": [
3412+
{
3413+
"page_no": 4,
3414+
"bbox": {
3415+
"l": 0.0,
3416+
"t": 0.0,
3417+
"r": 1.0,
3418+
"b": 2.0,
3419+
"coord_origin": "TOPLEFT"
3420+
},
3421+
"charspan": [
3422+
0,
3423+
0
3424+
]
3425+
}
3426+
],
3427+
"captions": [],
3428+
"references": [],
3429+
"footnotes": [],
3430+
"data": {
3431+
"table_cells": [
3432+
{
3433+
"row_span": 1,
3434+
"col_span": 1,
3435+
"start_row_offset_idx": 0,
3436+
"end_row_offset_idx": 1,
3437+
"start_col_offset_idx": 0,
3438+
"end_col_offset_idx": 1,
3439+
"text": "header",
3440+
"column_header": true,
3441+
"row_header": false,
3442+
"row_section": false
3443+
},
3444+
{
3445+
"row_span": 1,
3446+
"col_span": 1,
3447+
"start_row_offset_idx": 1,
3448+
"end_row_offset_idx": 2,
3449+
"start_col_offset_idx": 0,
3450+
"end_col_offset_idx": 1,
3451+
"text": "1",
3452+
"column_header": false,
3453+
"row_header": false,
3454+
"row_section": false
3455+
}
3456+
],
3457+
"num_rows": 2,
3458+
"num_cols": 1,
3459+
"grid": [
3460+
[
3461+
{
3462+
"row_span": 1,
3463+
"col_span": 1,
3464+
"start_row_offset_idx": 0,
3465+
"end_row_offset_idx": 1,
3466+
"start_col_offset_idx": 0,
3467+
"end_col_offset_idx": 1,
3468+
"text": "header",
3469+
"column_header": true,
3470+
"row_header": false,
3471+
"row_section": false
3472+
}
3473+
],
3474+
[
3475+
{
3476+
"row_span": 1,
3477+
"col_span": 1,
3478+
"start_row_offset_idx": 1,
3479+
"end_row_offset_idx": 2,
3480+
"start_col_offset_idx": 0,
3481+
"end_col_offset_idx": 1,
3482+
"text": "1",
3483+
"column_header": false,
3484+
"row_header": false,
3485+
"row_section": false
3486+
}
3487+
]
3488+
]
3489+
},
3490+
"annotations": []
33853491
}
33863492
],
33873493
"key_value_items": [],
@@ -3407,6 +3513,13 @@
34073513
"height": 36.0
34083514
},
34093515
"page_no": 3
3516+
},
3517+
"4": {
3518+
"size": {
3519+
"width": 0.0,
3520+
"height": 0.0
3521+
},
3522+
"page_no": 4
34103523
}
34113524
}
34123525
}

tests/data/xlsx/test-01.xlsx

882 Bytes
Binary file not shown.

tests/test_backend_msexcel.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -87,13 +87,14 @@ def test_pages(documents) -> None:
8787
backend=MsExcelDocumentBackend,
8888
)
8989
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
90-
assert backend.page_count() == 3
90+
assert backend.page_count() == 4
9191

9292
# number of pages from the converted document
9393
doc = next(item for path, item in documents if path.stem == "test-01")
94-
assert len(doc.pages) == 3
94+
assert len(doc.pages) == 4
9595

9696
# page sizes as number of cells
9797
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
9898
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
9999
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
100+
assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)

0 commit comments

Comments
 (0)