Skip to content

Commit c4fe51f

Browse files
fix: add the bbox to TableData from annotations (#123)
Signed-off-by: Peter Staar <[email protected]>
1 parent b39f2e7 commit c4fe51f

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

docling_eval/utils/utils.py

Lines changed: 25 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
1818
from docling.datamodel.base_models import InputFormat, Page
1919
from docling.datamodel.document import InputDocument
20-
from docling_core.types.doc.base import BoundingBox, Size
20+
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
2121
from docling_core.types.doc.document import (
2222
DoclingDocument,
2323
GraphData,
@@ -177,6 +177,20 @@ def yield_cells_from_html_table(
177177
text_cell = text_cells[text_cell_id]
178178
text = "".join(text_cell["tokens"])
179179

180+
bbox = None
181+
if (
182+
text_cells is not None
183+
and text_cell_id < len(text_cells)
184+
and "bbox" in text_cells[text_cell_id]
185+
):
186+
bbox = BoundingBox(
187+
l=text_cells[text_cell_id]["bbox"][0],
188+
b=text_cells[text_cell_id]["bbox"][1],
189+
r=text_cells[text_cell_id]["bbox"][2],
190+
t=text_cells[text_cell_id]["bbox"][3],
191+
coord_origin=CoordOrigin.BOTTOMLEFT,
192+
)
193+
180194
rowspan = int(cell.get("rowspan", 1))
181195
colspan = int(cell.get("colspan", 1))
182196

@@ -186,7 +200,7 @@ def yield_cells_from_html_table(
186200
grid[row_idx + r][col_idx + c] = text
187201

188202
# print(f"Row: {row_idx + 1}, Col: {col_idx + 1}, Text: {text}")
189-
yield row_idx, col_idx, rowspan, colspan, text
203+
yield row_idx, col_idx, rowspan, colspan, text, bbox
190204

191205
col_idx += colspan # Move to next column after colspan
192206

@@ -202,9 +216,14 @@ def convert_html_table_into_docling_tabledata(
202216
cells = []
203217

204218
try:
205-
for row_idx, col_idx, rowspan, colspan, text in yield_cells_from_html_table(
206-
table_html=table_html, text_cells=text_cells
207-
):
219+
for (
220+
row_idx,
221+
col_idx,
222+
rowspan,
223+
colspan,
224+
text,
225+
bbox,
226+
) in yield_cells_from_html_table(table_html=table_html, text_cells=text_cells):
208227
cell = TableCell(
209228
row_span=rowspan,
210229
col_span=colspan,
@@ -213,6 +232,7 @@ def convert_html_table_into_docling_tabledata(
213232
start_col_offset_idx=col_idx,
214233
end_col_offset_idx=col_idx + colspan,
215234
text=text,
235+
bbox=bbox,
216236
)
217237
cells.append(cell)
218238

0 commit comments

Comments
 (0)